# Summary of stuff

## Model Data Import

In [2]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

import os
import numpy as np
import pickle
import pandas as pd

In [3]:
model_df = pd.read_csv(r"..\data\training_data_eda.csv")

In [4]:
model_df = model_df.drop(columns="Unnamed: 0")

In [5]:
model_df.head()

Unnamed: 0,max_credit,gender,education,marital_status,age,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,pay_status_may,...,full_pay_aug,full_pay_jul,full_pay_jun,full_pay__may,full_pay_apr,apr_may_bill_change,may_jun_bill_change,jun_jul_bill_change,jul_aug_bill_change,aug_sep_bill_change
0,220000,1,1,2,36,0,0,0,0,0,...,0,0,0,0,0,-2746,39334,-3293,4268,430
1,200000,1,3,2,29,-1,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0,0
2,180000,1,1,2,27,-2,-2,-2,-2,-2,...,1,1,1,1,1,0,0,0,0,0
3,80000,0,2,2,32,0,0,0,0,0,...,0,0,0,0,0,-271,1626,3711,4279,-500
4,10000,0,2,2,27,0,0,0,0,0,...,0,0,0,0,0,-58,2805,-566,3117,262


## Upsample Minority Result

In [6]:
model_df.groupby("default").age.count()

default
0    17471
1     5028
Name: age, dtype: int64

In [7]:
defaulted = model_df[model_df.default == 1]
undefaulted = model_df[model_df.default == 0]

In [8]:
defaulted_upsampled = resample(defaulted,
                          replace=True, # sample with replacement
                          n_samples=undefaulted.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [9]:
resampled_df  = pd.concat([undefaulted, defaulted_upsampled])
resampled_df.default.value_counts()

1    17471
0    17471
Name: default, dtype: int64

In [10]:
X = resampled_df.drop(columns="default")
y = resampled_df.default

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Standard Scaling for KNN

In [12]:
scaler = StandardScaler()
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

## Baseline Models

In [13]:
dummy = DummyClassifier()
logreg = LogisticRegression(max_iter = 10**5, verbose=1)
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()

## All Features GridSearchCV
This is gonna be a long one

In [17]:
tree_param_dict = {"max_depth":range(5,20,1),
              "criterion":["gini", "entropy"], 
              "min_samples_leaf":range(100,500,50),
              "splitter":["best", "random"],
              "max_features":range(6, len(model_df.columns))} 

knn_param_dict = {"n_neighbors":range(10, 100, 10),
                 "weights":["distance", "uniform"],
                 "algorithm":["ball_tree", "kd_tree"],
                 "leaf_size": range(10, 100, 10)} #optimal at 50

In [18]:
grid_tree = GridSearchCV(tree, tree_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)
grid_knn = GridSearchCV(knn, knn_param_dict, cv=5, scoring="f1", n_jobs=-1, verbose=1)

In [19]:
y_all_grid_tree = grid_tree.fit(X_train, y_train)
y_all_grid_knn = grid_knn.fit(scaled_data_train, y_train)

Fitting 10 folds for each of 20640 candidates, totalling 206400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 1656 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 3056 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 4856 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 7056 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 9656 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 12656 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 16056 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 19856 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 24056 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 27504 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 31810 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 33160 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 35068 tas

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [20]:
# gridsearch decision tree parameter result
print(grid_tree.best_params_)
print(grid_tree.best_estimator_)

# make predictions for train and test
grid_tree_pred_train = grid_tree.best_estimator_.predict(X_train)
grid_tree_pred_test = grid_tree.best_estimator_.predict(X_test)

#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", f1_score(y_train, grid_tree_pred_train))
print("Test GridSearch DecisionTree F1 score: ", f1_score(y_test, grid_tree_pred_test))

{'criterion': 'gini', 'max_depth': 14, 'max_features': 31, 'min_samples_leaf': 100, 'splitter': 'best'}
DecisionTreeClassifier(max_depth=14, max_features=31, min_samples_leaf=100)
Overfitting Check
Train GridSearch DecisionTree F1 score:  0.739447893483269
Test GridSearch DecisionTree F1 score:  0.7173607861488067


In [None]:
# gridsearch knn parameter result
print(grid_knn.best_params_)
print(grid_knn.best_estimator_)

grid_knn_pred_train = grid_knn.best_estimator_.predict(scaled_data_train_eng)
grid_knn_pred_test = grid_knn.best_estimator_.predict(scaled_data_test_eng)

print("Overfitting Check")
print("Train GridSearch KNN F1 score: ", f1_score(y_train, grid_knn_pred_train))
print("Test GridSearch KNN F1 score: ", f1_score(y_test, grid_knn_pred_test))

### Current Metrics to Beat:

Current:Decision Tree <br>
Train GridSearch DecisionTree F1 score:  0.739447893483269 <br>
Test GridSearch DecisionTree F1 score:  0.7173607861488067 <br>

Old:Decision Tree <br>
Train GridSearch DecisionTree F1 score:  0.6587963557038369 <br>
Test GridSearch DecisionTree F1 score:  0.6583419155509784 <br>


Current: KNN <br>
GridSearch KNN F1 score: <br>
GridSearch KNN F1 score: <br>

**THIS IS THE BEST PERFORMER YET**
Old: KNN <br>
GridSearch KNN F1 score:  0.9956541628545289 <br>
GridSearch KNN F1 score:  0.8771084337349397 <br>

## Saving Models Through Pickle

In [50]:
# GridSearchCV KNN Pickle
with open(r"..\model\grid_knn.pickle", "wb") as model:
    pickle.dump(grid_knn, model)

In [21]:
# GridSearchCV Decision Tree Pickel
with open(r"..\model\grid_tree.pickle", "wb") as grid_tree_pickle:
    pickle.dump(grid_tree, grid_tree_pickle)

In [52]:
# Engineered Feature Only Standard Scaler
with open(r"..\model\eng_scale.pickle", "wb") as scaler_eng_features:
    pickle.dump(eng_scaler, scaler_eng_features)