# Summary of stuff

## Model Data Import

In [1]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

import os
import numpy as np
import pickle
import pandas as pd

In [2]:
model_df = pd.read_csv(r"..\data\training_data_eda.csv")

In [3]:
model_df = model_df.drop(columns="Unnamed: 0")

In [4]:
model_df.head()

Unnamed: 0,max_credit,gender,education,marital_status,age,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,pay_status_may,...,full_pay_aug,full_pay_jul,full_pay_jun,full_pay__may,full_pay_apr,apr_may_bill_change,may_jun_bill_change,jun_jul_bill_change,jul_aug_bill_change,aug_sep_bill_change
0,220000,1,1,2,36,0,0,0,0,0,...,0,0,0,0,0,-2746,39334,-3293,4268,430
1,200000,1,3,2,29,-1,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0,0
2,180000,1,1,2,27,-2,-2,-2,-2,-2,...,1,1,1,1,1,0,0,0,0,0
3,80000,0,2,2,32,0,0,0,0,0,...,0,0,0,0,0,-271,1626,3711,4279,-500
4,10000,0,2,2,27,0,0,0,0,0,...,0,0,0,0,0,-58,2805,-566,3117,262


## Upsample Minority Result

In [5]:
model_df.groupby("default").age.count()

default
0    17471
1     5028
Name: age, dtype: int64

In [6]:
defaulted = model_df[model_df.default == 1]
undefaulted = model_df[model_df.default == 0]

In [7]:
defaulted_upsampled = resample(defaulted,
                          replace=True, # sample with replacement
                          n_samples=undefaulted.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [8]:
resampled_df  = pd.concat([undefaulted, defaulted_upsampled])
resampled_df.default.value_counts()

1    17471
0    17471
Name: default, dtype: int64

In [9]:
X = resampled_df.drop(columns="default")
y = resampled_df.default

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Standard Scaling for KNN

In [11]:
scaler = StandardScaler()
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

## Baseline Models

In [12]:
dummy = DummyClassifier()
logreg = LogisticRegression(max_iter = 10**5, verbose=1)
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()

## All Features GridSearchCV
This is gonna be a long one

In [13]:
tree_param_dict = {"max_depth":range(15,23,1),
              "criterion":["gini", "entropy"], 
              "min_samples_leaf":range(30,60,10),
              "splitter":["best", "random"],
              "max_features":range(15, len(model_df.columns))} 

knn_param_dict = {"n_neighbors":range(50, 110, 10),
                 "weights":["distance", "uniform"],
                 "algorithm":["ball_tree", "kd_tree"],
                 "leaf_size": range(5, 16, 2)}

In [14]:
grid_tree = GridSearchCV(tree, tree_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)
grid_knn = GridSearchCV(knn, knn_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)

In [15]:
y_all_grid_tree = grid_tree.fit(X_train, y_train)
y_all_grid_knn = grid_knn.fit(scaled_data_train, y_train)

Fitting 10 folds for each of 6800 candidates, totalling 68000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 4616 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 6516 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 8616 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 10916 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 13416 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 16116 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 19016 tasks    

Fitting 10 folds for each of 324 candidates, totalling 3240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 32.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 50.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 74.0min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 98.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 128.0min
[Parallel(n_jobs=-1)]: Done 3240 out of 3240 | elapsed: 129.9min finished


In [16]:
# gridsearch decision tree parameter result
print(grid_tree.best_params_)
print(grid_tree.best_estimator_)

# make predictions for train and test
grid_tree_pred_train = grid_tree.best_estimator_.predict(X_train)
grid_tree_pred_test = grid_tree.best_estimator_.predict(X_test)

#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", f1_score(y_train, grid_tree_pred_train))
print("Test GridSearch DecisionTree F1 score: ", f1_score(y_test, grid_tree_pred_test))

{'criterion': 'entropy', 'max_depth': 18, 'max_features': 39, 'min_samples_leaf': 50, 'splitter': 'best'}
DecisionTreeClassifier(criterion='entropy', max_depth=18, max_features=39,
                       min_samples_leaf=50)
Overfitting Check
Train GridSearch DecisionTree F1 score:  0.7747906229683735
Test GridSearch DecisionTree F1 score:  0.7344999420558581


In [18]:
# gridsearch knn parameter result
print(grid_knn.best_params_)
print(grid_knn.best_estimator_)

grid_knn_pred_train = grid_knn.best_estimator_.predict(scaled_data_train)
grid_knn_pred_test = grid_knn.best_estimator_.predict(scaled_data_test)

print("Overfitting Check")
print("Train GridSearch KNN F1 score: ", f1_score(y_train, grid_knn_pred_train))
print("Test GridSearch KNN F1 score: ", f1_score(y_test, grid_knn_pred_test))

{'algorithm': 'ball_tree', 'leaf_size': 10, 'n_neighbors': 90, 'weights': 'distance'}
KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, n_neighbors=90,
                     weights='distance')
Overfitting Check
Train GridSearch KNN F1 score:  0.9996182623301267
Test GridSearch KNN F1 score:  0.8888417029408642


### Current Metrics to Beat:

Current:Decision Tree <br>
Train GridSearch DecisionTree F1 score:  0.739447893483269 <br>
Test GridSearch DecisionTree F1 score:  0.7173607861488067 <br>

Old:Decision Tree <br>
Train GridSearch DecisionTree F1 score:  0.6587963557038369 <br>
Test GridSearch DecisionTree F1 score:  0.6583419155509784 <br>


Current: KNN <br>
GridSearch KNN F1 score: <br>
GridSearch KNN F1 score: <br>

**THIS IS THE BEST PERFORMER YET**
Old: KNN <br>
GridSearch KNN F1 score:  0.9956541628545289 <br>
GridSearch KNN F1 score:  0.8771084337349397 <br>

## Saving Models Through Pickle

In [50]:
# GridSearchCV KNN Pickle
with open(r"..\model\grid_knn.pickle", "wb") as model:
    pickle.dump(grid_knn, model)

In [19]:
# GridSearchCV Decision Tree Pickel
with open(r"..\model\grid_tree.pickle", "wb") as grid_tree_pickle:
    pickle.dump(grid_tree, grid_tree_pickle)

In [52]:
# Engineered Feature Only Standard Scaler
with open(r"..\model\eng_scale.pickle", "wb") as scaler_eng_features:
    pickle.dump(eng_scaler, scaler_eng_features)