# Summary of stuff

## Model Data Import

In [1]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

import os
import numpy as np
import pickle
import pandas as pd

In [2]:
model_df = pd.read_csv(r"..\data\training_data_eda.csv")

In [3]:
model_df = model_df.drop(columns="Unnamed: 0")

In [4]:
model_df.head()

Unnamed: 0,max_credit,gender,education,marital_status,age,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,pay_status_may,...,average_bill_change,full_pay_sep,full_pay_aug,full_pay_jul,full_pay_jun,full_pay__may,full_pay_apr,pay_status_sum,habit_delay,default
0,220000,1,1,2,36,0,0,0,0,0,...,7598.6,0,0,0,0,0,0,0,0,1
1,200000,1,3,2,29,-1,-1,-1,-1,-1,...,0.0,0,0,0,0,0,0,-6,0,0
2,180000,1,1,2,27,-2,-2,-2,-2,-2,...,0.0,1,1,1,1,1,1,-12,0,0
3,80000,0,2,2,32,0,0,0,0,0,...,1769.0,0,0,0,0,0,0,0,0,0
4,10000,0,2,2,27,0,0,0,0,0,...,1112.0,0,0,0,0,0,0,0,0,1


## Upsample Minority Result

In [5]:
model_df.groupby("default").age.count()

default
0    17471
1     5028
Name: age, dtype: int64

In [6]:
defaulted = model_df[model_df.default == 1]
undefaulted = model_df[model_df.default == 0]

In [7]:
defaulted_upsampled = resample(defaulted,
                          replace=True, # sample with replacement
                          n_samples=undefaulted.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [8]:
resampled_df  = pd.concat([undefaulted, defaulted_upsampled])
resampled_df.default.value_counts()

1    17471
0    17471
Name: default, dtype: int64

In [9]:
X = resampled_df.drop(columns="default")
y = resampled_df.default

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=40)

### Standard Scaling for KNN

In [11]:
scaler = StandardScaler()
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

## Baseline Models

In [12]:
dummy = DummyClassifier()
logreg = LogisticRegression(max_iter = 10**5, verbose=1)
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()

## All Features GridSearchCV
This is gonna be a long one

In [18]:
len(model_df.columns)

52

In [30]:
# run history
# DecisionTreeClassifier(criterion='entropy', max_depth=22, max_features=39, min_samples_leaf=30)
# DecisionTreeClassifier(criterion='entropy', max_depth=18, max_features=47, min_samples_leaf=20)
tree_param_dict = {"max_depth":range(15,25,1),
              "criterion":["entropy"], 
              "min_samples_leaf":range(50,80,5),
              "splitter":["best"],
              "max_features":range(30, len(model_df.columns))} 

# last run
# KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, n_neighbors=90, weights='distance')
knn_param_dict = {"n_neighbors":range(50, 110, 10),
                 "weights":["distance", "uniform"],
                 "algorithm":["ball_tree", "kd_tree"],
                 "leaf_size": range(5, 16, 2)}

In [31]:
grid_tree = GridSearchCV(tree, tree_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)
grid_knn = GridSearchCV(knn, knn_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)

In [32]:
y_all_grid_tree = grid_tree.fit(X_train, y_train)


Fitting 10 folds for each of 396 candidates, totalling 3960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 3960 out of 3960 | elapsed: 13.7min finished


In [41]:
# gridsearch decision tree parameter result
print(grid_tree.best_params_)
print(grid_tree.best_estimator_)
# make predictions for train and test
grid_tree_pred_train = grid_tree.best_estimator_.predict(X_train)
grid_tree_pred_test = grid_tree.best_estimator_.predict(X_test)

#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", f1_score(y_train, grid_tree_pred_train))
print("Test GridSearch DecisionTree F1 score: ", f1_score(y_test, grid_tree_pred_test))

{'criterion': 'entropy', 'max_depth': 20, 'max_features': 49, 'min_samples_leaf': 10, 'splitter': 'best'}
DecisionTreeClassifier(criterion='entropy', max_depth=20, max_features=49,
                       min_samples_leaf=10)
Overfitting Check
Train GridSearch DecisionTree F1 score:  0.855721393034826
Test GridSearch DecisionTree F1 score:  0.8631032927379341


In [None]:
y_all_grid_knn = grid_knn.fit(scaled_data_train, y_train)

In [18]:
# gridsearch knn parameter result
print(grid_knn.best_params_)
print(grid_knn.best_estimator_)

grid_knn_pred_train = grid_knn.best_estimator_.predict(scaled_data_train)
grid_knn_pred_test = grid_knn.best_estimator_.predict(scaled_data_test)

print("Overfitting Check")
print("Train GridSearch KNN F1 score: ", f1_score(y_train, grid_knn_pred_train))
print("Test GridSearch KNN F1 score: ", f1_score(y_test, grid_knn_pred_test))

{'algorithm': 'ball_tree', 'leaf_size': 10, 'n_neighbors': 90, 'weights': 'distance'}
KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, n_neighbors=90,
                     weights='distance')
Overfitting Check
Train GridSearch KNN F1 score:  0.9996182623301267
Test GridSearch KNN F1 score:  0.8888417029408642


### Current Metrics to Beat:

Current:Decision Tree <br>
Train GridSearch DecisionTree F1 score:  0.855721393034826 <br>
Test GridSearch DecisionTree F1 score:  0.8631032927379341 <br>

Old:Decision Tree <br>
Train GridSearch DecisionTree F1 score:  0.68 <br>
Test GridSearch DecisionTree F1 score:  0.68 <br>

## Saving Models Through Pickle

In [23]:
# GridSearchCV Decision Tree Pickel
with open(r"..\model\grid_tree.pickle", "wb") as grid_tree_pickle:
    pickle.dump(grid_tree, grid_tree_pickle)

In [52]:
# Engineered Feature Only Standard Scaler
with open(r"..\model\eng_scale.pickle", "wb") as scaler_eng_features:
    pickle.dump(eng_scaler, scaler_eng_features)
# GridSearchCV KNN Pickle
with open(r"..\model\grid_knn.pickle", "wb") as model:
    pickle.dump(grid_knn, model)