# Summary of stuff

## Model Data Import

In [1]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

import os
import numpy as np
import pickle
import pandas as pd

In [5]:
model_df = pd.read_csv(r"..\data\training_data_eda.csv")

In [6]:
model_df = model_df.drop(columns="Unnamed: 0")

In [7]:
model_df.head()

Unnamed: 0,max_credit,gender,education,marital_status,age,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,pay_status_may,...,average_bill_change,full_pay_sep,full_pay_aug,full_pay_jul,full_pay_jun,full_pay__may,full_pay_apr,pay_status_sum,habit_delay,default
0,220000,1,1,2,36,0,0,0,0,0,...,7598.6,0,0,0,0,0,0,0,0,1
1,200000,1,3,2,29,-1,-1,-1,-1,-1,...,0.0,0,0,0,0,0,0,-6,0,0
2,180000,1,1,2,27,-2,-2,-2,-2,-2,...,0.0,1,1,1,1,1,1,-12,0,0
3,80000,0,2,2,32,0,0,0,0,0,...,1769.0,0,0,0,0,0,0,0,0,0
4,10000,0,2,2,27,0,0,0,0,0,...,1112.0,0,0,0,0,0,0,0,0,1


## Upsample Minority Result

In [8]:
model_df.groupby("default").age.count()

default
0    17471
1     5028
Name: age, dtype: int64

In [9]:
defaulted = model_df[model_df.default == 1]
undefaulted = model_df[model_df.default == 0]

In [10]:
defaulted_upsampled = resample(defaulted,
                          replace=True, # sample with replacement
                          n_samples=undefaulted.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [11]:
resampled_df  = pd.concat([undefaulted, defaulted_upsampled])
resampled_df.default.value_counts()

1    17471
0    17471
Name: default, dtype: int64

In [12]:
X = resampled_df.drop(columns="default")
y = resampled_df.default

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=40)

## Decision Tree GridSearchCV
This is gonna be a long one

In [54]:
tree = DecisionTreeClassifier()

In [44]:
# run history
# DecisionTreeClassifier(criterion='entropy', max_depth=22, max_features=39, min_samples_leaf=30)
# DecisionTreeClassifier(criterion='entropy', max_depth=18, max_features=47, min_samples_leaf=20)
tree_param_dict = {"max_depth":range(15,25,1),
              "criterion":["entropy"], 
              "min_samples_leaf":range(50,80,5),
              "splitter":["best"],
              "max_features":range(30, len(model_df.columns))} 

In [31]:
grid_tree = GridSearchCV(tree, tree_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)

In [45]:
grid_tree.fit(X_train, y_train)

Fitting 10 folds for each of 396 candidates, totalling 3960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 3960 out of 3960 | elapsed: 14.3min finished


In [46]:
# gridsearch decision tree parameter result
print(grid_tree.best_params_)
print(grid_tree.best_estimator_)
# make predictions for train and test
grid_tree_pred_train = grid_tree.best_estimator_.predict(X_train)
grid_tree_pred_test = grid_tree.best_estimator_.predict(X_test)

#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", f1_score(y_train, grid_tree_pred_train))
print("Test GridSearch DecisionTree F1 score: ", f1_score(y_test, grid_tree_pred_test))

{'criterion': 'entropy', 'max_depth': 20, 'max_features': 51, 'min_samples_leaf': 10, 'splitter': 'best'}
DecisionTreeClassifier(criterion='entropy', max_depth=20, max_features=51,
                       min_samples_leaf=10)
Overfitting Check
Train GridSearch DecisionTree F1 score:  0.8844993968636912
Test GridSearch DecisionTree F1 score:  0.7994763255509492


In [61]:
avg_train_f1 = 0
avg_test_f1 = 0
for i in range(0, 100):
    # Get training set
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    grid_tree_pred_train = grid_tree.best_estimator_.predict(X_train)
    grid_tree_pred_test = grid_tree.best_estimator_.predict(X_test)
    
    avg_train_f1 += f1_score(y_train, grid_tree_pred_train)
    avg_test_f1 += f1_score(y_test, grid_tree_pred_test)
    
#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", avg_train_f1/100)
print("Test GridSearch DecisionTree F1 score: ", avg_test_f1/100)

Overfitting Check
Train GridSearch DecisionTree F1 score:  0.8624406496375339
Test GridSearch DecisionTree F1 score:  0.522740901108458


### Saving Models Through Pickle

In [60]:
# GridSearchCV Decision Tree Pickle
with open(r"..\model\grid_tree.pickle", "wb") as grid_tree_pickle:
    pickle.dump(grid_tree, grid_tree_pickle)

## Random Forrest with GridSearchCV

In [46]:
rfc = RandomForestClassifier(verbose=1)

In [47]:
rf_param_dict = {"criterion":["entropy"],
                 "max_depth":range(10,41, 10), 
                 "min_samples_leaf":range(50, 101,10),
                 "max_features":range(30, len(model_df.columns), 2),
                 "n_estimators":range(50,201,50)
                 }

In [48]:
grid_rfc = GridSearchCV(rfc, rf_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)

In [49]:
grid_rfc.fit(X_train, y_train)

Fitting 10 folds for each of 1056 candidates, totalling 10560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 60.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 115.5min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 192.8min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 297.7min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 431.1min


KeyboardInterrupt: 

In [None]:
# gridsearch decision tree parameter result
print(grid_rfc.best_params_)
print(grid_rfc.best_estimator_)
# make predictions for train and test
grid_rfc_pred_train = grid_rfc.best_estimator_.predict(X_train)
grid_rfc_pred_test = grid_rfc.best_estimator_.predict(X_test)

#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", f1_score(y_train, grid_rfc_pred_train))
print("Test GridSearch DecisionTree F1 score: ", f1_score(y_test, grid_rfc_pred_test))

In [None]:
avg_train_f1 = 0
avg_test_f1 = 0
for i in range(0, 100):
    # Get training set
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    grid_rfc_pred_train = grid_rfc.best_estimator_.predict(X_train)
    grid_rfc_pred_test = grid_rfc.best_estimator_.predict(X_test)
    
    avg_train_f1 += f1_score(y_train, grid_rfc_pred_train)
    avg_test_f1 += f1_score(y_test, grid_rfc_pred_test)
    
#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", avg_train_f1/100)
print("Test GridSearch DecisionTree F1 score: ", avg_test_f1/100)

In [None]:
# Pickle RandomForrest Model
with open(r"..\model\grid_rfc.pickle", "rb") as grid_rfc_pickle:
    pickle.dump(grid_rfc, grid_rfc_pickle)

## KNN Model Check

In [54]:
# Pickled KNN Model
with open(r"..\model\grid_knn.pickle", "rb") as grid_knn_pickle:
    grid_knn = pickle.load(grid_knn_pickle)

In [56]:
eng_feature = ['max_credit', 'gender', 'education',
       'pay_status_sep', 'pay_status_aug', 'pay_status_jul', 'pay_status_jun', 'pay_status_may', 'pay_status_apr',
       'married', 'carry_sep', 'carry_aug', 'carry_jul', 'carry_jun', 'carry_may', 'carry_apr', 'carry_ratio_sep', 'carry_ratio_aug',
       'carry_ratio_jul', 'carry_ratio_jun', 'carry_ration_may',
       'carry_ratio_apr', 'pay_status_sum']

In [59]:
# initialize scaler and average values
scaler = StandardScaler()
avg_train_f1 = 0
avg_test_f1 = 0

X = resampled_df[eng_feature]
y = resampled_df.default

for i in range(0, 30): #check for fit for 100 times
   
    # Get training set
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("iteration #", i)
    grid_knn_pred_train = grid_knn.best_estimator_.predict(X_train_scaled)
    grid_knn_pred_test = grid_knn.best_estimator_.predict(X_test_scaled)
    print("complete #", i)
    avg_train_f1 += f1_score(y_train, grid_knn_pred_train)
    avg_test_f1 += f1_score(y_test, grid_knn_pred_test)
    
#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", avg_train_f1/100)
print("Test GridSearch DecisionTree F1 score: ", avg_test_f1/100)

iteration # 0
complete # 0
iteration # 1
complete # 1
iteration # 2
complete # 2
iteration # 3
complete # 3
iteration # 4
complete # 4
iteration # 5
complete # 5
iteration # 6
complete # 6
iteration # 7
complete # 7
iteration # 8
complete # 8
iteration # 9
complete # 9
iteration # 10
complete # 10
iteration # 11
complete # 11
iteration # 12
complete # 12
iteration # 13
complete # 13
iteration # 14
complete # 14
iteration # 15
complete # 15
iteration # 16
complete # 16
iteration # 17
complete # 17
iteration # 18
complete # 18
iteration # 19
complete # 19
iteration # 20
complete # 20
iteration # 21
complete # 21
iteration # 22
complete # 22
iteration # 23
complete # 23
iteration # 24
complete # 24
iteration # 25
complete # 25
iteration # 26
complete # 26
iteration # 27
complete # 27
iteration # 28
complete # 28
iteration # 29
complete # 29
Overfitting Check
Train GridSearch DecisionTree F1 score:  0.28453479143258875
Test GridSearch DecisionTree F1 score:  0.2847837513617297


In [None]:
# save the scaler
with open(r"..\model\scaler.pickle", "wb") as scaler_full:
    pickle.dump(scaler, scaler_full)