# Summary of stuff

## Model Data Import

In [1]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

import os
import numpy as np
import pickle
import pandas as pd

In [2]:
model_df = pd.read_csv(r"..\data\training_data_eda.csv")

In [3]:
model_df = model_df.drop(columns="Unnamed: 0")

In [4]:
model_df.head()

Unnamed: 0,max_credit,gender,education,marital_status,age,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,pay_status_may,...,carry_jun,carry_may,carry_apr,carry_ratio_sep,carry_ratio_aug,carry_ratio_jul,carry_ratio_jun,carry_ration_may,carry_ratio_apr,pay_status_sum
0,220000,1,1,2,36,0,0,0,0,0,...,215187,170872,40826,0.966355,0.973409,0.94445,0.978123,0.776691,0.185573,0
1,200000,1,3,2,29,-1,-1,-1,-1,-1,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,-6
2,180000,1,1,2,27,-2,-2,-2,-2,-2,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,-12
3,80000,0,2,2,32,0,0,0,0,0,...,42334,40768,41027,0.618988,0.62715,0.575887,0.529175,0.5096,0.512837,0
4,10000,0,2,2,27,0,0,0,0,0,...,5144,2339,1697,0.6257,0.6895,0.4278,0.5144,0.2339,0.1697,0


## Upsample Minority Result

In [5]:
model_df.groupby("default").age.count()

default
0    17471
1     5028
Name: age, dtype: int64

In [6]:
defaulted = model_df[model_df.default == 1]
undefaulted = model_df[model_df.default == 0]

In [7]:
defaulted_upsampled = resample(defaulted,
                          replace=True, # sample with replacement
                          n_samples=undefaulted.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [8]:
resampled_df  = pd.concat([undefaulted, defaulted_upsampled])
resampled_df.default.value_counts()

1    17471
0    17471
Name: default, dtype: int64

In [9]:
X = resampled_df.drop(columns="default")
y = resampled_df.default

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Standard Scaling for KNN

In [11]:
scaler = StandardScaler()
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

## Baseline Models

In [19]:
dummy = DummyClassifier()
logreg = LogisticRegression(max_iter = 10**5, verbose=1)
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()

In [13]:
dummy.fit(X_train, y_train)
logreg.fit(X_train, y_train)
tree.fit(X_train, y_train)
knn.fit(scaled_data_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s finished


KNeighborsClassifier()

In [14]:
dummy_pred = dummy.predict(X_test)
logreg_pred = logreg.predict(X_test)
tree_pred = tree.predict(X_test)
knn_pred = knn.predict(scaled_data_test)

In [15]:
# checking accuracy
print("Baseline Dummy F1 score: ", f1_score(y_test, dummy_pred))
print("Baseline LogReg F1 score: ", f1_score(y_test, logreg_pred))
print("Baseline DecisionTree F1 score: ", f1_score(y_test, tree_pred))
print("Baseline KNN F1 score: ", f1_score(y_test, knn_pred))

Baseline Dummy F1 score:  0.5075800112296462
Baseline LogReg F1 score:  0.6664610322845979
Baseline DecisionTree F1 score:  0.8898054145516074
Baseline KNN F1 score:  0.7646932646932647


### Overfitting check

In [16]:
log_pred_train = logreg.predict(X_train)
log_pred_test = logreg.predict(X_test)

In [17]:
print("Baseline Test LogReg F1 score: ", f1_score(y_train, log_pred_train))
print("Baseline Test LogReg F1 score: ", f1_score(y_test, log_pred_test))

Baseline Test LogReg F1 score:  0.657944052608266
Baseline Test LogReg F1 score:  0.6664610322845979


## Using Engineered Feature Only
Keeping upsampled dataset

In [12]:
resampled_df.columns

Index(['max_credit', 'gender', 'education', 'marital_status', 'age',
       'pay_status_sep', 'pay_status_aug', 'pay_status_jul', 'pay_status_jun',
       'pay_status_may', 'pay_status_apr', 'bill_sep', 'bill_aug', 'bill_jul',
       'bill_jun', 'bill_may', 'bill_apr', 'payment_sep', 'payment_aug',
       'payments_jul', 'payment_jun', 'payment_may', 'payment_apr', 'default',
       'married', 'carry_sep', 'carry_aug', 'carry_jul', 'carry_jun',
       'carry_may', 'carry_apr', 'carry_ratio_sep', 'carry_ratio_aug',
       'carry_ratio_jul', 'carry_ratio_jun', 'carry_ration_may',
       'carry_ratio_apr', 'pay_status_sum'],
      dtype='object')

In [13]:
eng_feature = ['max_credit', 'gender', 'education',
       'pay_status_sep', 'pay_status_aug', 'pay_status_jul', 'pay_status_jun', 'pay_status_may', 'pay_status_apr',
       'married', 'carry_sep', 'carry_aug', 'carry_jul', 'carry_jun', 'carry_may', 'carry_apr', 'carry_ratio_sep', 'carry_ratio_aug',
       'carry_ratio_jul', 'carry_ratio_jun', 'carry_ration_may',
       'carry_ratio_apr', 'pay_status_sum']

In [14]:
eng_X = resampled_df[eng_feature]

X_train_eng, X_test_eng, y_train_eng, y_test_eng = train_test_split(eng_X, y, random_state=42)

In [28]:
eng_scaler = StandardScaler()
X_eng_scale_train = eng_scaler.fit_transform(X_train_eng)
X_eng_scale_test = eng_scaler.transform(X_test_eng)

### Scaled Feature Engineerd Data for KNN

In [23]:
scaled_data_train_eng = scaler.fit_transform(X_train_eng)
scaled_data_test_eng = scaler.transform(X_test_eng)

In [30]:
dummy.fit(X_train_eng, y_train_eng)
logreg.fit(X_train_eng, y_train_eng)
tree.fit(X_train_eng, y_train_eng)
knn.fit(scaled_data_train_eng, y_train_eng)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


KNeighborsClassifier()

In [31]:
dummy_pred = dummy.predict(X_test_eng)
logreg_pred = logreg.predict(X_test_eng)
tree_pred = tree.predict(X_test_eng)
knn_pred = knn.predict(scaled_data_test_eng)

In [32]:
# checking accuracy
print("F1 Scores with Engineerd Features Only")
print("Dummy F1 score: ", f1_score(y_test, dummy_pred))
print("LogReg F1 score: ", f1_score(y_test, logreg_pred))
print("DecisionTree F1 score: ", f1_score(y_test, tree_pred))
print("KNN F1 score: ", f1_score(y_test, knn_pred))

F1 Scores with Engineerd Features Only
Dummy F1 score:  0.5040299693495289
LogReg F1 score:  0.45867943122624244
DecisionTree F1 score:  0.8867763298440504
KNN F1 score:  0.7721478307445098


## GridSearchCV for DecisionTree and KNN

In [44]:
tree_param_dict = {"max_depth":range(5,10,1), # optimal max depth is 6-8
              "criterion":["gini", "entropy"], #test for both criterion
              "min_samples_leaf":range(600,1000,50), #min sample leaf is 600-850
              "splitter":["best"], #check for best splitter
              "max_features":range(6, 14)} #default is sqrt(len(eng_features))

knn_param_dict = {"n_neighbors":range(35, 55, 5), #optimal at 45
                 "weights":["distance"],
                 "algorithm":["ball_tree"],
                 "leaf_size": range(40, 70, 10)} #optimal at 50

In [40]:
grid_tree = GridSearchCV(tree, tree_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)

In [45]:
grid_knn = GridSearchCV(knn, knn_param_dict, cv=5, scoring="f1", n_jobs=-1, verbose=1)

In [42]:
y_grid_tree = grid_tree.fit(X_train_eng, y_train_eng)

Fitting 10 folds for each of 640 candidates, totalling 6400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 1656 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 3056 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 4856 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 6400 out of 6400 | elapsed:  2.3min finished


In [43]:
# gridsearch decision tree parameter result
print(grid_tree.best_params_)
print(grid_tree.best_estimator_)

# make predictions for train and test
grid_tree_pred_train = grid_tree.best_estimator_.predict(X_train_eng)
grid_tree_pred_test = grid_tree.best_estimator_.predict(X_test_eng)

#check for significant difference in F1 between train and test for overfitting
print("Overfitting Check")
print("Train GridSearch DecisionTree F1 score: ", f1_score(y_train_eng, grid_tree_pred_train))
print("Test GridSearch DecisionTree F1 score: ", f1_score(y_test_eng, grid_tree_pred_test))

{'criterion': 'entropy', 'max_depth': 7, 'max_features': 11, 'min_samples_leaf': 900, 'splitter': 'best'}
DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features=11,
                       min_samples_leaf=900)
Overfitting Check
Train GridSearch DecisionTree F1 score:  0.6587963557038369
Test GridSearch DecisionTree F1 score:  0.6583419155509784


In [46]:
y_grid_knn = grid_knn.fit(scaled_data_train_eng, y_train_eng)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.5min finished


In [47]:
# gridsearch knn parameter result
print(grid_knn.best_params_)
print(grid_knn.best_estimator_)

grid_knn_pred_train = grid_knn.best_estimator_.predict(scaled_data_train_eng)
grid_knn_pred_test = grid_knn.best_estimator_.predict(scaled_data_test_eng)


{'algorithm': 'ball_tree', 'leaf_size': 40, 'n_neighbors': 50, 'weights': 'distance'}
KNeighborsClassifier(algorithm='ball_tree', leaf_size=40, n_neighbors=50,
                     weights='distance')


In [48]:
print("Overfitting Check")
print("GridSearch KNN F1 score: ", f1_score(y_train_eng, grid_knn_pred_train))
print("GridSearch KNN F1 score: ", f1_score(y_test_eng, grid_knn_pred_test))

Overfitting Check
GridSearch KNN F1 score:  0.9956541628545289
GridSearch KNN F1 score:  0.8771084337349397


In [49]:
with open(r"..\model\grid_knn_best.pickle", "wb") as best_model:
    pickle.dump(grid_knn.best_estimator_, best_model)

In [50]:
with open(r"..\model\grid_knn.pickle", "wb") as model:
    pickle.dump(grid_knn, model)

In [52]:
with open(r"..\model\eng_scale.pickle", "wb") as scaler_eng_features:
    pickle.dump(eng_scaler, scaler_eng_features)

In [53]:
with open(r"..\model\grid_tree.pickle", "wb") as grid_tree_pickle:
    pickle.dump(grid_tree, grid_tree_pickle)