# Summary of stuff

## Model Data Import

In [23]:
import pandas as pd

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

import os
import numpy as np

In [35]:
model_df = pd.read_csv(r"..\data\training_data_eda.csv")

In [36]:
model_df = model_df.drop(columns="Unnamed: 0")

In [37]:
model_df.head()

Unnamed: 0,max_credit,gender,education,marital_status,age,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,pay_status_may,...,carry_jun,carry_may,carry_apr,carry_ratio_sep,carry_ratio_aug,carry_ratio_jul,carry_ratio_jun,carry_ration_may,carry_ratio_apr,pay_status_sum
0,220000,1,1,2,36,0,0,0,0,0,...,215187,170872,40826,0.966355,0.973409,0.94445,0.978123,0.776691,0.185573,0
1,200000,1,3,2,29,-1,-1,-1,-1,-1,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,-6
2,180000,1,1,2,27,-2,-2,-2,-2,-2,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,-12
3,80000,0,2,2,32,0,0,0,0,0,...,42334,40768,41027,0.618988,0.62715,0.575887,0.529175,0.5096,0.512837,0
4,10000,0,2,2,27,0,0,0,0,0,...,5144,2339,1697,0.6257,0.6895,0.4278,0.5144,0.2339,0.1697,0


## Upsample Minority Result

In [38]:
model_df.groupby("default").age.count()

default
0    17471
1     5028
Name: age, dtype: int64

In [39]:
defaulted = model_df[model_df.default == 1]
undefaulted = model_df[model_df.default == 0]

In [40]:
defaulted_upsampled = resample(defaulted,
                          replace=True, # sample with replacement
                          n_samples=undefaulted.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [41]:
resampled_df  = pd.concat([undefaulted, defaulted_upsampled])
resampled_df.default.value_counts()

1    17471
0    17471
Name: default, dtype: int64

In [42]:
X = resampled_df.drop(columns="default")
y = resampled_df.default

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Standard Scaling for KNN

In [44]:
scaler = StandardScaler()
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

## Baseline Models

In [45]:
dummy = DummyClassifier()
logreg = LogisticRegression(max_iter = 10**5, verbose=1)
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()

In [46]:
dummy.fit(X_train, y_train)
logreg.fit(X_train, y_train)
tree.fit(X_train, y_train)
knn.fit(scaled_data_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s finished


KNeighborsClassifier()

In [47]:
dummy_pred = dummy.predict(X_test)
logreg_pred = logreg.predict(X_test)
tree_pred = tree.predict(X_test)
knn_pred = knn.predict(scaled_data_test)

In [48]:
# checking accuracy
print("Baseline Dummy F1 score: ", f1_score(y_test, dummy_pred))
print("Baseline LogReg F1 score: ", f1_score(y_test, logreg_pred))
print("Baseline DecisionTree F1 score: ", f1_score(y_test, tree_pred))
print("Baseline KNN F1 score: ", f1_score(y_test, knn_pred))

Baseline Dummy F1 score:  0.5042744785136214
Baseline LogReg F1 score:  0.6664610322845979
Baseline DecisionTree F1 score:  0.8902297025510745
Baseline KNN F1 score:  0.7646932646932647


## Using Engineered Feature Only
Keeping upsampled dataset

In [49]:
resampled_df.columns

Index(['max_credit', 'gender', 'education', 'marital_status', 'age',
       'pay_status_sep', 'pay_status_aug', 'pay_status_jul', 'pay_status_jun',
       'pay_status_may', 'pay_status_apr', 'bill_sep', 'bill_aug', 'bill_jul',
       'bill_jun', 'bill_may', 'bill_apr', 'payment_sep', 'payment_aug',
       'payments_jul', 'payment_jun', 'payment_may', 'payment_apr', 'default',
       'married', 'carry_sep', 'carry_aug', 'carry_jul', 'carry_jun',
       'carry_may', 'carry_apr', 'carry_ratio_sep', 'carry_ratio_aug',
       'carry_ratio_jul', 'carry_ratio_jun', 'carry_ration_may',
       'carry_ratio_apr', 'pay_status_sum'],
      dtype='object')

In [50]:
eng_feature = ['max_credit', 'gender', 'education',
       'pay_status_sep', 'pay_status_aug', 'pay_status_jul', 'pay_status_jun', 'pay_status_may', 'pay_status_apr',
       'married', 'carry_sep', 'carry_aug', 'carry_jul', 'carry_jun', 'carry_may', 'carry_apr', 'carry_ratio_sep', 'carry_ratio_aug',
       'carry_ratio_jul', 'carry_ratio_jun', 'carry_ration_may',
       'carry_ratio_apr', 'pay_status_sum']

In [51]:
eng_X = resampled_df[eng_feature]

X_train_eng, X_test_eng, y_train_eng, y_test_eng = train_test_split(eng_X, y, random_state=42)

### Scaled Feature Engineerd Data for KNN

In [52]:
scaled_data_train_eng = scaler.fit_transform(X_train_eng)
scaled_data_test_eng = scaler.transform(X_test_eng)

In [53]:
dummy.fit(X_train_eng, y_train_eng)
logreg.fit(X_train_eng, y_train_eng)
tree.fit(X_train_eng, y_train_eng)
knn.fit(scaled_data_train_eng, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


KNeighborsClassifier()

In [54]:
dummy_pred = dummy.predict(X_test_eng)
logreg_pred = logreg.predict(X_test_eng)
tree_pred = tree.predict(X_test_eng)
knn_pred = knn.predict(scaled_data_test_eng)

In [55]:
# checking accuracy
print("F1 Scores with Engineerd Features Only")
print("Dummy F1 score: ", f1_score(y_test, dummy_pred))
print("LogReg F1 score: ", f1_score(y_test, logreg_pred))
print("DecisionTree F1 score: ", f1_score(y_test, tree_pred))
print("KNN F1 score: ", f1_score(y_test, knn_pred))

F1 Scores with Engineerd Features Only
Dummy F1 score:  0.49669628616997036
LogReg F1 score:  0.45867943122624244
DecisionTree F1 score:  0.8849519743863393
KNN F1 score:  0.7721478307445098


## GridSearchCV for DecisionTree and KNN

In [71]:
tree_param_dict = {"max_depth":range(5,10,1), # optimal max depth is 6-8
              "criterion":["gini", "entropy"], #test for both criterion
              "min_samples_leaf":range(600,1000,50), #min sample leaf is 600-850
              "splitter":["best", "random"], #check for best splitter
              "max_features":range(0, len(eng_feature))} #default is sqrt(len(eng_features))

knn_param_dict = {"n_neighbors":range(5, 50, 5),
                 "weights":["uniform", "distance"],
                 "algorithm":["kd_tree", "ball_tree"],
                 "leaf_size": range(15, 60, 5)}

In [57]:
grid_tree = GridSearchCV(tree, tree_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)

In [73]:
grid_knn = GridSearchCV(knn, knn_param_dict, cv=10, scoring="f1", n_jobs=-1, verbose=1)

In [32]:
y_grid_tree = grid_tree.fit(X_train_eng, y_train_eng)

Fitting 10 folds for each of 3864 candidates, totalling 38640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 1400 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 2728 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done 5316 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 8916 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 11908 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 15076 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 19732 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 25336 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 31476 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 38640 out of 38640 | elapsed: 11.2min finished


In [74]:
y_grid_knn = grid_knn.fit(X_train_eng, y_train_eng)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  5.0min finished


In [75]:
print(grid_tree.best_params_)
print(grid_tree.best_estimator_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [77]:
#grid_tree_pred = grid_tree.best_estimator_.predict(X_test_eng)
print("GridSearch DecisionTree F1 score: ", f1_score(y_test_eng, grid_tree_pred))

GridSearch DecisionTree F1 score:  0.6644577545558812


print(grid_knn.best_params_)
print(grid_knn.best_estimator_)

In [79]:
grid_knn_pred = grid_knn.best_estimator_.predict(X_test_eng)
print("GridSearch DecisionTree F1 score: ", f1_score(y_test_eng, grid_knn_pred))

GridSearch DecisionTree F1 score:  0.8190979356227375
