# Summary of stuff

## Model Data Import

In [32]:
import pandas as pd

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 

import os


In [4]:
model_df = pd.read_csv(r"..\data\training_data_eda.csv")

In [5]:
model_df = model_df.drop(columns="Unnamed: 0")

In [6]:
model_df.head()

Unnamed: 0,default,max_credit,gender,education,marital_status,age,pay_status_sep,pay_status_aug,pay_status_jul,pay_status_jun,...,carry_jul,carry_jun,carry_may,carry_apr,carry_ratio_sep,carry_ratio_aug,carry_ratio_jul,carry_ratio_jun,carry_ration_may,carry_ratio_apr
0,1,220000,0,1,2,36,0,0,0,0,...,207779,215187,170872,40826,0.966355,0.973409,0.94445,0.978123,0.776691,0.185573
1,0,200000,0,3,2,29,-1,-1,-1,-1,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,180000,0,1,2,27,-2,-2,-2,-2,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,80000,1,2,2,32,0,0,0,0,...,46071,42334,40768,41027,0.618988,0.62715,0.575887,0.529175,0.5096,0.512837
4,1,10000,1,2,2,27,0,0,0,0,...,4278,5144,2339,1697,0.6257,0.6895,0.4278,0.5144,0.2339,0.1697


## Upsample Minority Result

In [7]:
model_df.groupby("default").age.count()

default
0    17471
1     5028
Name: age, dtype: int64

In [9]:
defaulted = model_df[model_df.default == 1]
undefaulted = model_df[model_df.default == 0]

In [12]:
defaulted_upsampled = resample(defaulted,
                          replace=True, # sample with replacement
                          n_samples=undefaulted.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [15]:
resampled_df  = pd.concat([undefaulted, defaulted_upsampled])
resampled_df.default.value_counts()

1    17471
0    17471
Name: default, dtype: int64

In [19]:
X = resampled_df.drop(columns="default")
y = resampled_df.default

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Standard Scaling for KNN

In [33]:
scaler = StandardScaler()
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

## Baseline Models

In [38]:
dummy = DummyClassifier()
logreg = LogisticRegression(max_iter = 10**5, verbose=1)
tree = DecisionTreeClassifier()
clf = KNeighborsClassifier()

In [39]:
dummy.fit(X_train, y_train)
logreg.fit(X_train, y_train)
tree.fit(X_train, y_train)
clf.fit(scaled_data_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s finished


KNeighborsClassifier()

In [42]:
dummy_pred = dummy.predict(X_test)
logreg_pred = logreg.predict(X_test)
tree_pred = tree.predict(X_test)
clf_pred = clf.predict(scaled_data_test)

In [43]:
# checking accuracy
print("Baseline Dummy F1 score: ", f1_score(y_test, dummy_pred))
print("Baseline LogReg F1 score: ", f1_score(y_test, logreg_pred))
print("Baseline DecisionTree F1 score: ", f1_score(y_test, tree_pred))
print("Baseline KNN F1 score: ", f1_score(y_test, clf_pred))

Baseline Dummy F1 score:  0.49113981936664
Baseline LogReg F1 score:  0.6619791128114983
Baseline DecisionTree F1 score:  0.8799830813154277
Baseline KNN F1 score:  0.7647437272142397


## Using Engineered Feature Only

In [45]:
model_df.columns

Index(['default', 'max_credit', 'gender', 'education', 'marital_status', 'age',
       'pay_status_sep', 'pay_status_aug', 'pay_status_jul', 'pay_status_jun',
       'pay_status_may', 'pay_status_apr', 'bill_sep', 'bill_aug', 'bill_jul',
       'bill_jun', 'bill_may', 'bill_apr', 'payment_sep', 'payment_aug',
       'payments_jul', 'payment_jun', 'payment_may', 'payment_apr', 'married',
       'carry_sep', 'carry_aug', 'carry_jul', 'carry_jun', 'carry_may',
       'carry_apr', 'carry_ratio_sep', 'carry_ratio_aug', 'carry_ratio_jul',
       'carry_ratio_jun', 'carry_ration_may', 'carry_ratio_apr'],
      dtype='object')

In [None]:
eng_feature = ['default', 'gender', 'education', 'age',
       'pay_status_sep', 'pay_status_aug', 'pay_status_jul', 'pay_status_jun',
       'pay_status_may', 'pay_status_apr', 'bill_sep', 'bill_aug', 'bill_jul',
       'bill_jun', 'bill_may', 'bill_apr', 'payment_sep', 'payment_aug',
       'payments_jul', 'payment_jun', 'payment_may', 'payment_apr', 'married',
       'carry_sep', 'carry_aug', 'carry_jul', 'carry_jun', 'carry_may',
       'carry_apr', 'carry_ratio_sep', 'carry_ratio_aug', 'carry_ratio_jul',
       'carry_ratio_jun', 'carry_ration_may', 'carry_ratio_apr']