In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from collections import Counter

In [84]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score

from imblearn.over_sampling import RandomOverSampler, SMOTE

## Uploading, Cleaning Data

In [85]:
df = pd.read_csv('application_data_updated.csv')

In [86]:
# Cleaning for features that are used in dataset
df['CNT_FAM_MEMBERS'].describe()
df.loc[df['CNT_FAM_MEMBERS'].isnull()] = 0
df = df.dropna(subset = ['AMT_ANNUITY'])

## Target Analysis

In [87]:
# TARGET = our predicted value, 1 = default, 0 = no default
y = df['TARGET']
print(y.mean())
print(y.value_counts())

0.08073196986006459
0    282674
1     24825
Name: TARGET, dtype: int64


## Preprocessing Data

For our classification model, we chose the following features:

- AMT_INCOME_TOTAL
- AMT_CREDIT
- AMT_ANNUITY
- CODE_GENDER
- CNT_FAM_MEMBERS
- NAME_CONTRACT_TYPE
- NAME_EDUCATION_TYPE

We wanted to limit the number of features that the user would have to input in our application. In addition, we wanted to choose features that we believed could be relevant in predicting card defaults/delayed card payments, as well as features that could be easily remembered by users.

In [88]:
# Columns: Count_fam_members, income, credit, annuity, contract type, gender, education
X = df[['CNT_FAM_MEMBERS', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'CODE_GENDER', 'NAME_CONTRACT_TYPE', 
       'NAME_EDUCATION_TYPE']]
X = pd.get_dummies(X, drop_first = True)

## Classification Choices

Logistic Regression, Gradient Boosted Trees?, Decision Tree Classifier, KNN

In [89]:
classification_models = [LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier()]

## Classification w/ Class Imbalance

First, we attempted to fit an SGD Classifier to the dataset. We also fit a logistic regression model to our data. However, because only ~8% of the data defaulted, we saw an obvious class imbalance. This led to our model predicting very few defaults (if at all).

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0

for alg in classification_models:
    predicted = alg.fit(X_train, y_train).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train, y_train), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(y_test, predicted),2)
    row_index+=1
    MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)  
    
MLA_compare

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
0,LogisticRegression,0.92,0.92,0.0,0.0,0.0
1,KNeighborsClassifier,0.92,0.91,0.17,0.02,0.03
2,DecisionTreeClassifier,0.98,0.86,0.12,0.11,0.11


From this, best one is KNeighborsClassifier - close train and test accuracy so probably no overfitting, precision and recall not equal to 0

In [91]:
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

train_pred = KNN.predict(X_train)
test_pred = KNN.predict(X_test)

In [92]:
confusion_matrix(y_train, train_pred)

array([[211214,    791],
       [ 17411,   1208]])

In [93]:
confusion_matrix(y_test, test_pred)

array([[70118,   551],
       [ 6093,   113]])

In [94]:
cross_val_score(KNN, X, y, cv = 5)

array([0.91274797, 0.9123252 , 0.91343089, 0.9122439 , 0.91273029])

## Classification w/ Random Oversampling

In [95]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_train_o, y_train_o = oversample.fit_resample(X_train, y_train)
print('After Oversampling:', Counter(y_train_o))

After Oversampling: Counter({0: 212005, 1: 212005})


In [96]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0

for alg in classification_models:
    predicted = alg.fit(X_train_o, y_train_o).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train_o, y_train_o), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(y_test, predicted),2)
    row_index+=1
    MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)  
    
MLA_compare

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
2,DecisionTreeClassifier,0.96,0.82,0.11,0.18,0.14
1,KNeighborsClassifier,0.9,0.73,0.11,0.32,0.16
0,LogisticRegression,0.53,0.58,0.09,0.47,0.15


From this, we think that DecisionTreeClassifier is the best - highest test accuracy?

In [97]:
dt_o = DecisionTreeClassifier().fit(X_train_o, y_train_o)
predict_train = dt_o.predict(X_train_o)

logreg_o.score(X_train_o, y_train_o)

0.5098724086696068

In [98]:
confusion_matrix(predict_train, y_train_o)

array([[199435,   3241],
       [ 12570, 208764]])

In [99]:
predict_test = logreg_o.predict(X_test)
logreg_o.score(X_test, y_test)

0.493450406504065

In [100]:
confusion_matrix(predict_test, y_test)

array([[34607,  2879],
       [36062,  3327]])

In [101]:
cross_val_score(dt_o, X, y, cv = 5)

array([0.86411382, 0.86445528, 0.86369106, 0.86369106, 0.86474577])

## Classification with SMOTE

In [102]:
smote = SMOTE(sampling_strategy='minority')
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)

In [103]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0

for alg in classification_models:
    predicted = alg.fit(X_train_s, y_train_s).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train_s, y_train_s), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(y_test, predicted),2)
    row_index+=1
    MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)  
    
MLA_compare

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
2,DecisionTreeClassifier,0.98,0.85,0.11,0.13,0.12
1,KNeighborsClassifier,0.88,0.76,0.11,0.27,0.15
0,LogisticRegression,0.52,0.6,0.09,0.44,0.15


Decision tree classifier is the best? - in terms of test accuracy, tradeoff though b/c precision and recall aren't the best

In [104]:
dt_s = DecisionTreeClassifier().fit(X_train_s, y_train_s)
predict_train = dt_s.predict(X_train_s)

dt_s.score(X_train_s, y_train_s)

0.9793188839885851

In [105]:
confusion_matrix(predict_train, y_train_s)

array([[208991,   5755],
       [  3014, 206250]])

In [106]:
predict_test = dt_s.predict(X_test)
dt_s.score(X_test, y_test)

0.8462178861788617

In [107]:
confusion_matrix(y_test, test_pred)

array([[70118,   551],
       [ 6093,   113]])

In [108]:
cross_val_score(dt_s, X, y, cv = 5)

array([0.86333333, 0.86426016, 0.86352846, 0.86396748, 0.86386771])

## Classfication with Undersampling

Note: results are going to change drastically once they're sampled 

In [109]:
X_y_train = X_y_train = pd.concat([X_train.reset_index(drop = True),
                       y_train.reset_index(drop = True)], axis = 1)

# Create data sets for defaults and non-defaults
nondefaults = X_y_train[X_y_train['TARGET'] == 0]
defaults = X_y_train[X_y_train['TARGET'] == 1]

# Undersample the non-defaults
nondefaults_under = nondefaults.sample(len(defaults))

# Concatenate the undersampled nondefaults with defaults
X_y_train_undersampled = pd.concat([nondefaults_under.reset_index(drop = True),
                             defaults.reset_index(drop = True)], axis = 0)

In [110]:
X_train_u = X_y_train_undersampled.drop(columns = ['TARGET'])
y_train_u = X_y_train_undersampled['TARGET']

In [111]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0

for alg in classification_models:
    predicted = alg.fit(X_train_u, y_train_u).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train_s, y_train_s), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(y_test, predicted),2)
    row_index+=1
    MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)  
    
MLA_compare

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
2,DecisionTreeClassifier,0.66,0.56,0.1,0.54,0.16
0,LogisticRegression,0.52,0.55,0.09,0.5,0.15
1,KNeighborsClassifier,0.65,0.55,0.1,0.55,0.16


For this one, KNeighborsClassifier is the best - highest train/test accuracy, precision, recall, and F1 score

In [112]:
KNN_u = KNeighborsClassifier()
KNN_u.fit(X_train_u, y_train_u)

train_pred = KNN_u.predict(X_train_u)
train_all_pred = KNN_u.predict(X_train)
test_pred = KNN_u.predict(X_test)

In [113]:
confusion_matrix(y_train_u, train_pred)

array([[13253,  5366],
       [ 5490, 13129]])

In [114]:
confusion_matrix(y_train, train_all_pred)

array([[119187,  92818],
       [  5490,  13129]])

In [115]:
confusion_matrix(y_test, test_pred)

array([[38885, 31784],
       [ 2795,  3411]])

In [116]:
cross_val_score(KNN_u, X, y, cv = 5)

array([0.91274797, 0.9123252 , 0.91343089, 0.9122439 , 0.91273029])

## Summary

Overall, best seems to be the Decision Tree classifier with random oversampling - it has a relatively high test accuracy (0.82), recall of 0.1, and precision of 0.17.