In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from collections import Counter

In [2]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score

from imblearn.over_sampling import RandomOverSampler, SMOTE

## Uploading, Cleaning Data

In [3]:
df = pd.read_csv('application_data_min.csv')

In [4]:
# Cleaning for features that are used in dataset
df['CNT_FAM_MEMBERS'].describe()
df.loc[df['CNT_FAM_MEMBERS'].isnull()] = 0
df = df.dropna(subset = ['AMT_ANNUITY'])

## Target Analysis

In [5]:
# TARGET = our predicted value, 1 = default, 0 = no default
y = df['TARGET']
print(y.mean())
print(y.value_counts())

0.08073196986006459
0    282674
1     24825
Name: TARGET, dtype: int64


## Preprocessing Data

For our classification model, we chose the following features:

- AMT_INCOME_TOTAL
- AMT_CREDIT
- AMT_ANNUITY
- CODE_GENDER
- CNT_FAM_MEMBERS
- NAME_CONTRACT_TYPE
- NAME_EDUCATION_TYPE

We wanted to limit the number of features that the user would have to input in our application. In addition, we wanted to choose features that we believed could be relevant in predicting card defaults/delayed card payments, as well as features that could be easily remembered by users.

In [6]:
# Columns: Count_fam_members, income, credit, annuity, contract type, gender, education
X = df.drop(columns = ['TARGET'])
X = pd.get_dummies(X, drop_first = True)

## Classification Choices

Logistic Regression, Gradient Boosted Trees?, Decision Tree Classifier, KNN

In [7]:
classification_models = [LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier()]

## Classification w/ Class Imbalance

First, we attempted to fit an SGD Classifier to the dataset. We also fit a logistic regression model to our data. However, because only ~8% of the data defaulted, we saw an obvious class imbalance. This led to our model predicting very few defaults (if at all).

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [8]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0

for alg in classification_models:
    predicted = alg.fit(X_train, y_train).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train, y_train), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(y_test, predicted),2)
    row_index+=1
    MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)  
    
MLA_compare

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
0,LogisticRegression,0.92,0.92,0.0,0.0,0.0
1,KNeighborsClassifier,0.92,0.91,0.15,0.02,0.03
3,RandomForestClassifier,0.98,0.91,0.15,0.03,0.05
2,DecisionTreeClassifier,0.98,0.86,0.12,0.11,0.11


From this, best one is KNeighborsClassifier - close train and test accuracy so probably no overfitting, precision and recall not equal to 0

In [9]:
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

train_pred = KNN.predict(X_train)
test_pred = KNN.predict(X_test)

In [10]:
confusion_matrix(y_train, train_pred)

array([[211169,    836],
       [ 17318,   1301]])

In [11]:
confusion_matrix(y_test, test_pred)

array([[70030,   639],
       [ 6094,   112]])

In [12]:
cross_val_score(KNN, X, y, cv = 5)

array([0.91274797, 0.9123252 , 0.91343089, 0.9122439 , 0.91273029])

## Classification w/ Random Oversampling

In [9]:
oversample = RandomOverSampler(sampling_strategy='minority', random_state = 42)
X_train_o, y_train_o = oversample.fit_resample(X_train, y_train)
print('After Oversampling:', Counter(y_train_o))

After Oversampling: Counter({0: 212005, 1: 212005})


In [14]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0

for alg in classification_models:
    predicted = alg.fit(X_train_o, y_train_o).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train_o, y_train_o), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(y_test, predicted),2)
    row_index+=1
    MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)  
    
MLA_compare

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
3,RandomForestClassifier,0.96,0.85,0.12,0.14,0.13
2,DecisionTreeClassifier,0.96,0.82,0.11,0.18,0.14
1,KNeighborsClassifier,0.9,0.73,0.1,0.31,0.16
0,LogisticRegression,0.51,0.49,0.08,0.53,0.14


From this, we think that DecisionTreeClassifier is the best - while test accuracy is lower, it has a higher recall and f1 score.

In [16]:
dt_o = DecisionTreeClassifier().fit(X_train_o, y_train_o)
predict_train = dt_o.predict(X_train_o)

dt_o.score(X_train_o, y_train_o)

0.9629772882715031

In [17]:
confusion_matrix(predict_train, y_train_o)

array([[199628,   3321],
       [ 12377, 208684]])

In [19]:
predict_test = dt_o.predict(X_test)
dt_o.score(X_test, y_test)

0.8184975609756098

In [20]:
confusion_matrix(predict_test, y_test)

array([[61810,  5094],
       [ 8859,  1112]])

In [21]:
cross_val_score(dt_o, X, y, cv = 5)

array([0.86343089, 0.8640813 , 0.86452033, 0.8636748 , 0.86393275])

## Classification with SMOTE

In [22]:
smote = SMOTE(sampling_strategy='minority', random_state = 42)
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)

In [23]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0

for alg in classification_models:
    predicted = alg.fit(X_train_s, y_train_s).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train_s, y_train_s), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(y_test, predicted),2)
    row_index+=1
    MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)  
    
MLA_compare

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
3,RandomForestClassifier,0.98,0.89,0.13,0.07,0.09
2,DecisionTreeClassifier,0.98,0.84,0.1,0.12,0.11
1,KNeighborsClassifier,0.88,0.75,0.1,0.26,0.14
0,LogisticRegression,0.53,0.58,0.09,0.46,0.15


Decision tree classifier is the best? - second highest test accuracy, but better in terms of recall and f1 score than RandomForestClassifier

In [24]:
dt_s = DecisionTreeClassifier().fit(X_train_s, y_train_s)
predict_train = dt_s.predict(X_train_s)

dt_s.score(X_train_s, y_train_s)

0.979590104006981

In [25]:
confusion_matrix(predict_train, y_train_s)

array([[208908,   5557],
       [  3097, 206448]])

In [26]:
predict_test = dt_s.predict(X_test)
dt_s.score(X_test, y_test)

0.8443967479674797

In [27]:
confusion_matrix(y_test, test_pred)

array([[70030,   639],
       [ 6094,   112]])

In [28]:
cross_val_score(dt_s, X, y, cv = 5)

array([0.86386992, 0.86463415, 0.86460163, 0.86304065, 0.86485959])

## Classfication with Undersampling

Note: results are going to change drastically once they're sampled 

In [29]:
X_y_train = X_y_train = pd.concat([X_train.reset_index(drop = True),
                       y_train.reset_index(drop = True)], axis = 1)

# Create data sets for defaults and non-defaults
nondefaults = X_y_train[X_y_train['TARGET'] == 0]
defaults = X_y_train[X_y_train['TARGET'] == 1]

# Undersample the non-defaults
nondefaults_under = nondefaults.sample(len(defaults), random_state = 42)

# Concatenate the undersampled nondefaults with defaults
X_y_train_undersampled = pd.concat([nondefaults_under.reset_index(drop = True),
                             defaults.reset_index(drop = True)], axis = 0)

In [30]:
X_train_u = X_y_train_undersampled.drop(columns = ['TARGET'])
y_train_u = X_y_train_undersampled['TARGET']

In [31]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0

for alg in classification_models:
    predicted = alg.fit(X_train_u, y_train_u).predict(X_test)
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Model Name'] = MLA_name
    MLA_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train_s, y_train_s), 2)
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(alg.score(X_test, y_test), 2)
    MLA_compare.loc[row_index, 'Precision'] = round(precision_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'Recall'] = round(recall_score(y_test, predicted),2)
    MLA_compare.loc[row_index, 'F1 score'] = round(f1_score(y_test, predicted),2)
    row_index+=1
    MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)  
    
MLA_compare

Unnamed: 0,Model Name,Train Accuracy,Test Accuracy,Precision,Recall,F1 score
3,RandomForestClassifier,0.69,0.56,0.1,0.56,0.17
1,KNeighborsClassifier,0.65,0.55,0.1,0.55,0.16
2,DecisionTreeClassifier,0.66,0.55,0.09,0.53,0.16
0,LogisticRegression,0.52,0.49,0.08,0.53,0.14


For this one, RandomForestClassifier is the best - best test accuracy, precision, recall, and f1 score

In [32]:
rfc_u = RandomForestClassifier()
rfc_u.fit(X_train_u, y_train_u)

train_pred = rfc_u.predict(X_train_u)
train_all_pred = rfc_u.predict(X_train)
test_pred = rfc_u.predict(X_test)

In [33]:
confusion_matrix(y_train_u, train_pred)

array([[13129,  5490],
       [ 5386, 13233]])

In [34]:
confusion_matrix(y_train, train_all_pred)

array([[118854,  93151],
       [  5386,  13233]])

In [35]:
confusion_matrix(y_test, test_pred)

array([[38505, 32164],
       [ 2811,  3395]])

In [36]:
cross_val_score(rfc_u, X, y, cv = 5)

array([0.91274797, 0.9123252 , 0.91343089, 0.9122439 , 0.91273029])

## Summary

Overall, best seems to be the Decision Tree classifier with random oversampling - it has a relatively high test accuracy (0.82), recall of 0.1, and precision of 0.17.

In [10]:
dt_o = DecisionTreeClassifier().fit(X_train_o, y_train_o)
predict_train = dt_o.predict(X_train_o)

dt_o.score(X_train_o, y_train_o)

1.0

In [11]:
dt_o.feature_importances_

array([3.54225177e-01, 1.64881187e-01, 1.70879247e-01, 2.21903022e-01,
       4.62213125e-02, 4.23717898e-03, 1.54354952e-03, 1.57100338e-03,
       8.51505591e-03, 0.00000000e+00, 3.18458508e-05, 1.22957999e-02,
       4.72382566e-03, 2.46527320e-03, 6.50652232e-03])