In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
# Global seaborn theme for every figure in the notebook: 'paper' context,
# 'darkgrid' style, white figure background, fonts scaled by 1.2.
sns.set(context='paper', style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.model_selection import GridSearchCV

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

import pickle

In [2]:
# Restore the pickled feature matrix into X.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
with open('Features_Selected_by_RF', mode='rb') as features_file:
    X = pickle.load(features_file)

In [3]:
# Restore the pickled target values into y.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
with open('Default_values', mode='rb') as target_file:
    y = pickle.load(target_file)

In [49]:
# Append the target to the feature frame as a 'Default' column.
# NOTE: this modifies X in place, so from here on X contains the target; any
# later use of X as a model input must exclude the 'Default' column.
X['Default'] = y

In [50]:
# df_final is a second name for the same DataFrame object as X (no copy is made),
# so in-place changes made through either name are visible through both.
df_final = X

In [51]:
# Write the combined features + 'Default' frame to a CSV file in the working directory.
df_final.to_csv('Final_data_for_model.csv')

___

### Train Test Split

In [39]:
# Split on the features only. An earlier cell appends the target to X as a
# 'Default' column for the CSV export; on a top-to-bottom run that column would
# otherwise be passed to the model as a feature (target leakage).
# errors='ignore' makes the drop a no-op when the column is not present, and
# rebinding X leaves the exported df_final frame untouched.
X = X.drop(columns='Default', errors='ignore')

# 75% / 25% train/test partition with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [40]:
# Learn the standardisation statistics from the training split only, then apply
# the same transform to both splits so no test-set statistics reach the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
final_scaler = scaler  # fit() returns the estimator itself, so this is the same object

# transform() yields plain arrays; wrap them back into DataFrames carrying X's column names.
X_train, X_test = [
    pd.DataFrame(data=scaler.transform(split), columns=X.columns)
    for split in (X_train, X_test)
]

___

## Best Model for the data

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
# Random forest configuration used as the "best model" in this section.
rfc = RandomForestClassifier(
    n_estimators=250,
    criterion='entropy',
    max_depth=8,
    min_samples_split=2,
    min_samples_leaf=8,
    bootstrap=True,
    class_weight='balanced',
    random_state=1,   # fixed seed so the forest is reproducible
    n_jobs=-1,        # use every available core
)

In [43]:
#fit the model to the training data
rfc.fit(X_train, y_train)
#use the fitted model to predict on the test data
rfc_pred = rfc.predict(X_test)

In [44]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, rfc_pred))

# checking precision
print('Precision score: ', metrics.precision_score(y_test, rfc_pred))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test, rfc_pred))

# checking accuracy
print('Test F1 score: ', f1_score(y_test, rfc_pred))

Test Accuracy score:  0.7893333333333333
Precision score:  0.5383480825958702
Recall score:  0.5663304887509697
Test F1 score:  0.551984877126654


___

## More feature engineering on data without dummies for all of the status values

In [52]:
# Load the dataset for this section; the first CSV column is used as the row index.
df = pd.read_csv('Data_wo_status_dummies.csv', index_col= 0)

In [68]:
# Separate the target column from the predictors.
y_2 = df['Default']
X_2 = df.drop(columns='Default')  # columns= already selects the column axis
feature_cols_2 = X_2.columns

In [69]:
# 75% / 25% train/test partition with a fixed seed for reproducibility.
split_2 = train_test_split(X_2, y_2, test_size=0.25, random_state=23)
X_train_2, X_test_2, y_train_2, y_test_2 = split_2

In [70]:
# Fit the scaler on the training split only, then standardise both splits with it.
scaler_2 = StandardScaler().fit(X_train_2)

# transform() yields plain arrays; wrap them back into DataFrames carrying X_2's column names.
X_train_2, X_test_2 = [
    pd.DataFrame(data=scaler_2.transform(split), columns=X_2.columns)
    for split in (X_train_2, X_test_2)
]

In [98]:
# Forest for the second feature set: many shallow trees (depth 3, 10,000 estimators).
rfc_2 = RandomForestClassifier(
    n_estimators=10000,
    criterion='entropy',
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=5,
    bootstrap=True,
    class_weight='balanced',
    random_state=1,   # fixed seed so the forest is reproducible
    n_jobs=-1,        # use every available core
)

In [None]:
#fit the model to the training data
rfc_2.fit(X_train_2, y_train_2)
#use the fitted model to predict on the test data
rfc_pred_2 = rfc_2.predict(X_test_2)

In [None]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test_2, rfc_pred_2))

# checking precision
print('Precision score: ', metrics.precision_score(y_test_2, rfc_pred_2))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test_2, rfc_pred_2))

# checking accuracy
print('Test F1 score: ', f1_score(y_test_2, rfc_pred_2))

___

## Binning the status columns

In [53]:
# Collapse each monthly status column to a binary flag: 1 when the value is >= 1, else 0.
for status_col in (
    'September_status',
    'August_status',
    'July_status',
    'June_status',
    'May_status',
    'April_status',
):
    df[status_col] = np.where(df[status_col] >= 1, 1, 0)

In [60]:
# Interaction features: for each month, status flag * balance.
#
# The column product is computed element-wise on whole columns. The previous
# per-row list comprehension looked values up with df[col][i], which is a
# *label* lookup driven by a positional counter: it raises KeyError or silently
# pairs the wrong rows whenever the index is not exactly 0..n-1, and it is far
# slower than the vectorised form. For a 0..n-1 index both give the same values.
for month, suffix in (
    ('September', 'sep'),
    ('August', 'aug'),
    ('July', 'jul'),
    ('June', 'jun'),
    ('May', 'may'),
    ('April', 'apr'),
):
    df['Status_balance_' + suffix] = df[month + '_status'] * df[month + '_balance']

In [67]:
# Remove the six monthly status columns from df (in place).
status_columns = [
    month + '_status'
    for month in ('September', 'August', 'July', 'June', 'May', 'April')
]
df.drop(columns=status_columns, inplace=True)

In [17]:
# Separate the target column from the predictors.
y_3 = df['Default']
X_3 = df.drop(columns='Default')  # columns= already selects the column axis
feature_cols_3 = X_3.columns

In [18]:
# Degree-2 polynomial expansion (original columns, their squares and pairwise
# products); include_bias=False leaves out the constant column.
poly= PolynomialFeatures(degree=2, include_bias=False)

In [19]:
# Expand X_3 into its degree-2 polynomial features.
X_poly = poly.fit_transform(X_3)

# Column names for the expanded matrix. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# only fall back to the old method on releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_columns = list(poly.get_feature_names_out(X_3.columns))
else:
    poly_columns = poly.get_feature_names(X_3.columns)

In [20]:
# Wrap the expanded array in a DataFrame with the generated feature names.
# No index is passed, so the frame gets a fresh default integer index.
X_poly_df = pd.DataFrame(X_poly, columns = poly_columns)

In [21]:
# Display the expanded feature frame as the cell output.
X_poly_df

Unnamed: 0,LIMIT_BAL,SEX,AGE,September_balance,Min_payment_sep,Min_payment_aug,Min_payment_jul,Min_payment_jun,Min_payment_may,Min_payment_apr,...,edu_3^2,edu_3 edu_4,edu_3 marriage_2,edu_3 Overall_status,edu_4^2,edu_4 marriage_2,edu_4 Overall_status,marriage_2^2,marriage_2 Overall_status,Overall_status^2
0,200000.0,0.0,30.0,133727.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,36.0
1,200000.0,0.0,27.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,20000.0,0.0,28.0,-416.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,50000.0,1.0,23.0,47099.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,20000.0,1.0,47.0,-390.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22495,50000.0,0.0,50.0,19956.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22496,80000.0,0.0,53.0,46158.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22497,50000.0,0.0,50.0,49142.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22498,220000.0,1.0,32.0,198264.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# 75% / 25% train/test partition with a fixed seed for reproducibility.
# The target is y_3, the 'Default' column taken from the same frame X_3 (and
# hence X_poly_df) was built from; the previous code reached back to y_2 from
# the earlier section, a hidden cross-section dependency.
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(
    X_poly_df, y_3, test_size=0.25, random_state=23
)

In [23]:
# Fit the scaler on the training split only, then standardise both splits with it.
scaler_poly = StandardScaler().fit(X_train_poly)

# transform() yields plain arrays; wrap them back into DataFrames carrying X_poly_df's column names.
X_train_poly, X_test_poly = [
    pd.DataFrame(data=scaler_poly.transform(split), columns=X_poly_df.columns)
    for split in (X_train_poly, X_test_poly)
]

In [36]:
# Forest for the polynomial feature set. bootstrap=False: every tree is built
# from the full training set rather than a bootstrap sample.
rfc_poly = RandomForestClassifier(
    n_estimators=250,
    criterion='entropy',
    max_depth=7,
    min_samples_split=2,
    min_samples_leaf=8,
    bootstrap=False,
    class_weight='balanced',
    random_state=1,   # fixed seed so the forest is reproducible
    n_jobs=-1,        # use every available core
)

In [37]:
#fit the model to the training data
rfc_poly.fit(X_train_poly, y_train_poly)
#use the fitted model to predict on the test data
rfc_pred_poly = rfc_poly.predict(X_test_poly)

In [38]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test_poly, rfc_pred_poly))

# checking precision
print('Precision score: ', metrics.precision_score(y_test_poly, rfc_pred_poly))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test_poly, rfc_pred_poly))

# checking accuracy
print('Test F1 score: ', f1_score(y_test_poly, rfc_pred_poly))

Test Accuracy score:  0.7368888888888889
Precision score:  0.4476138233680746
Recall score:  0.6330488750969744
Test F1 score:  0.5244215938303342
