In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(context='paper', style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.model_selection import GridSearchCV

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

import pickle

In [2]:
# Load the cleaned, dummy-encoded data set; the first CSV column is used as the row index
df_3 = pd.read_csv('Cleaned_data_w_dummies.csv', index_col=0)

In [9]:
# Separate the target column ('Default') from the predictor columns
y = df_3['Default']
X = df_3.drop(columns='Default')
# Keep the predictor names at hand for later reference
feature_cols = X.columns

In [10]:
# Split the rows by outcome: Default == 1 versus Default == 0
default = df_3[df_3['Default'] == 1]
no_default = df_3[df_3['Default'] == 0]
# Tick labels for the confusion-matrix plot. sklearn's confusion_matrix orders
# its rows/columns by sorted label value (0 first, then 1), so the name for
# label 0 ('no_default') has to come first or the axes are mislabelled.
classes = ['no_default', 'default']

In [11]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=23,
)

In [12]:
# Standardise the predictors: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
final_scaler = scaler.fit(X_train)
# Carry each split's row index over to the scaled frame. Without `index=` the
# DataFrame constructor assigns a fresh 0..n-1 index, which no longer lines up
# with y_train / y_test for any label-based operation.
X_train = pd.DataFrame(data=scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X.columns, index=X_test.index)

## Decision Tree

In [7]:
# Decision tree classifier with class re-weighting and a capped tree size
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=6,
    max_leaf_nodes=16,
    min_samples_split=4,
    min_samples_leaf=8,
    class_weight='balanced',
    random_state=1,
)

# Train the tree on the scaled training split
clf.fit(X_train, y_train)

# Predictions for both splits, so training and test scores can be compared
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# F1 score on the training and on the test split
print("Training F1 Score:", metrics.f1_score(y_train, y_pred_train))
print("Testing F1 Score:", metrics.f1_score(y_test, y_pred_test))

Training F1 Score: 0.5232658072437077
Testing F1 Score: 0.5289079229122056


#### GridsearchCV with decision tree

In [8]:
# max_leaf_nodes_value = range(10, 21)

In [9]:
# Parameter grid for the decision-tree search; it is empty, so the search
# below only cross-validates the estimator exactly as it is configured
parameters = dict()

In [10]:
# Estimator to be cross-validated (same settings as the tree fitted above)
clf_cv = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=6,
    max_leaf_nodes=16,
    min_samples_split=4,
    min_samples_leaf=8,
    class_weight='balanced',
    random_state=1,
)

# 10-fold grid search scored by F1
grid_tree = GridSearchCV(clf_cv, parameters, scoring='f1', cv=10)

# Fit the grid search on the training data
grid_tree.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                              class_weight='balanced',
                                              criterion='entropy', max_depth=6,
                                              max_features=None,
                                              max_leaf_nodes=16,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=8,
                                              min_samples_split=4,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=1, splitter='best'),
             iid='deprecated', n_jobs=None, param_grid={},
             pre_dispatch='2*n_jobs', refit=True, return_trai

In [11]:
# Display the estimator selected (and refitted) by the decision-tree grid search
grid_tree.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=6, max_features=None,
                       max_leaf_nodes=16, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=8,
                       min_samples_split=4, min_weight_fraction_leaf=0.0,
                       presort='deprecated', random_state=1, splitter='best')

In [12]:
# Predict the response for the test dataset with the grid search's best estimator
y_pred = grid_tree.best_estimator_.predict(X_test)

# The value reported is the F1 score (not accuracy), so it is labelled as such,
# matching the label used for the plain decision tree above
print("Testing F1 Score:", metrics.f1_score(y_test, y_pred))

Accuracy: 0.5289079229122056


___

## Logistic Regression

In [13]:
# L1-penalised logistic regression with balanced class weights.
# The 'saga' solver shuffles the data while optimising, so a fixed random_state
# is needed for repeatable coefficients and scores; seed 1 matches the other
# models in this notebook.
lr_clf_weighted = LogisticRegression(penalty='l1', tol = .01, max_iter = 5000,
                                     solver='saga', class_weight='balanced',
                                     random_state = 1)

# Fit on the scaled training split
lr_clf_weighted.fit(X_train, y_train)

# Predicted classes for the held-out test split
y_weighted_test = lr_clf_weighted.predict(X_test)

In [14]:
# Accuracy, precision, recall and F1 of the weighted logistic regression on the test split
for label, scorer in (
    ('Test Accuracy score: ', accuracy_score),
    ('Precision score: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('Test F1 score: ', f1_score),
):
    print(label, scorer(y_test, y_weighted_test))

Test Accuracy score:  0.7790222222222222
Precision score:  0.5161064425770309
Recall score:  0.5717610550814585
Test F1 score:  0.54251012145749


#### Logistic Regression with Gridsearchcv

# Random Forest Model

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Random forest: 250 bootstrapped entropy trees, depth-limited, with balanced class weights
rfc = RandomForestClassifier(
    n_estimators=250,
    criterion='entropy',
    max_depth=8,
    min_samples_split=2,
    min_samples_leaf=8,
    bootstrap=True,
    class_weight='balanced',
    random_state=1,
    n_jobs=-1,
)

In [15]:
# Fit the forest on the training data, then predict the test split with it
rfc_pred = rfc.fit(X_train, y_train).predict(X_test)

In [16]:
# Accuracy, precision, recall and F1 of the random forest on the test split
for label, scorer in (
    ('Test Accuracy score: ', accuracy_score),
    ('Precision score: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('Test F1 score: ', f1_score),
):
    print(label, scorer(y_test, rfc_pred))

Test Accuracy score:  0.7898666666666667
Precision score:  0.53960029607698
Recall score:  0.5655546935608999
Test F1 score:  0.5522727272727272


In [17]:
# Show the fitted forest's importance value for every predictor; the
# "Feature importance Check" section pairs entry i with X.columns[i]
rfc.feature_importances_

array([5.66457346e-02, 3.63599124e-03, 1.69199921e-02, 4.26616471e-02,
       4.24987403e-02, 3.50427386e-02, 3.02781229e-02, 2.89676915e-02,
       3.25868118e-02, 1.38287344e-02, 1.17015094e-02, 7.76399602e-03,
       3.18526955e-03, 2.81595243e-03, 3.51456934e-03, 4.43831948e-03,
       1.86900489e-03, 2.36098950e-03, 3.57094110e-03, 8.17965708e-02,
       2.10784045e-02, 1.74148111e-01, 9.15619290e-03, 3.78247887e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.37387837e-05,
       2.71881533e-02, 0.00000000e+00, 1.08405562e-01, 5.28165313e-03,
       2.20770106e-04, 4.38200749e-05, 0.00000000e+00, 0.00000000e+00,
       1.27155192e-02, 0.00000000e+00, 6.64746333e-02, 1.33919813e-03,
       3.21440304e-04, 0.00000000e+00, 0.00000000e+00, 2.64141078e-04,
       0.00000000e+00, 8.14106430e-03, 0.00000000e+00, 4.43038262e-02,
       6.73561574e-04, 1.94387914e-04, 3.81553345e-05, 0.00000000e+00,
       9.19389890e-04, 0.00000000e+00, 4.99708258e-03, 4.59927389e-02,
      

#### Random forest GridsearchCV

In [37]:
# Parameter check 1
# Note from that check: best parameters n = 125, max depth = 7

# Candidate forest sizes and tree depths for the random-forest grid search
n_estimators_rfc = list(range(250, 351, 25))  # 250, 275, 300, 325, 350
max_depth_rfc = range(7, 10 + 1)              # depths 7 through 10

In [38]:
# Parameter check 1
# Note from that check: best parameters split = 2, leaf = 7, criterion = 'entropy'

# Candidate minimum leaf sizes: 5 through 10
min_samples_leaf_rfc = range(5, 10 + 1)

In [39]:
# Grid searched for the random forest: tree depth x number of trees
param_dict_rf = dict(max_depth=max_depth_rfc, n_estimators=n_estimators_rfc)

In [40]:
# Base forest for the search; depth and number of trees are supplied by the grid
rfc_cv = RandomForestClassifier(
    criterion='entropy',
    min_samples_split=2,
    min_samples_leaf=7,
    class_weight='balanced',
    random_state=1,
    n_jobs=-1,
)

# 10-fold grid search scored by F1
grid_tree_rfc = GridSearchCV(rfc_cv, param_dict_rf, scoring='f1', cv=10)

# Fit the grid search on the training data
grid_tree_rfc.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight='balanced',
                                              criterion='entropy',
                                              max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=7,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=Fals

In [41]:
# Display the estimator selected (and refitted) by the random-forest grid search
grid_tree_rfc.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=-1, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [46]:
# Predict the response for the test dataset with the grid search's best forest
y_pred_rfc = grid_tree_rfc.best_estimator_.predict(X_test)

# The value reported is the F1 score (not accuracy), so it is labelled as such
print("Testing F1 Score:", metrics.f1_score(y_test, y_pred_rfc))

Accuracy: 0.5518814139110604


___

# Plotting results on a confusion matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are drawn on the 'True label' axis and
        columns on the 'Predicted label' axis.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, every row is divided by its row sum so cells show the
        fraction of each true class. Rows that sum to zero stay all-zero.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The plot is drawn on the current pyplot figure.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A true class with no samples has a zero row sum; dividing by 1
        # instead keeps that row at 0 rather than producing NaN cells.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion Matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell; white text on dark cells, black on light ones
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it also makes room for the axis labels
    plt.tight_layout()

In [None]:
# Raw-count confusion matrix of the random forest's test predictions
plot_confusion_matrix(
    metrics.confusion_matrix(y_test, rfc_pred),
    classes,
    normalize=False,
    title='Confusion matrix',
    cmap=plt.cm.Blues,
)

___

# Aggregating models using voting classifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Members of the voting ensemble: one random forest, one logistic regression
# and one decision tree.

rfc_en = RandomForestClassifier(criterion = 'entropy', max_depth = 7, n_estimators = 150,
                             min_samples_leaf = 7, min_samples_split = 2, random_state = 1,
                             class_weight='balanced')

# Same configuration as the standalone logistic regression fitted earlier:
# balanced class weights (the other two members are balanced as well), plus a
# fixed seed because the 'saga' solver is stochastic.
lr_clf_en = LogisticRegression(penalty='l1', tol = .01, max_iter = 5000,
                                     solver='saga', class_weight='balanced',
                                     random_state = 1)

clf_en = DecisionTreeClassifier(criterion = "entropy", max_depth = 6, min_samples_split = 4,
                             min_samples_leaf = 8, max_leaf_nodes = 16,
                             class_weight = 'balanced', random_state = 1)

In [None]:
# Majority-vote ('hard') ensemble of the three classifiers
ensemble_members = [('lr', lr_clf_en), ('rf', rfc_en), ('dt', clf_en)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

# Fit on the training data, then predict the response for the test dataset
y_pred_vclf = voting_clf.fit(X_train, y_train).predict(X_test)

In [None]:
# Accuracy, precision, recall and F1 of the voting ensemble on the test split
for label, scorer in (
    ('Test Accuracy score: ', accuracy_score),
    ('Precision score: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('Test F1 score: ', f1_score),
):
    print(label, scorer(y_test, y_pred_vclf))

### Testing again with only the features that were not given zero importance by the random forest

In [None]:
# Build the reduced inputs in this cell so it runs top to bottom: `y2` was
# never defined anywhere, and `New_X` was only created further down.
# Keep the predictors to which the fitted forest `rfc` gave non-zero importance.
New_X = X.loc[:, rfc.feature_importances_ != 0]
# Take the target straight from the source frame
y2 = df_3['Default']

# Same split settings as the first split (25% test, seed 23)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(New_X, y2, test_size=0.25,
                                                            random_state=23)

In [None]:
# Standardise the reduced feature set; the scaler is fitted on the training split only
scaler_2 = StandardScaler()
scaler_2.fit(X_train_2)
# Reuse each split's own columns and row index, so the scaled frames stay
# aligned with y_train_2 / y_test_2 and the cell no longer depends on New_X
X_train_2 = pd.DataFrame(data=scaler_2.transform(X_train_2),
                         columns=X_train_2.columns, index=X_train_2.index)
X_test_2 = pd.DataFrame(data=scaler_2.transform(X_test_2),
                        columns=X_test_2.columns, index=X_test_2.index)

### Random Forest with cleaned up DataFrame

In [None]:
# Random forest for the reduced feature set
rfc_2 = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=8,
    min_samples_split=4,
    min_samples_leaf=6,
    bootstrap=True,
    class_weight='balanced',
    random_state=1,
    n_jobs=-1,
)

In [None]:
# Fit on the reduced training data, then predict the reduced test split
rfc_pred_2 = rfc_2.fit(X_train_2, y_train_2).predict(X_test_2)

In [None]:
# Accuracy, precision, recall and F1 of the reduced-feature forest on its test split
for label, scorer in (
    ('Test Accuracy score: ', accuracy_score),
    ('Precision score: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('Test F1 score: ', f1_score),
):
    print(label, scorer(y_test_2, rfc_pred_2))

In [None]:
# Show the importance values of the forest fitted on the reduced feature set
rfc_2.feature_importances_

### More Gridsearch 

In [None]:
# Wider grid for the second search: depths 6-10 and 200-600 trees in steps of 100
param_grid_2 = dict(
    max_depth=range(6, 11),
    n_estimators=[200, 300, 400, 500, 600],
)

In [None]:
# Base forest for the second search; depth and number of trees come from param_grid_2
rfc_cv_2 = RandomForestClassifier(criterion = 'entropy',
                             min_samples_leaf = 7, min_samples_split = 2, random_state = 1,
                             class_weight='balanced', n_jobs = -1)

# 10-fold grid search scored by F1
grid_tree_rfc_2 = GridSearchCV(rfc_cv_2, param_grid_2, cv=10, scoring='f1')

# This search belongs to the reduced-feature experiment, so it is fitted on the
# reduced training split rather than on the full-feature X_train / y_train
grid_tree_rfc_2.fit(X_train_2, y_train_2)

In [None]:
# Show the winner of the search fitted just above (grid_tree_rfc_2), not the
# result of the earlier grid_tree_rfc search
grid_tree_rfc_2.best_estimator_

### Feature importance Check

In [66]:
# Map every predictor name to the importance the fitted forest `rfc` gave it
feature_dict = dict(zip(X.columns, rfc.feature_importances_))

In [67]:
# Keep the names of the predictors whose importance is non-zero.
# The loop variables deliberately avoid the names `x` / `y`: the original loop
# rebound `y`, replacing the target Series with the last importance value.
final_features = [name for name, importance in feature_dict.items() if importance != 0]

In [68]:
# Predictor frame restricted to the selected columns
New_X = X.loc[:, final_features]

In [69]:
# Number of predictors that survived the selection
New_X.shape[1]

51

In [70]:
# Persist the reduced predictor frame. The file is named for the features the
# random forest selected, so it must hold New_X rather than the full X.
with open('Features_Selected_by_RF', 'wb') as handle:
    pickle.dump(New_X, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [74]:
# Persist the target column. It is read straight from df_3 because the name
# `y` is reused as a loop variable in the feature-selection cell above and may
# no longer refer to the target Series by the time this cell runs.
with open('Default_values', 'wb') as handle:
    pickle.dump(df_3['Default'], handle, protocol=pickle.HIGHEST_PROTOCOL)