In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(context='paper', style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.model_selection import GridSearchCV

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

import pickle

In [2]:
df = pd.read_csv('Final_data_2_for_model.csv', index_col = 0)

In [3]:
X = df.drop(columns = 'Default', axis =1)
y = df['Default']

### Train Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [5]:
scaler = StandardScaler()
final_scaler = scaler.fit(X_train)
X_train = pd.DataFrame(data=scaler.transform(X_train), columns = X.columns)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns = X.columns)

In [None]:
d

### Random Forest

In [16]:
rfc = RandomForestClassifier(criterion = 'entropy', max_depth = 8, n_estimators = 250, 
                             min_samples_leaf = 8, min_samples_split = 2, random_state = 20, 
                             class_weight='balanced', bootstrap=True, n_jobs=-1)

In [17]:
#fit the model to the training data
rfc.fit(X_train, y_train)
#use the fitted model to predict on the test data
rfc_pred = rfc.predict(X_test)

In [18]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, rfc_pred))

# checking precision
print('Precision score: ', metrics.precision_score(y_test, rfc_pred))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test, rfc_pred))

# checking accuracy
print('Test F1 score: ', f1_score(y_test, rfc_pred))

Test Accuracy score:  0.7774222222222222
Precision score:  0.5126625598904859
Recall score:  0.5810705973622964
Test F1 score:  0.5447272727272727


### Decision Tree

In [9]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion = "entropy", max_depth = 6, min_samples_split = 4, 
                             min_samples_leaf = 8, max_leaf_nodes = 16, 
                             class_weight = 'balanced', random_state = 1)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#predict the training set
y_pred_train = clf.predict(X_train)

#Predict the response for test dataset
y_pred_test = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Training F1 Score:",metrics.f1_score(y_train, y_pred_train))
print("Testing F1 Score:",metrics.f1_score(y_test, y_pred_test))

Training F1 Score: 0.4972386587771203
Testing F1 Score: 0.5007202535292423


### Logistic Regression Model

In [10]:
lr = LogisticRegression(penalty='l1', tol = .01, max_iter = 5000, 
                                     solver='saga', class_weight='balanced')

lr.fit(X_train, y_train)

y_weighted_test = lr.predict(X_test)

In [11]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, y_weighted_test))

# checking precision
print('Precision score: ', metrics.precision_score(y_test, y_weighted_test))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test, y_weighted_test))

# checking accuracy
print('Test F1 score: ', f1_score(y_test, y_weighted_test))

Test Accuracy score:  0.7091555555555555
Precision score:  0.4105208870551831
Recall score:  0.6175329712955779
Test F1 score:  0.49318463444857497


### KNN Model

In [12]:
knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

In [13]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, y_pred_knn))

# checking precision
print('Precision score: ', metrics.precision_score(y_test, y_pred_knn))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test, y_pred_knn))

# checking accuracy
print('Test F1 score: ', f1_score(y_test, y_pred_knn))

Test Accuracy score:  0.7806222222222222
Precision score:  0.5490196078431373
Recall score:  0.23894491854150504
Test F1 score:  0.33297297297297296


### Voting Classifier

In [25]:
voting_clf = VotingClassifier(estimators = [('rf', rfc), ('lr', lr),
                                            ('dt', clf), ('knn', knn)], voting = 'hard', 
                              weights = [.55, .175, .2, .075])

# fitting the training data
voting_clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred_vclf = voting_clf.predict(X_test)

In [26]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, y_pred_vclf))

# checking precision
print('Precision score: ', metrics.precision_score(y_test, y_pred_vclf))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test, y_pred_vclf))

# checking accuracy
print('Test F1 score: ', f1_score(y_test, y_pred_vclf))

Test Accuracy score:  0.7774222222222222
Precision score:  0.5126625598904859
Recall score:  0.5810705973622964
Test F1 score:  0.5447272727272727
