In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(context='paper', style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

from sklearn.metrics import confusion_matrix
import itertools

from sklearn.model_selection import GridSearchCV

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

import pickle

In [3]:
# Read the model-input CSV; its first column is used as the DataFrame index.
df = pd.read_csv(
    'Final_data_1_for_model.csv',
    index_col=0,
)

In [None]:
# NOTE(review): disabled helper, kept for reference only — nothing in the visible
# notebook calls it. As written, it would replace every value lying more than
# 4 standard deviations from its column mean with (mean + 4*std). That also applies
# to extreme LOW values, which would be moved to the upper cap — confirm that is
# intended (or clip symmetrically) before re-enabling.
# def ext_values(df, extreme_cols):
#     new_df = df.copy()
#     for col in extreme_cols:
#         std = new_df[col].std()
#         mean = new_df[col].mean()
#         value = mean+(4*std)
#         new_df[col] = new_df[col].apply(lambda x: value if (np.abs(x-mean) > 4*std) else x)
#     return new_df

In [None]:
# NOTE(review): disabled call to the commented-out `ext_values` helper. `continuous_cols`
# is not defined in the visible cells and would have to exist before re-enabling this.
# df[continuous_cols] = ext_values(df[continuous_cols], df[continuous_cols].columns)

In [4]:
# Separate the target column ('Default') from the feature matrix.
y = df['Default']
X = df.drop(columns=['Default'])

In [5]:
# Disabled exploratory plot: splits the feature columns into groups of n (= 4) and,
# for each group, would draw a seaborn regression pairplot of 'Default' (y axis)
# against those features (x axes).
# n = 4
# row_groups= [X.columns[i:i+n] for i in range(0, len(X.columns), n) ]

# for i in row_groups:
#     pp = sns.pairplot(data=df, y_vars=['Default'],x_vars=i, kind="reg", height=3)

### Train Test Split

In [6]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on `y` — consider `stratify=y` if the
# 'Default' classes are imbalanced (several models below use class_weight='balanced').
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=23,
)

In [7]:
# Standardise the features. The scaler is fitted on the training split only, so no
# test-set statistics are used when scaling.
scaler = StandardScaler()
final_scaler = scaler.fit(X_train)  # fit() returns the scaler itself: both names are one object

# Rebuild DataFrames from the scaled arrays, keeping each split's ORIGINAL row index so
# X_train / X_test stay label-aligned with y_train / y_test. Without `index=` the frames
# get a fresh 0..n-1 index, which silently misaligns any index-based mask, join or concat
# against the targets.
# NOTE: this cell overwrites X_train / X_test. Re-run the split cell before re-running
# it; otherwise the scaler is refitted on already-scaled data and `final_scaler` no
# longer describes the raw features.
X_train = pd.DataFrame(
    data=final_scaler.transform(X_train),
    columns=X.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    data=final_scaler.transform(X_test),
    columns=X.columns,
    index=X_test.index,
)

### Random Forest

In [None]:
# Class-balanced random forest: 250 entropy-split trees, depth capped at 8,
# at least 8 samples per leaf, seeded for repeatability, trained on all cores.
rfc = RandomForestClassifier(
    n_estimators=250,
    criterion='entropy',
    max_depth=8,
    min_samples_split=2,
    min_samples_leaf=8,
    bootstrap=True,
    class_weight='balanced',
    random_state=23,
    n_jobs=-1,
)

In [None]:
#fit the model to the training data
rfc.fit(X_train, y_train)
#use the fitted model to predict on the test data
rfc_pred = rfc.predict(X_test)

In [None]:
# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, rfc_pred))

# checking precision
print('Precision score: ', metrics.precision_score(y_test, rfc_pred))

# Checking recall
print('Recall score: ', metrics.recall_score(y_test, rfc_pred))

# checking accuracy
print('Test F1 score: ', f1_score(y_test, rfc_pred))

### Decision Tree

In [None]:
# Class-balanced decision tree, limited in depth (6) and leaf count (16).
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=6,
    max_leaf_nodes=16,
    min_samples_split=4,
    min_samples_leaf=8,
    class_weight='balanced',
    random_state=1,
)

# Train on the training split (fit() returns the same estimator object).
clf.fit(X_train, y_train)

# Predict on both splits so train and test scores can be compared.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# F1 on each split; a large train/test gap would point to over-fitting.
print("Training F1 Score:", metrics.f1_score(y_train, y_pred_train))
print("Testing F1 Score:", metrics.f1_score(y_test, y_pred_test))

### Logistic Regression Model

In [None]:
# L1-regularised, class-balanced logistic regression.
# The 'saga' solver shuffles the data while optimising, so without a seed the fitted
# coefficients (and every score derived from them) change from run to run. The seed
# matches the one used for the train/test split and the random forest.
lr = LogisticRegression(
    penalty='l1',
    solver='saga',
    tol=.01,
    max_iter=5000,
    class_weight='balanced',
    random_state=23,
)

# Train on the training split.
lr.fit(X_train, y_train)

# Predicted labels for the held-out test split.
y_weighted_test = lr.predict(X_test)

In [None]:
# Report accuracy, precision, recall and F1 for the logistic-regression test predictions.
for label, scorer in (
    ('Test Accuracy score: ', accuracy_score),
    ('Precision score: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('Test F1 score: ', f1_score),
):
    print(label, scorer(y_test, y_weighted_test))

### KNN Model

In [None]:
# 7-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=7)

# Fit on the training split and predict the test split in one chained call
# (fit() returns the fitted estimator).
y_pred_knn = knn.fit(X_train, y_train).predict(X_test)

In [None]:
# Report accuracy, precision, recall and F1 for the KNN test predictions.
for label, scorer in (
    ('Test Accuracy score: ', accuracy_score),
    ('Precision score: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('Test F1 score: ', f1_score),
):
    print(label, scorer(y_test, y_pred_knn))

### Voting Classifier

In [None]:
# Ensemble members and their vote weights, listed in matching order:
# random forest, logistic regression, decision tree, KNN.
voting_members = [('rf', rfc), ('lr', lr), ('dt', clf), ('knn', knn)]
voting_weights = [.55, .175, .2, .075]

# Hard voting: the ensemble combines the members' predicted class labels, weighted as above.
voting_clf = VotingClassifier(
    estimators=voting_members,
    voting='hard',
    weights=voting_weights,
)

# Train the ensemble on the training split.
voting_clf.fit(X_train, y_train)

# Predicted labels for the held-out test split.
y_pred_vclf = voting_clf.predict(X_test)

In [None]:
# Report accuracy, precision, recall and F1 for the voting-ensemble test predictions.
for label, scorer in (
    ('Test Accuracy score: ', accuracy_score),
    ('Precision score: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('Test F1 score: ', f1_score),
):
    print(label, scorer(y_test, y_pred_vclf))

### XGBoost (Extreme Gradient Boosting)

In [None]:
# Baseline XGBoost classifier with hand-picked settings: 500 trees of depth 6,
# learning rate 0.2, seeded, using all cores.
# NOTE(review): unlike the sklearn models above, no class weighting is configured here.
xgb_clf = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=.2,
    random_state=23,
    n_jobs=-1,
    verbosity=1,
)

In [None]:
# Train the baseline XGBoost model on the training split.
xgb_clf.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out test split.
y_pred_xgb = xgb_clf.predict(
    X_test,
)

In [None]:
# Report accuracy, precision, recall and F1 for the XGBoost test predictions.
for label, scorer in (
    ('Test Accuracy score: ', accuracy_score),
    ('Precision score: ', metrics.precision_score),
    ('Recall score: ', metrics.recall_score),
    ('Test F1 score: ', f1_score),
):
    print(label, scorer(y_test, y_pred_xgb))

### XGB with Gridsearch

In [8]:
# Base estimator whose hyper-parameters the grid search will vary.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic')

# Search space: 3 * 5 * 5 * 3 * 3 = 675 parameter combinations.
param_dist = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.1, 0.07, 0.05, 0.03, 0.01],
    max_depth=[3, 4, 5, 6, 7],
    colsample_bytree=[0.5, 0.45, 0.4],
    min_child_weight=[1, 2, 3],
)

In [9]:
# Exhaustive 5-fold cross-validated search over `param_dist`, ranking candidates by F1
# and running fits in parallel on all cores.
# The former `iid=False` argument is intentionally omitted: `iid` was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError. Averaging
# scores across folds (what iid=False requested) is the only behaviour in current releases.
gsearch1 = GridSearchCV(
    estimator=clf_xgb,
    param_grid=param_dist,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the training split. This is expensive: 675 candidates x 5 folds
# = 3375 model fits.
gsearch1.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 675 candidates, totalling 3375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 10.6min
