## Preprocessed Data for Machine Learning (also see separate Data Cleaning file)

In [1]:
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the csv file into a pandas DataFrame

df = pd.read_csv('../Resources/cleaned_credit_risk.csv')
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,2,5.0,1,1,1000,11.14,0,0.1,1,2
1,25,9600,0,1.0,3,2,5500,12.87,1,0.57,1,3
2,23,65500,1,4.0,3,2,35000,15.23,1,0.53,1,2
3,24,54400,1,8.0,3,2,35000,14.27,1,0.55,0,4
4,21,9900,2,2.0,5,0,2500,7.14,1,0.25,1,2


In [3]:
# Define y-axis by setting target column

target = df["loan_status"]
target_names = ["non default", "default"]

In [4]:
# Define x-axis by removing target column and keeping remaining columns as features

data = df.drop("loan_status", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,2,5.0,1,1,1000,11.14,0.1,1,2
1,25,9600,0,1.0,3,2,5500,12.87,0.57,1,3
2,23,65500,1,4.0,3,2,35000,15.23,0.53,1,2
3,24,54400,1,8.0,3,2,35000,14.27,0.55,0,4
4,21,9900,2,2.0,5,0,2500,7.14,0.25,1,2


In [5]:
print(data.shape, target.shape)

(28632, 11) (28632,)


# Split data into training and testing data

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

X_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
7029,24,37200,1,8.0,1,0,7200,7.9,0.19,1,2
9445,23,34000,1,7.0,1,1,10000,10.36,0.29,1,4
2930,24,36000,0,3.0,4,1,4750,11.83,0.13,1,3
28116,36,38568,1,7.0,1,4,2400,18.39,0.06,0,13
23372,28,28896,1,6.0,4,2,15000,13.48,0.52,0,8


In [7]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(21474, 11) (7158, 11) (21474,) (7158,)


## Scaled or Normalized our data and used StandardScaler

In [8]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)

In [9]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fit the Model to the scaled training data and made predictions using the scaled test data

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

0.9328024587873708

In [11]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.2155673464384015, 'loan_percent_income'),
 (0.14459284587577004, 'person_income'),
 (0.12454377597201682, 'loan_int_rate'),
 (0.11381915703489748, 'loan_grade'),
 (0.0976741724127537, 'person_home_ownership'),
 (0.07468120333825032, 'loan_intent'),
 (0.07407965232742038, 'loan_amnt'),
 (0.06289985957956228, 'person_emp_length'),
 (0.04449285822541118, 'person_age'),
 (0.036224951360059626, 'cb_person_cred_hist_length'),
 (0.011424177435456715, 'cb_person_default_on_file')]

## Hypertuned Parameters with Randomized Search

In [14]:
# from sklearn.model_selection import RandomizedSearchCV

# # number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# # number of features at every split
# max_features = ['auto', 'sqrt']

# # max depth
# max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
# max_depth.append(None)

# # create random grid
# random_grid = {
#  'n_estimators': n_estimators,
#  'max_features': max_features,
#  'max_depth': max_depth
#  }

# # Random search of parameters
# rfc_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# # Fit the model
# rfc_random.fit(X_train_scaled, y_train)

# # print results
# print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 23.5min finished


{'n_estimators': 800, 'max_features': 'auto', 'max_depth': 380}


In [16]:
# List the best score
print(rfc_random.best_score_)

0.9273540094998602


## Hypertuned Parameters with Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV

# n_estimators = [100, 300, 500, 800, 1200]
# max_depth = [5, 8, 15, 25, 30]
# min_samples_split = [2, 5, 10, 15, 100]
# min_samples_leaf = [1, 2, 5, 10]
# hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,
#               min_samples_split = min_samples_split,
#              min_samples_leaf = min_samples_leaf)
# gridF = GridSearchCV(rf, hyperF, cv = 3, verbose = 1,
#                       n_jobs = -1)
# bestF = gridF.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 41.0min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 52.5min finished


In [17]:
# List the best parameters for this dataset
print(gridF.best_params_)

{'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1200}


In [20]:
# List the best score
print(gridF.best_score_)

0.9269348980162057


## Hypertuned Parameters with Grid Search

In [18]:
predictions = rf.predict(X_test_scaled)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 1 0 0 0 0 0 0 0]
First 10 Actual labels: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [19]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,1,0
3,0,0
4,0,0
...,...,...
7153,1,1
7154,0,0
7155,1,1
7156,0,0


In [20]:
# Calculate classification report

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, predictions,
                            target_names=["non default", "default"]))

              precision    recall  f1-score   support

 non default       0.93      0.99      0.96      5608
     default       0.96      0.72      0.82      1550

    accuracy                           0.93      7158
   macro avg       0.94      0.86      0.89      7158
weighted avg       0.93      0.93      0.93      7158



## Save the Model

In [22]:
# Save to file in the current working directory

import pickle

with open("Best_Model.pkl", 'wb') as file:
    pickle.dump(rf, file)