## Preprocessed Data for Machine Learning (also see separate Data Cleaning file)

In [2]:
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Load the cleaned credit-risk dataset into a DataFrame and preview the first rows

csv_path = '../Resources/cleaned_credit_risk.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,2,5.0,1,1,1000,11.14,0,0.1,1,2
1,25,9600,0,1.0,3,2,5500,12.87,1,0.57,1,3
2,23,65500,1,4.0,3,2,35000,15.23,1,0.53,1,2
3,24,54400,1,8.0,3,2,35000,14.27,1,0.55,0,4
4,21,9900,2,2.0,5,0,2500,7.14,1,0.25,1,2


In [4]:
# Label column the models will predict, plus display names for its two classes

target_names = ["non default", "default"]
target = df.loc[:, "loan_status"]

In [5]:
# Feature matrix: every column except the label

data = df.drop(columns=["loan_status"])
feature_names = data.columns
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,2,5.0,1,1,1000,11.14,0.1,1,2
1,25,9600,0,1.0,3,2,5500,12.87,0.57,1,3
2,23,65500,1,4.0,3,2,35000,15.23,0.53,1,2
3,24,54400,1,8.0,3,2,35000,14.27,0.55,0,4
4,21,9900,2,2.0,5,0,2500,7.14,0.25,1,2


In [6]:
# Sanity check: the feature matrix and the target should have the same number of rows
print(data.shape, target.shape)

(28632, 11) (28632,)


## Split the data into training and testing sets

In [7]:
from sklearn.model_selection import train_test_split

# Seeded hold-out split, stratified on the label so both partitions keep the
# same class proportions as the full dataset
split_parts = train_test_split(data, target, stratify=target, random_state=42)
X_train, X_test, y_train, y_test = split_parts

X_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
7029,24,37200,1,8.0,1,0,7200,7.9,0.19,1,2
9445,23,34000,1,7.0,1,1,10000,10.36,0.29,1,4
2930,24,36000,0,3.0,4,1,4750,11.83,0.13,1,3
28116,36,38568,1,7.0,1,4,2400,18.39,0.06,0,13
23372,28,28896,1,6.0,4,2,15000,13.48,0.52,0,8


In [8]:
# Confirm the row counts of the train/test partitions for both features and labels
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(21474, 11) (7158, 11) (21474,) (7158,)


## Scale the features with StandardScaler

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, so no test-set statistics
# are used when scaling
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X_train)

In [10]:
# Apply the scaler fitted on the training split to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Fit the model to the scaled training data and score it on the scaled test data

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest. A fixed random_state makes the fit repeatable, so the
# accuracy below and the feature importances in the next cell do not change
# from run to run (same seed as the tuned model further down).
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train_scaled, y_train)

# Mean accuracy on the held-out, scaled test split
rf.score(X_test_scaled, y_test)

0.9326627549594859

In [12]:
# Pair each feature with its importance in the baseline forest and rank
# them from most to least important
importance_pairs = list(zip(rf.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.21268524562417015, 'loan_percent_income'),
 (0.1478730782653114, 'person_income'),
 (0.1228958141481172, 'loan_grade'),
 (0.11503108473325932, 'loan_int_rate'),
 (0.09861592476448323, 'person_home_ownership'),
 (0.07527067066864017, 'loan_intent'),
 (0.07286600879611553, 'loan_amnt'),
 (0.06420300604657726, 'person_emp_length'),
 (0.04432776343131791, 'person_age'),
 (0.03574840456977874, 'cb_person_cred_hist_length'),
 (0.010482998952229029, 'cb_person_default_on_file')]

In [13]:
# Print the estimator's built-in documentation (constructor signature and
# parameter descriptions) as a reference for choosing the search grid.
# NOTE(review): exploratory aid with very long output -- consider removing
# before sharing the notebook.
help(rf)

Help on RandomForestClassifier in module sklearn.ensemble.forest object:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is always the same as the original
 |  input sample size but the samples are drawn with replacement if
 |  `bootstrap=True` (default).
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimator

In [14]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over 5 * 5 * 5 * 4 = 500 hyperparameter combinations,
# each evaluated with 3-fold cross-validation (1,500 fits) -- expect a long run.
# The search has to be executed: the following cells read
# `gridF.best_params_` and `gridF.best_score_`, which do not exist otherwise.
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]
hyperF = dict(n_estimators=n_estimators, max_depth=max_depth,
              min_samples_split=min_samples_split,
              min_samples_leaf=min_samples_leaf)

# Seeded base estimator (same seed as the final model) so the search is
# repeatable; n_jobs=-1 runs the fits on all available cores.
gridF = GridSearchCV(RandomForestClassifier(random_state=42), hyperF,
                     cv=3, verbose=1, n_jobs=-1)
bestF = gridF.fit(X_train_scaled, y_train)

In [17]:
# List the best parameters for this dataset
# (the hyperparameter combination selected by the grid search).
# NOTE(review): requires `gridF` to have been fitted in the current kernel.
print(gridF.best_params_)

{'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1200}


In [20]:
# List the best score
# (the cross-validated score achieved by the selected combination).
# NOTE(review): requires `gridF` to have been fitted in the current kernel.
print(gridF.best_score_)

0.9269348980162057


In [16]:
# Rebuild the forest with the hyperparameters reported by the grid search,
# train it on the scaled training split and predict the scaled test split
tuned_params = {
    "n_estimators": 1200,
    "max_depth": 25,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 42,
}
forestOpt = RandomForestClassifier(**tuned_params)
modelOpt = forestOpt.fit(X_train_scaled, y_train)
y_pred = modelOpt.predict(X_test_scaled)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [15]:
import pickle

# Save the fitted, tuned model to a file in the current working directory.
# The estimator instance (`modelOpt`) is what must be serialized: pickling the
# RandomForestClassifier class itself stores only a reference to the class,
# with none of the learned trees.
# NOTE: modelOpt was trained on StandardScaler-transformed features, so inputs
# must be transformed the same way (see X_scaler) before calling predict on
# the reloaded model.
with open("Final_Model.pkl", 'wb') as file:
    pickle.dump(modelOpt, file)