In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
%matplotlib inline

In [2]:
# Importing DataFrame and removing columns/null-values that created index problems
# (similar to Guided Example DF):

data = pd.read_csv("LoanStats3d.csv", skipinitialspace = True, header = 1, engine = 'python', skipfooter = 2)

In [3]:
# Cleaning:

data.drop(['member_id', 'id', 'url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
            'sub_grade', 'addr_state', 'desc'], 1, inplace=True)

# Giving the 'loan_status' column some values that we can refer to later (and also binarizing them):

data.replace({"loan_status":{"Charged Off": 1,
                              "Current": 1,
                              "Default":  0,
                              "Fully Paid":  1,
                              "In Grace Period":  0,
                              "Late (16-30 days)":  0,
                              "Late (31-120 days)":  0}}, inplace = True)

data.loan_status.astype('int64')
data.drop(data.select_dtypes(include = ['object']).keys(), axis = 1, inplace = True)
data.dropna(how = 'any', axis = 1, inplace = True)

# Making a copy of the df so we can assign one to the target data (with loan status info) and one for the training
# data without the loan status info.

data2 = data.copy()

# Dropping the last two problematic rows:

data2 = data2[:-2]
data = data[:-2]

### Complex Trees:

Starting our complex tree here to compare it to the forest model later.

In [4]:
from sklearn import tree
from sklearn.model_selection import train_test_split

  from collections import Sequence


In [5]:
x = data.drop(columns = ['out_prncp', 'loan_status', 'out_prncp_inv', 'total_pymnt', 
                         'total_pymnt_inv', 'total_rec_prncp','total_rec_int', 
                         'total_rec_late_fee', 'recoveries','collection_recovery_fee', 
                         'last_pymnt_amnt'], axis = 1)

y = data['loan_status']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = .2)

###### Default Tree

In [7]:
# Going with the default tree first (limited max_depth to 10 for runtime considerations):

start_time = datetime.now()
default_tree = tree.DecisionTreeClassifier(criterion = 'gini', 
                                           max_depth = 10, 
                                           max_features = None)

default_tree.fit(X_train, y_train)

def_feat_imp = default_tree.feature_importances_

In [8]:
train_score = default_tree.score(X_train, y_train)
test_score = default_tree.score(X_test, y_test)

In [9]:
end_time = datetime.now()
print("The test score is {}.\n The  Training Score is: {}.\n The runtime is:{}.".format(test_score, train_score, 
                                                                                        (end_time-start_time)))


The test score is 0.9831035752027453.
 The  Training Score is: 0.9847687859555798.
 The runtime is:0:00:06.161371.


In [10]:
from sklearn.metrics import confusion_matrix

Y_pred = default_tree.predict(X_test)

c_matrix = confusion_matrix(y_test, Y_pred)

print(c_matrix)

feature_frame = pd.DataFrame(X_test.columns)
feature_frame['Gini_Importance'] = def_feat_imp
feature_frame

[[    0  1384]
 [   39 82796]]


Unnamed: 0,0,Gini_Importance
0,loan_amnt,0.027164
1,funded_amnt,0.057752
2,funded_amnt_inv,0.033378
3,installment,0.099549
4,annual_inc,0.04785
5,delinq_2yrs,0.019749
6,inq_last_6mths,0.005049
7,open_acc,0.007168
8,pub_rec,0.005302
9,revol_bal,0.01733


Here, the training and test scores are quite close to each other, which is great.  However, given that the accuracy is so high, it looks like there is a fair amount of over-fitting.  Manipulating the parameters a little bit more 

###### First Tree:
> Making a tree with slightly more depth than the default and also specifying the number of leaf samples and leaf nodes.

In [11]:
# Model:
start_time = datetime.now()

tree_1 = tree.DecisionTreeClassifier(criterion = 'entropy', 
                                     max_depth = 16, 
                                     max_features = None,
                                     min_samples_split = 200,
                                     min_samples_leaf = 70,
                                     max_leaf_nodes = 60)

tree_1.fit(X_train, y_train)

Y_pred = tree_1.predict(X_test)

In [12]:
train_score_1 = tree_1.score(X_train, y_train)
test_score_1 = tree_1.score(X_test, y_test)

tree_1_feat_imp = tree_1.feature_importances_

In [13]:
end_time = datetime.now()
print("The test score is {}.\n The  Training Score is: {}.\n The runtime is:{}.".format(test_score_1, train_score_1, 
                                                                                        (end_time-start_time)))
c_matrix_1 = confusion_matrix(y_test, Y_pred)

print(c_matrix_1)

feature_frame['Feature_Importances_1'] = tree_1_feat_imp
feature_frame

The test score is 0.9835666536054809.
 The  Training Score is: 0.9844303804983465.
 The runtime is:0:00:05.279761.
[[    0  1384]
 [    0 82835]]


Unnamed: 0,0,Gini_Importance,Feature_Importances_1
0,loan_amnt,0.027164,0.216238
1,funded_amnt,0.057752,0.0
2,funded_amnt_inv,0.033378,0.06347
3,installment,0.099549,0.095815
4,annual_inc,0.04785,0.016412
5,delinq_2yrs,0.019749,0.007403
6,inq_last_6mths,0.005049,0.0
7,open_acc,0.007168,0.009445
8,pub_rec,0.005302,0.0
9,revol_bal,0.01733,0.030681


The parameters on this tree are a little more specific.  While the accuracy has stayed the same, we can see that the test score has gone up ever so slightly, which is a good step with the accuracy, but we are still over-fitting quite a bit. If we tinker with the hyper-parameters a bit more, we should be able to get a more reliable model. 

###### Second Tree:

In [24]:
# Model:
start_time = datetime.now()

tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', 
                                     max_depth = 3,
                                     max_features = 3,
                                     min_samples_split = 1500,
                                     min_samples_leaf = 100,
                                     max_leaf_nodes = 60)
tree_2.fit(X_train, y_train)

Y_pred = tree_2.predict(X_test)

In [25]:
train_score_2 = tree_2.score(X_train, y_train)
test_score_2 = tree_2.score(X_test, y_test)

tree_2_feat_imp = tree_2.feature_importances_

In [26]:
end_time = datetime.now()
print("The test score is {}.\n The  Training Score is: {}.\n The runtime is:{}.".format(test_score_2, train_score_2, 
                                                                                        (end_time-start_time)))
c_matrix_2 = confusion_matrix(y_test, Y_pred)

print(c_matrix_2)

feature_frame['Feature_Importances_2'] = tree_2_feat_imp
feature_frame

The test score is 0.9835666536054809.
 The  Training Score is: 0.9844303804983465.
 The runtime is:0:00:00.997395.
[[    0  1384]
 [    0 82835]]


Unnamed: 0,0,Gini_Importance,Feature_Importances_1,Feature_Importances_2
0,loan_amnt,0.027164,0.216238,0.064382
1,funded_amnt,0.057752,0.0,0.0
2,funded_amnt_inv,0.033378,0.06347,0.402894
3,installment,0.099549,0.095815,0.030226
4,annual_inc,0.04785,0.016412,0.010113
5,delinq_2yrs,0.019749,0.007403,0.0
6,inq_last_6mths,0.005049,0.0,0.0
7,open_acc,0.007168,0.009445,0.0
8,pub_rec,0.005302,0.0,0.0
9,revol_bal,0.01733,0.030681,0.0


Here, you can still see the class imbalance between 'Fully Paid' and 'Current' loans (especially given that these are the only two out of six categories we see).  The accuracy score has increased a very miniscule amount and the runtime has decreased by about a third.  In order to increase the accuracy, we would have to allow for more room for depth and leaves, which would make it difficult to create a visual.  

### Simple Forests:
> Using the default constraints here, but limiting the depth for runtime purposes. 

In [17]:
from sklearn import ensemble

start_time = datetime.now()

default_forest = ensemble.RandomForestClassifier(max_depth = 10)

default_forest.fit(x_train, y_train)

y_pred = default_forest.predict(x_test)

end_time = datetime.now()

NameError: name 'x_train' is not defined

In [None]:
print("The mean accuracy score is: {}. \n The runtime is:{}".format(default_forest.score(x_train, y_train), (end_time-start_time)))
print(default_forest.score(x_test, y_test))
c_matrix = confusion_matrix(y_test, y_pred)
c_matrix

###### Forest 1
Aiming towards simplicity here...

In [None]:
start_time = datetime.now()

forest_1 = ensemble.RandomForestClassifier(n_estimators = 100, 
                                           criterion = 'entropy',
                                           max_features = 'auto',
                                           max_depth = 7)

forest_1.fit(x_train, y_train)

y_pred = forest_1.predict(x_test)

end_time = datetime.now()

In [None]:
data

In [None]:
print("The mean accuracy score is: {}. \n The runtime is:{}".format(forest_1.score(x_train, y_train), (end_time-start_time)))
print(forest_1.score(x_test, y_test))
c_matrix = confusion_matrix(y_test, y_pred)
c_matrix

In comparison to the default model, the accuracy has only decreased by two percent, yet the runtime has decreased by fifty percent, which is an interesting consideration. In comparison to the best decision tree model above, we are getting only a five percent decrease in accuracy for a significantly more reliable accuracy score at an eighth of the runtime. We do not, however, get the visual which could be useful for certain phases of the analysis/study in question. 

In [None]:
start_time = datetime.now()

forest_2 = ensemble.RandomForestClassifier(n_estimators = 4, 
                                           criterion = 'entropy',
                                           max_features = 3,
                                           max_depth = 3,
                                           min_samples_split = 2000,
                                           min_samples_leaf = 500,
                                           max_leaf_nodes = 20)

forest_2.fit(x, y)

end_time = datetime.now()

In [None]:
print("The mean accuracy score is: {}. \n The runtime is:{}".format(forest_2.score(x, y), (end_time-start_time)))

Here, our accuracy has decreased only minimally and the runtime has decreased by a fourth.  You can see here that the model has reached a point where it starts to plateau in terms of changing parameters and accuracy.  The most significant change was in runtime.  Thus, for half the number of estimatores, half the leaves, you get the same accuracy and one fourth of the runtime.  In comparison to the tree models, where the reliability of your accuracy score is weaker than four trees in this last forest model, this model is quite powerful.