In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
%matplotlib inline

In [2]:
# Importing DataFrame and removing columns/null-values that created index problems
# (similar to Guided Example DF):

data = pd.read_csv("LoanStats3d.csv", skipinitialspace = True, header = 1, engine = 'python', skipfooter = 2)

In [3]:
# Cleaning:

data.drop(['member_id', 'id', 'url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
            'sub_grade', 'addr_state', 'desc'], 1, inplace=True)

# Giving the 'loan_status' column some values that we can refer to later (and also binarizing them):

data.replace({"loan_status":{"Charged Off": 1,
                              "Current": 1,
                              "Default":  0,
                              "Fully Paid":  1,
                              "In Grace Period":  0,
                              "Late (16-30 days)":  0,
                              "Late (31-120 days)":  0}}, inplace = True)

data.loan_status.astype('int64')
data.drop(data.select_dtypes(include = ['object']).keys(), axis = 1, inplace = True)
data.dropna(how = 'any', axis = 1, inplace = True)

# Making a copy of the df so we can assign one to the target data (with loan status info) and one for the training
# data without the loan status info.

data2 = data.copy()

# Dropping the last two problematic rows:

data2 = data2[:-2]
data = data[:-2]

### Complex Trees:

Starting our complex tree here to compare it to the forest model later.

In [4]:
from sklearn import tree
from sklearn.model_selection import train_test_split

  from collections import Sequence


In [5]:
x = data.drop(columns = ['out_prncp', 'loan_status', 'out_prncp_inv', 'total_pymnt', 
                         'total_pymnt_inv', 'total_rec_prncp','total_rec_int', 
                         'total_rec_late_fee', 'recoveries','collection_recovery_fee', 
                         'last_pymnt_amnt'], axis = 1)

y = data['loan_status']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = .2)

###### Default Tree

In [7]:
# Going with the default tree first (limited max_depth to 10 for runtime considerations):

start_time = datetime.now()
default_tree = tree.DecisionTreeClassifier(criterion = 'gini', 
                                           max_depth = 10, 
                                           max_features = None)

default_tree.fit(X_train, y_train)

def_feat_imp = default_tree.feature_importances_

In [8]:
train_score = default_tree.score(X_train, y_train)
test_score = default_tree.score(X_test, y_test)

In [9]:
end_time = datetime.now()
print("The test score is {}.\n The  Training Score is: {}.\n The runtime is:{}.".format(test_score, train_score, 
                                                                                        (end_time-start_time)))


The test score is 0.9834954107742908.
 The  Training Score is: 0.984563961599886.
 The runtime is:0:00:05.789800.


In [10]:
from sklearn.metrics import confusion_matrix

Y_pred = default_tree.predict(X_test)

c_matrix = confusion_matrix(y_test, Y_pred)

print(c_matrix)

feature_frame = pd.DataFrame(X_test.columns)
feature_frame['Gini_Importance'] = def_feat_imp
feature_frame

[[    1  1342]
 [   48 82828]]


Unnamed: 0,0,Gini_Importance
0,loan_amnt,0.048347
1,funded_amnt,0.022921
2,funded_amnt_inv,0.03445
3,installment,0.05912
4,annual_inc,0.056137
5,delinq_2yrs,0.023071
6,inq_last_6mths,0.013188
7,open_acc,0.006859
8,pub_rec,0.011442
9,revol_bal,0.018802


Here, the training and test scores are quite close to each other, which is great.  However, given that the accuracy is so high, it looks like there is a fair amount of over-fitting.  Manipulating the parameters a little bit more 

###### First Tree:
> Making a tree with slightly more depth than the default and also specifying the number of leaf samples and leaf nodes.

In [18]:
# Model:
start_time = datetime.now()

tree_1 = tree.DecisionTreeClassifier(criterion = 'entropy', 
                                     max_depth = 16, 
                                     max_features = None)

tree_1.fit(X_train, y_train)

Y_pred = tree_1.predict(X_test)

In [19]:
train_score_1 = tree_1.score(X_train, y_train)
test_score_1 = tree_1.score(X_test, y_test)

tree_1_feat_imp = tree_1.feature_importances_

In [20]:
end_time = datetime.now()
print("The test score is {}.\n The  Training Score is: {}.\n The runtime is:{}.".format(test_score_1, train_score_1, 
                                                                                        (end_time-start_time)))
c_matrix_1 = confusion_matrix(y_test, Y_pred)

print(c_matrix_1)

feature_frame['Feature_Importances_1'] = tree_1_feat_imp
feature_frame

The test score is 0.9788527529417352.
 The  Training Score is: 0.9884289081377607.
 The runtime is:0:00:08.811284.
[[    7  1336]
 [  445 82431]]


Unnamed: 0,0,Gini_Importance,Feature_Importances_1,Feature_Importances_2
0,loan_amnt,0.048347,0.015914,0.0
1,funded_amnt,0.022921,0.012605,0.101242
2,funded_amnt_inv,0.03445,0.041169,0.445313
3,installment,0.05912,0.074312,0.0
4,annual_inc,0.056137,0.04647,0.005736
5,delinq_2yrs,0.023071,0.00901,0.0
6,inq_last_6mths,0.013188,0.012416,0.0
7,open_acc,0.006859,0.010622,0.0
8,pub_rec,0.011442,0.004388,0.076859
9,revol_bal,0.018802,0.052145,0.0


The parameters on this tree are a little more specific.  While the accuracy has stayed the same, we can see that the test score has gone up ever so slightly, which is a good step with the accuracy, but we are still over-fitting quite a bit. If we tinker with the hyper-parameters a bit more, we should be able to get a more reliable model. 

###### Second Tree:

In [21]:
# Model:
start_time = datetime.now()

tree_2 = tree.DecisionTreeClassifier(criterion = 'entropy', 
                                     max_depth = 3,
                                     max_features = 3,
                                     min_samples_split = 1500,
                                     min_samples_leaf = 100,
                                     max_leaf_nodes = 60)
tree_2.fit(X_train, y_train)

Y_pred = tree_2.predict(X_test)

In [22]:
train_score_2 = tree_2.score(X_train, y_train)
test_score_2 = tree_2.score(X_test, y_test)

tree_2_feat_imp = tree_2.feature_importances_

In [23]:
end_time = datetime.now()
print("The test score is {}.\n The  Training Score is: {}.\n The runtime is:{}.".format(test_score_2, train_score_2, 
                                                                                        (end_time-start_time)))
c_matrix_2 = confusion_matrix(y_test, Y_pred)

print(c_matrix_2)

feature_frame['Feature_Importances_2'] = tree_2_feat_imp
feature_frame

The test score is 0.9840534796186133.
 The  Training Score is: 0.9843086732724995.
 The runtime is:0:00:00.846599.
[[    0  1343]
 [    0 82876]]


Unnamed: 0,0,Gini_Importance,Feature_Importances_1,Feature_Importances_2
0,loan_amnt,0.048347,0.015914,0.609922
1,funded_amnt,0.022921,0.012605,0.0
2,funded_amnt_inv,0.03445,0.041169,0.0
3,installment,0.05912,0.074312,0.0
4,annual_inc,0.056137,0.04647,0.087468
5,delinq_2yrs,0.023071,0.00901,0.0
6,inq_last_6mths,0.013188,0.012416,0.0
7,open_acc,0.006859,0.010622,0.0
8,pub_rec,0.011442,0.004388,0.0
9,revol_bal,0.018802,0.052145,0.0


The second tree here has less flexibility in creating nodes and leaves, so the confusion matrix is more 'black and white' in comparison to the first tree where you can see there were a few predictions that landed in all four categories.

### Simple Forests:
> Using the default constraints here - still gets us a lot of overfitting.

In [34]:
from sklearn import ensemble

start_time = datetime.now()

default_forest = ensemble.RandomForestClassifier()

default_forest.fit(X_train, y_train)

Y_pred = default_forest.predict(X_test)

end_time = datetime.now()

In [35]:
default_train_score = default_forest.score(X_train, y_train)
default_test_score = default_forest.score(X_test, y_test)

default_importances = default_forest.feature_importances_

In [36]:
print("The test score is: {}.\n The train score is: {}.\n The runtime is:{}".format(default_test_score, default_train_score,
                                                                                    (end_time-start_time)))
def_forest_matrix = confusion_matrix(y_test, Y_pred)
forest_features = pd.DataFrame(X_test.columns)
forest_features['Default'] = default_importances

print(def_forest_matrix)
forest_features

The test score is: 0.9840178582030183.
 The train score is: 0.9982842249624488.
 The runtime is:0:00:15.315527
[[    0  1343]
 [    3 82873]]


Unnamed: 0,0,Default
0,loan_amnt,0.02748
1,funded_amnt,0.026075
2,funded_amnt_inv,0.029241
3,installment,0.045057
4,annual_inc,0.039751
5,delinq_2yrs,0.009285
6,inq_last_6mths,0.012155
7,open_acc,0.02123
8,pub_rec,0.007155
9,revol_bal,0.046049


###### Forest 1
Aiming towards simplicity here...

In [40]:
start_time = datetime.now()

forest_1 = ensemble.RandomForestClassifier(n_estimators = 100, 
                                           criterion = 'entropy',
                                           max_features = 'auto',
                                           max_depth = 5)

forest_1.fit(X_train, y_train)

Y_pred = forest_1.predict(X_test)

end_time = datetime.now()

In [41]:
forest_1_train_score = forest_1.score(X_train, y_train)
forest_1_test_score = forest_1.score(X_test, y_test)

forest_1_importances = forest_1.feature_importances_

In [44]:
print("The test score is: {}.\n The train score is: {}.\n The runtime is:{}".format(forest_1_test_score, forest_1_train_score,
                                                                                    (end_time-start_time)))
forest_1_matrix = confusion_matrix(y_test, Y_pred)
forest_features['Forest_1'] = forest_1_importances

print(forest_1_matrix)
forest_features

The test score is: 0.9840534796186133.
 The train score is: 0.9843086732724995.
 The runtime is:0:00:08.932302
[[    0  1343]
 [    0 82876]]


Unnamed: 0,0,Default,Forest_1
0,loan_amnt,0.02748,0.09018
1,funded_amnt,0.026075,0.080424
2,funded_amnt_inv,0.029241,0.105707
3,installment,0.045057,0.075385
4,annual_inc,0.039751,0.016569
5,delinq_2yrs,0.009285,0.01624
6,inq_last_6mths,0.012155,0.002278
7,open_acc,0.02123,0.006119
8,pub_rec,0.007155,0.007143
9,revol_bal,0.046049,0.016989


In [43]:
start_time = datetime.now()

forest_2 = ensemble.RandomForestClassifier(n_estimators = 50, 
                                           criterion = 'entropy',
                                           max_features = 3,
                                           max_depth = 3)

forest_2.fit(X_train, y_train)

Y_pred = forest_2.predict(X_test)

end_time = datetime.now()

In [46]:
forest_2_train_score = forest_2.score(X_train, y_train)
forest_2_test_score = forest_2.score(X_test, y_test)

forest_2_importances = forest_2.feature_importances_

In [47]:
print("The test score is: {}.\n The train score is: {}.\n The runtime is:{}".format(forest_2_test_score, forest_2_train_score,
                                                                                    (end_time-start_time)))
forest_2_matrix = confusion_matrix(y_test, Y_pred)
forest_features['Forest_2'] = forest_2_importances

print(forest_2_matrix)
forest_features

The test score is: 0.9840534796186133.
 The train score is: 0.9843086732724995.
 The runtime is:0:00:08.932302
[[    0  1343]
 [    0 82876]]


Unnamed: 0,0,Default,Forest_1,Forest_2
0,loan_amnt,0.02748,0.09018,0.093211
1,funded_amnt,0.026075,0.080424,0.087784
2,funded_amnt_inv,0.029241,0.105707,0.15089
3,installment,0.045057,0.075385,0.046798
4,annual_inc,0.039751,0.016569,0.013422
5,delinq_2yrs,0.009285,0.01624,0.010316
6,inq_last_6mths,0.012155,0.002278,0.002755
7,open_acc,0.02123,0.006119,0.009208
8,pub_rec,0.007155,0.007143,0.010634
9,revol_bal,0.046049,0.016989,0.006129
