Packages

In [70]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.model_selection import GridSearchCV

In [19]:
os.chdir("/Users/nielskreuk/Dropbox/DataScience/Exercises/NYCDSA/CapstoneProject/LendingClub")

In [42]:
rand_seed = 42

Reading the data

In [118]:
applications = pd.read_csv("./data/cleaned_applications.csv")

In [119]:
applications.drop(["State", "Application.Date", "Amount.Requested"], inplace = True, axis = 1)

Label encoding

In [120]:
enc = LabelEncoder()
result = {}
for col in applications.columns:
    if (applications[col].dtype == "int" or applications[col].dtype == "float"):
        result[col] = applications[col]
    else:
        result[col] = pd.Series(enc.fit_transform(applications[col]))
features = pd.DataFrame(result)

Splitting a train+test subset and features vs. target

In [121]:
target = features.Status
features.drop("Status", inplace = True, axis = 1)
train_features, test_features, train_target, test_target = train_test_split(features, target, test_size = 0.1)

Tree model

In [109]:
tree_model = tree.DecisionTreeClassifier(min_samples_leaf = 30000)

In [87]:
np.random.seed(rand_seed)
tree_model.fit(train_features, train_target)

DecisionTreeClassifier(min_samples_leaf=30000)

In [88]:
tree_model.score(train_features, train_target)

0.9943810210643259

In [89]:
tree_model.score(test_features, test_target)

0.9944215538533708

In [51]:
print(tree_model.feature_importances_)

[0.01471673 0.00246467 0.95926612 0.02150362 0.00204885]


Grid search for best tree depth

In [122]:
grid_para_tree = [{
    #"min_samples_leaf": range(1, 10),
    "max_depth": range(2,9)
}]
grid_search_tree = GridSearchCV(tree_model, grid_para_tree, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [123]:
np.random.seed(rand_seed)
%time grid_search_tree.fit(train_features, train_target)

CPU times: user 15.3 s, sys: 4.79 s, total: 20.1 s
Wall time: 2min 32s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(min_samples_leaf=30000),
             n_jobs=-1, param_grid=[{'max_depth': range(3, 9)}],
             scoring='accuracy')

In [124]:
grid_search_tree.best_params_

{'max_depth': 4}

In [125]:
grid_search_tree.best_score_

0.9936861857071024

In [126]:
grid_search_tree.score(test_features, test_target)

0.9936899762839136

In [127]:
model = grid_search_tree.best_estimator_

Export tree as visual

In [131]:
tree.export_graphviz(model, out_file = 'tree.dot', feature_names = features.columns, class_names = ["Accepted", "Rejected"], proportion = True, rotate = True, label = "root", rounded = True, leaves_parallel = True) 
!dot -Tpng tree.dot -o tree.png