In [37]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, export_graphviz
from sklearn.metrics import recall_score, confusion_matrix
import graphviz
import matplotlib.pyplot as plt
from sklearn import tree

### Loading the data

In [5]:
X_train = pd.read_csv('../data/X_train.csv', index_col=['SK_ID_CURR'])
y_train = pd.read_csv('../data/y_train.csv').values.ravel()

X_test = pd.read_csv('../data/X_test.csv', index_col=['SK_ID_CURR'])
y_test = pd.read_csv('../data/y_test.csv').values.ravel()

In [6]:
sum_train = 0
sum_test = 0

for i in range(0, len(y_train)):
    sum_train += y_train[i]

for i in range(0, len(y_test)):
    sum_test += y_test[i]

print('Positive examples in training set: ', sum_train, '/', len(y_train))
print('Positive examples in testing set: ', sum_test, '/', len(y_test))

Positive examples in training set:  17214 / 214344
Positive examples in testing set:  7541 / 91863


The positive class (1) indicates a client that has payment difficulties, that is, he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample.
The negative class (0) indicates all other cases.

### Training with all features

In [7]:
dt = DecisionTreeClassifier(random_state=0)
dt = dt.fit(X_train, y_train)

### Some information about the model trained

In [8]:
print('Tree depth: ', dt.get_depth())
print('Number of leaves: ', dt.get_n_leaves())

Tree depth:  57
Number of leaves:  20544


In [9]:
# Text representation of the tree
text_representation = export_text(dt)
print(text_representation)

|--- feature_10 <= 0.65
|   |--- feature_1 <= 0.25
|   |   |--- feature_53 <= 0.50
|   |   |   |--- feature_11 <= 0.04
|   |   |   |   |--- feature_8 <= 0.61
|   |   |   |   |   |--- feature_7 <= 0.44
|   |   |   |   |   |   |--- feature_121 <= 0.50
|   |   |   |   |   |   |   |--- feature_6 <= 0.01
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- feature_6 >  0.01
|   |   |   |   |   |   |   |   |--- feature_5 <= 0.06
|   |   |   |   |   |   |   |   |   |--- feature_24 <= -0.56
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |--- feature_24 >  -0.56
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- feature_5 >  0.06
|   |   |   |   |   |   |   |   |   |--- feature_12 <= 0.82
|   |   |   |   |   |   |   |   |   |   |--- feature_135 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 10
|   |   |   |   |   |   |   |   |   |   |--- feature_135 > 

In [40]:
# Graphic representation of the tree

# não rodou
# dot_data = export_graphviz(dt, out_file=None, max_depth=7,
#                                 feature_names=list(X_train.columns), 
#                                 class_names=['autoriza empréstimo', 'nega empréstimo'],
#                                 filled=True)
# graph = graphviz.Source(dot_data, format="png") 
# graph.render("decision_tree_depth7")

# demora mt KKKK
# fig = plt.figure(figsize=(25, 20))
# _ = tree.plot_tree(dt, 
#                    feature_names=X_train.columns,  
#                    class_names=['autoriza empréstimo', 'nega empréstimo'],
#                    filled=True)


### Accuracy

In [16]:
dt.score(X_test, y_test)

0.8440612651448353

### Confusion matrix

In [12]:
tn, fp, fn, tp = confusion_matrix(y_test, dt.predict(X_test)).ravel()
print('True positives:  ', tp)
print('False negatives: ', fn)
print('True negatives:  ', tn)
print('False positives: ', fp)

True positives:   985
False negatives:  6556
True negatives:   76553
False positives:  7769


### Recall

In [18]:
recall_score(y_test, dt.predict(X_test), average=None)

array([0.90786509, 0.13061928])

Training with categorical features only

In [21]:
categorical_columns = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
                       'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE',
                       'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
                       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
                       'LIVE_CITY_NOT_WORK_CITY']

In [22]:
X_train_categorical = X_train.loc[:, categorical_columns]
X_test_categorical = X_test.loc[:, categorical_columns]

In [23]:
dt_categorical = DecisionTreeClassifier(random_state=0)
dt_categorical = dt_categorical.fit(X_train_categorical, y_train)

Some information about the model trained

In [24]:
print('Tree depth: ', dt_categorical.get_depth())
print('Number of leaves: ', dt_categorical.get_n_leaves())

Tree depth:  15
Number of leaves:  1189


In [25]:
# Text representation of the tree
text_representation = export_text(dt_categorical)
print(text_representation)

|--- feature_1 <= 0.25
|   |--- feature_2 <= 0.50
|   |   |--- feature_14 <= 0.50
|   |   |   |--- feature_5 <= 0.50
|   |   |   |   |--- feature_13 <= 0.50
|   |   |   |   |   |--- feature_9 <= 0.50
|   |   |   |   |   |   |--- feature_8 <= 0.50
|   |   |   |   |   |   |   |--- feature_10 <= 0.50
|   |   |   |   |   |   |   |   |--- feature_7 <= 0.50
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |--- feature_7 >  0.50
|   |   |   |   |   |   |   |   |   |--- feature_3 <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- feature_0 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |   |--- feature_0 >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |--- feature_3 >  0.50
|   |   |   |   |   |   |   |   |   |   |--- feature_0 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |   |--- feature_0 >  0.

In [26]:
# Graphic representation of the tree
dot_data_categorical = export_graphviz(dt_categorical, out_file=None, max_depth=7,
                                feature_names=list(X_train_categorical.columns), 
                                class_names=['autoriza empréstimo', 'nega empréstimo'],
                                filled=True)
graph = graphviz.Source(dot_data_categorical, format="png") 
graph.render("decision_tree_categorical_depth7")

ExecutableNotFound: failed to execute ['dot', '-Tpng', '-O', 'decision_tree_categorical_depth7'], make sure the Graphviz executables are on your systems' PATH

### Accuracy

In [28]:
dt_categorical.score(X_test_categorical, y_test)

0.9177035367884785

### Confusion matrix

In [29]:
tn, fp, fn, tp = confusion_matrix(y_test, dt_categorical.predict(X_test_categorical)).ravel()
print('True positives:  ', tp)
print('False negatives: ', fn)
print('True negatives:  ', tn)
print('False positives: ', fp)

True positives:   2
False negatives:  7539
True negatives:   84301
False positives:  21


### Recall

In [31]:
recall_score(y_test, dt_categorical.predict(X_test_categorical), average=None)

array([9.99750955e-01, 2.65216815e-04])