In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, export_graphviz
from sklearn.metrics import recall_score, confusion_matrix
import graphviz

Loading the data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
X_train = pd.read_csv("/content/drive/MyDrive/AM1/data/X_train.csv")
X_test = pd.read_csv("/content/drive/MyDrive/AM1/data/X_test.csv")

y_train = pd.read_csv("/content/drive/MyDrive/AM1/data/y_train.csv", header=None)
y_test = pd.read_csv("/content/drive/MyDrive/AM1/data/y_test.csv", header=None)

y_train = y_train.iloc[:,1]
y_test = y_test.iloc[:,1]

In [4]:
sum_train = 0
sum_test = 0

for i in range(0, len(y_train)):
  sum_train += y_train[i]

for i in range(0, len(y_test)):
  sum_test += y_test[i]

print('Positive examples in training set: ', sum_train, '/', len(y_train))
print('Positive examples in testing set: ', sum_test, '/', len(y_test))

Positive examples in training set:  17214 / 214344
Positive examples in testing set:  7541 / 91863


The positive class (1) indicates a client that has payment difficulties, that is, he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample.
The negative class (0) indicates all other cases.

Training with all features


In [5]:
dt = DecisionTreeClassifier(random_state=0)
dt = dt.fit(X_train, y_train)

Some information about the model trained

In [6]:
print('Tree depth: ', dt.get_depth())
print('Number of leaves: ', dt.get_n_leaves())

Tree depth:  62
Number of leaves:  20190


In [7]:
# Text representation of the tree
text_representation = export_text(dt)
print(text_representation)

|--- feature_11 <= 0.65
|   |--- feature_2 <= 0.25
|   |   |--- feature_54 <= 0.50
|   |   |   |--- feature_12 <= 0.04
|   |   |   |   |--- feature_9 <= 0.61
|   |   |   |   |   |--- feature_8 <= 0.44
|   |   |   |   |   |   |--- feature_122 <= 0.50
|   |   |   |   |   |   |   |--- feature_7 <= 0.01
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- feature_7 >  0.01
|   |   |   |   |   |   |   |   |--- feature_6 <= 0.06
|   |   |   |   |   |   |   |   |   |--- feature_11 <= 0.48
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |--- feature_11 >  0.48
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- feature_6 >  0.06
|   |   |   |   |   |   |   |   |   |--- feature_13 <= 0.82
|   |   |   |   |   |   |   |   |   |   |--- feature_136 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 11
|   |   |   |   |   |   |   |   |   |   |--- feature_136 >  0

In [21]:
# Graphic representation of the tree
dot_data = export_graphviz(dt, out_file=None, max_depth=7,
                                feature_names=list(X_train.columns), 
                                class_names=['autoriza empréstimo', 'nega empréstimo'],
                                filled=True)
graph = graphviz.Source(dot_data, format="png") 
graph.render("decision_tree_depth7")

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.989013 to fit


'decision_tree_depth7.png'

Accuracy


In [9]:
dt.score(X_test, y_test)

0.8448776983116162

Confusion matrix

In [10]:
tn, fp, fn, tp = confusion_matrix(y_test, dt.predict(X_test)).ravel()
print('True positives:  ', tp)
print('False negatives: ', fn)
print('True negatives:  ', tn)
print('False positives: ', fp)

True positives:   967
False negatives:  6574
True negatives:   76646
False positives:  7676


Recall

In [11]:
recall_score(y_test, dt.predict(X_test))

0.12823232992971753

Training with categorical features only

In [12]:
categorical_columns = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE_Children', 'NAME_TYPE_SUITE_Family',
                       'NAME_TYPE_SUITE_Group of people', 'NAME_TYPE_SUITE_Other_A', 'NAME_TYPE_SUITE_Other_B', 'NAME_TYPE_SUITE_Spouse, partner',
                       'NAME_TYPE_SUITE_Unaccompanied', 'NAME_INCOME_TYPE_Businessman',
                       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
                       'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 
                       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 
                       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 
                       'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE']

In [13]:
cols = []

for col_name in categorical_columns:
  for i in range(0, 141):
    if col_name in X_train.columns[i]:
      cols.append(X_train.columns[i])

X_train_categorical = X_train[cols]
X_test_categorical = X_test[cols]

In [14]:
dt_categorical = DecisionTreeClassifier(random_state=0)
dt_categorical = dt_categorical.fit(X_train_categorical, y_train)

Some information about the model trained

In [15]:
print('Tree depth: ', dt_categorical.get_depth())
print('Number of leaves: ', dt_categorical.get_n_leaves())

Tree depth:  75
Number of leaves:  29469


In [16]:
# Text representation of the tree
text_representation = export_text(dt_categorical)
print(text_representation)

|--- feature_19 <= 0.50
|   |--- feature_21 <= 0.50
|   |   |--- feature_15 <= 0.50
|   |   |   |--- feature_64 <= 0.50
|   |   |   |   |--- feature_1 <= 0.50
|   |   |   |   |   |--- feature_2 <= 0.50
|   |   |   |   |   |   |--- feature_26 <= 0.50
|   |   |   |   |   |   |   |--- feature_51 <= 0.50
|   |   |   |   |   |   |   |   |--- feature_4 <= 0.50
|   |   |   |   |   |   |   |   |   |--- feature_6 <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- feature_0 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 47
|   |   |   |   |   |   |   |   |   |   |--- feature_0 >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 11
|   |   |   |   |   |   |   |   |   |--- feature_6 >  0.50
|   |   |   |   |   |   |   |   |   |   |--- feature_50 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |   |--- feature_50 >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncate

In [22]:
# Graphic representation of the tree
dot_data_categorical = export_graphviz(dt_categorical, out_file=None, max_depth=7,
                                feature_names=list(X_train_categorical.columns), 
                                class_names=['autoriza empréstimo', 'nega empréstimo'],
                                filled=True)
graph = graphviz.Source(dot_data_categorical, format="png") 
graph.render("decision_tree_categorical_depth7")

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.887922 to fit


'decision_tree_categorical_depth7.png'

Accuracy

In [18]:
dt_categorical.score(X_test_categorical, y_test)

0.88329360025255

Confusion matrix

In [19]:
tn, fp, fn, tp = confusion_matrix(y_test, dt_categorical.predict(X_test_categorical)).ravel()
print('True positives:  ', tp)
print('False negatives: ', fn)
print('True negatives:  ', tn)
print('False positives: ', fp)

True positives:   426
False negatives:  7115
True negatives:   80716
False positives:  3606


Recall

In [20]:
recall_score(y_test, dt_categorical.predict(X_test_categorical))

0.05649118154090969