# Setup inicial

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

%matplotlib inline
from matplotlib import pyplot as plt

import seaborn as sns

from IPython.display import HTML, display
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import category_encoders as ce

from src.creditapp.data.raw_data_access import get_application_record

# Loading data

In [2]:
# Fetch the raw application records and discard the `id` column, which is not
# used as a model input. `drop` returns a new frame, so the object handed back
# by the loader is never mutated in place.
df_clients = get_application_record().drop(columns=['id'])

print('data loading was successfully...')

data loading was successfully...


<img src="https://av.sc.com/in/content/images/in-credit-card-banner-1600x490px.jpg" />

# Experiments

**Fecha de creación: 13/05/2020**

**Autores: Santiago Porchietto, Agustín Trulli**

El objetivo del presente cuaderno es la realización de múltiples experimentos involucrando modelos de Machine Learning con el fin de determinar cuál es el que mejor funciona para esta problemática en particular.

Como se menciona en el análisis anterior, se busca predecir si una persona incurrirá en incumplimiento de sus deudas contraídas mediante tarjeta de crédito. Para esto creamos la columna target (y).

Todas las features que quedaron en el dataset (menos el id) posterior al preprocessing serán utilizadas como entradas (X).

<u>Métricas a utilizar</u>:

* Precision: Para poder determinar qué porcentaje de las veces que dijimos positivo acertamos la predicción. Por sí sola es insuficiente (por ejemplo, si encontramos solo 2 positivos y ambos realmente lo son, obtenemos 100% de precision aunque el dataset posea 1000 positivos).

* Recall: Para complementar precision, nos va a decir el porcentaje de positivos que encontramos sobre el total de positivos reales.

* F1-Score (media armónica de precision y recall) y AUC-ROC: Como métricas de resumen que complementan las anteriores.

Con nuestro ejemplo en particular, precision es la métrica que más deberíamos mirar, ya que, en función de las políticas que tome el banco contra los deudores, deberíamos estar bastante seguros de que una persona lo es antes de etiquetarla como tal.

<u>Modelos a evaluar</u>:

* Decision Tree

* Random Forest

* Regresión Logística

* K Nearest Neighbors

* Gradient Boosted Trees

## Feature Engineering 

**TODO:** preguntar por los nulls de los trabajos (`occupation_type`, variable categórica).

### Months employed range

In [3]:
# Interval notation used in the labels below: a parenthesis excludes the
# endpoint, a square bracket includes it.

# Bucket employment length into 6-month bins covering [0, 204].
LAST_BIN_EDGE = 204
employment_bins = list(range(0, LAST_BIN_EDGE + 1, 6))
months_binned = pd.cut(df_clients['months_employed'], employment_bins, include_lowest=True)

# Use the interval text as the category label, keeping rows that fell in no
# bin (missing or out-of-range values) as real NaN instead of the string 'nan'.
range_labels = months_binned.astype(str).where(months_binned.notna())

# With include_lowest=True pandas nudges the first edge down to -0.001 so that
# 0 falls inside the bin; relabel that bin with its intended closed interval.
range_labels = range_labels.mask(range_labels == '(-0.001, 6.0]', '[0.0, 6.0]')

# Everything above the last edge shares one open-ended bucket. Comparing with
# `> LAST_BIN_EDGE` (rather than `>= 205`) also catches non-integer values
# between 204 and 205, which would otherwise be left unlabelled.
range_labels = range_labels.mask(df_clients['months_employed'] > LAST_BIN_EDGE, '(204.0, 524.0]')

df_clients['months_employed_range'] = range_labels

### Has Childs 

In [4]:
# Binary flag: 0 when the client reports zero children, 1 in every other case.
df_clients['has_childs'] = np.where(df_clients['cnt_children'] == 0, 0, 1).astype('int64')

### months_employed

In [5]:
# Assumption: clients with a missing value never worked, or were not working
# during this period. This should be confirmed with the client/bank, but it is
# a reasonable default.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected Series: the chained in-place form
# operates on an intermediate object and is not guaranteed to update the frame.
df_clients['months_employed'] = df_clients['months_employed'].fillna(0)

## Generating test, train & validation sets 

In [13]:
# Hold out 30% of the rows, then cut that hold-out in half to obtain the
# validation and test frames.
train, not_train = train_test_split(df_clients, random_state=1, test_size=0.3)
validation, test = train_test_split(not_train, random_state=1, test_size=0.5)

# Training features and labels.
X_train = train.drop(columns='target')
y_train = train['target']

# No X/y pairs are built for `test` and `validation`: evaluate_model reads
# those frames directly and takes the labels from their `target` column.

print('data generating was successfully...')

data generating was successfully...


## Mapper

In [14]:
# (column, transformer class) pairs, listed in the feature order the mapper is
# built with. A fresh transformer instance is created for every column.
_column_transformers = [
    ('code_gender', ce.OneHotEncoder),
    ('flag_own_car', ce.OneHotEncoder),
    ('flag_own_realty', ce.OneHotEncoder),
    ('amt_income_total', StandardScaler),
    ('name_income_type', ce.WOEEncoder),
    ('name_education_type', ce.WOEEncoder),
    ('name_family_status', ce.WOEEncoder),
    ('name_housing_type', ce.WOEEncoder),
    ('months_employed', StandardScaler),
    ('occupation_type', ce.WOEEncoder),
    ('months_employed_range', ce.WOEEncoder),
]

mapper = DataFrameMapper([
    ([column], [transformer_cls()])
    for column, transformer_cls in _column_transformers
])

## Decision Tree

In [15]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.metrics import *

In [40]:
# Depth-limited decision tree with balanced class weights, fed by the feature
# mapper followed by iterative imputation.
tree = DecisionTreeClassifier(max_depth=10, class_weight="balanced", random_state=1)

tree_steps = [
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', tree),
]
pipe = Pipeline(steps=tree_steps)
pipe.fit(X_train, y_train)

# evaluate_model is defined in a cell further down the notebook; that cell
# must have been run before this call.
evaluate_model(pipe, title='Decision Tree')

'Decision Tree'

Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.545784,0.137555,0.856881,0.237056
test,0.515152,0.106207,0.65812,0.182898


In [17]:
def evaluate_model(model, set_names=('train', 'test'), title=''):
    """Display accuracy, precision, recall and F1 of ``model`` on named data sets.

    Parameters
    ----------
    model : object
        Fitted estimator or pipeline exposing ``predict``.
    set_names : iterable of str
        Names of the frames to evaluate. Each must be 'train', 'test' or
        'validation'; the frame is looked up among the notebook's global
        variables and must contain a ``target`` column.
    title : str, optional
        Displayed before the metrics table when non-empty.

    Returns
    -------
    None
        The metrics table (one row per data set) is rendered with ``display``.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having already bound the `metrics` name.
    from sklearn import metrics

    # Materialise once: the names are iterated and then reused as the index.
    set_names = list(set_names)

    if title:
        display(title)
    final_metrics = defaultdict(list)

    for set_name in set_names:
        assert set_name in ('train', 'test', 'validation'), (
            "unknown data set %r; expected 'train', 'test' or 'validation'" % (set_name,)
        )
        # The evaluation frames live as notebook-level variables with these names.
        set_data = globals()[set_name]

        y = set_data.target
        # Predict from the feature columns only, so the label can never reach
        # the model even if a pipeline passes unselected columns through.
        y_pred = model.predict(set_data.drop(columns='target'))
        final_metrics['Accuracy'].append(metrics.accuracy_score(y, y_pred))
        final_metrics['Precision'].append(metrics.precision_score(y, y_pred))
        final_metrics['Recall'].append(metrics.recall_score(y, y_pred))
        final_metrics['F1'].append(metrics.f1_score(y, y_pred))

    display(pd.DataFrame(final_metrics, index=set_names))

In [18]:
from sklearn import metrics

In [36]:
import graphviz  # pip install graphviz
from sklearn.tree import export_graphviz

def graph_tree(tree, col_names):
    """Build a Graphviz rendering of a fitted decision tree.

    Parameters
    ----------
    tree : fitted decision tree estimator accepted by ``export_graphviz``.
    col_names : sequence of str
        Feature names, in the column order the tree was trained on.

    Returns
    -------
    graphviz.Source
        Graph object built from the exported DOT text. Displaying it needs
        the Graphviz executables to be installed.
    """
    graph_data = export_graphviz(
        tree,
        out_file=None,
        feature_names=col_names,
        # export_graphviz reads class names in ascending order of the class
        # labels. The labels are 0/1 and, per the notebook's introduction, the
        # positive class (1) is the defaulting client, so 'no deudor' goes first.
        # NOTE(review): confirm that target == 1 marks the debtor.
        class_names=['no deudor', 'deudor'],
        filled=True,
        rounded=True,
        special_characters=True,
    )
    return graphviz.Source(graph_data)

<img src="https://i0.pngocean.com/files/972/337/281/tribal-wars-2-video-game-internet-sad-pepe.jpg" />

In [41]:
# NOTE(review): LabelBinarizer is not used anywhere in the visible notebook — confirm before removing.
from sklearn.preprocessing import LabelBinarizer
# Draw the `tree` classifier, naming its features after the mapper's output columns.
# The recorded output shows ExecutableNotFound for `dot`: rendering needs the
# Graphviz executables on the system PATH.
graph_tree(tree, mapper.transformed_names_)

ExecutableNotFound: failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH

<graphviz.files.Source at 0x2dbb9cbe4e0>

In [39]:
# Exploratory: list the attributes available on the mapper object.
# NOTE(review): looks like leftover debugging (the long output adds little) — consider removing.
dir(mapper)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build',
 '_get_col_subset',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_selected_columns',
 '_transform',
 '_unselected_columns',
 'built_default',
 'built_features',
 'default',
 'df_out',
 'features',
 'fit',
 'fit_transform',
 'get_dtype',
 'get_dtypes',
 'get_names',
 'get_params',
 'input_df',
 'set_params',
 'sparse',
 'transform',
 'transformed_names_']

### grid search

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Hyper-parameter grid for the decision tree.
# TODO: the candidate values were picked ad hoc; revisit how to choose them.
parameters = {
    'max_depth': [3, 5, 7, 9, 11, 13],
    'max_features': [2, 3, 4, 5],
}

# No `scoring` is given, so candidates are ranked by the classifier's default
# score. NOTE(review): the notebook's introduction names precision as the key
# metric — consider passing an explicit `scoring`.
clf = GridSearchCV(tree, parameters, refit=True, verbose=1)

cv_pipe = Pipeline([
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', clf),
])

# Fit on the same feature/label pair as every other pipeline in the notebook,
# rather than on `train`, which still carries the `target` column.
cv_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.9s finished


Pipeline(memory=None,
         steps=[('mapper',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[(['code_gender'],
                                            [OneHotEncoder(cols=[0],
                                                           drop_invariant=False,
                                                           handle_missing='value',
                                                           handle_unknown='value',
                                                           return_df=True,
                                                           use_cat_names=False,
                                                           verbose=0)]),
                                           (['flag_own_car'],
                                            [OneHotEncoder(cols=[0],
                                                           drop_invariant=False,
                                                           handle_missin

In [23]:
# Best cross-validated score found by the tree grid search and the parameter combination that achieved it.
clf.best_score_, clf.best_params_

(0.6355442747760399, {'max_depth': 13, 'max_features': 3})

In [24]:

# Report the baseline tree first, then the grid-searched one, for comparison.
for fitted_pipe, report_title in (
    (pipe, 'Decision Tree'),
    (cv_pipe, 'Decision Tree after Cross Validation'),
):
    evaluate_model(fitted_pipe, title=report_title)

'Decision Tree'

Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.503324,0.128052,0.866055,0.223115
test,0.449612,0.085,0.581197,0.14831


'Decision Tree after Cross Validation'

Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.662738,0.187013,0.924771,0.311111
test,0.601128,0.109565,0.538462,0.182081


## Random Forests

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Random forest with library-default hyper-parameters and a fixed seed.
# This single instance is passed to both the Simple Imputer and the Iterative
# Imputer pipelines below, and is the base estimator of their grid searches.
random_forests = RandomForestClassifier(random_state=42)

### Simple Imputer

In [35]:
# Random forest on mapper output, with missing values replaced by the column mean.
rf_si_steps = [
    ('mapper', mapper),
    ('imputer', SimpleImputer(strategy='mean')),
    ('classifier', random_forests),
]
pipe_rf_si = Pipeline(steps=rf_si_steps)
pipe_rf_si.fit(X_train, y_train)

evaluate_model(pipe_rf_si, title='Random Forest Simple Imputer')

'Random Forest'

Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.999093,0.998152,0.990826,0.994475
test,0.915433,0.2,0.008547,0.016393


#### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the random forest (Simple Imputer variant).
parameters_rf_si = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7, 9, 11, 13],
    'max_features': [2, 3, 4, 5],
}

gs_rf_si = GridSearchCV(random_forests, parameters_rf_si, refit=True, verbose=1)

# This is the Simple Imputer experiment, so the pipeline uses the same
# mean-imputation step as `pipe_rf_si`. It previously used IterativeImputer,
# which made it a duplicate of the Iterative Imputer grid search.
pipe_rf_cv_si = Pipeline([
    ('mapper', mapper),
    ('imputer', SimpleImputer(strategy='mean')),
    ('classifier', gs_rf_si),
])

pipe_rf_cv_si.fit(X_train, y_train)

In [None]:
# Best cross-validated score of the Simple Imputer forest search and the parameters that achieved it.
gs_rf_si.best_score_, gs_rf_si.best_params_

In [None]:
# Report the default forest first, then its grid-searched counterpart.
for fitted_pipe, report_title in (
    (pipe_rf_si, 'Random Forests Simple Inputer'),
    (pipe_rf_cv_si, 'Random Forests Simple Inputer after Cross Validation'),
):
    evaluate_model(fitted_pipe, title=report_title)

### Iterative Imputer

In [27]:
# Random forest on mapper output, with missing values filled by iterative imputation.
rf_ii_steps = [
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', random_forests),
]
pipe_rf_ii = Pipeline(steps=rf_ii_steps)
pipe_rf_ii.fit(X_train, y_train)

evaluate_model(pipe_rf_ii, title='Random Forest Iterative Imputer')

'Random Forest'

Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.999093,0.998152,0.990826,0.994475
test,0.915433,0.2,0.008547,0.016393


#### Grid Search

In [28]:
from sklearn.model_selection import GridSearchCV

In [32]:
# Hyper-parameter grid for the random forest (Iterative Imputer variant).
# The recorded run of this search took about four minutes.
parameters_rf_ii = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7, 9, 11, 13],
    max_features=[2, 3, 4, 5],
)

gs_rf_ii = GridSearchCV(random_forests, parameters_rf_ii, refit=True, verbose=1)

rf_cv_ii_steps = [
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', gs_rf_ii),
]
pipe_rf_cv_ii = Pipeline(steps=rf_cv_ii_steps)

pipe_rf_cv_ii.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  4.0min finished


Pipeline(memory=None,
         steps=[('mapper',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[(['code_gender'],
                                            [OneHotEncoder(cols=[0],
                                                           drop_invariant=False,
                                                           handle_missing='value',
                                                           handle_unknown='value',
                                                           return_df=True,
                                                           use_cat_names=False,
                                                           verbose=0)]),
                                           (['flag_own_car'],
                                            [OneHotEncoder(cols=[0],
                                                           drop_invariant=False,
                                                           handle_missin

In [33]:
# Best cross-validated score of the Iterative Imputer forest search and the parameters that achieved it.
gs_rf_ii.best_score_, gs_rf_ii.best_params_

(0.917799996803018, {'max_depth': 13, 'max_features': 2, 'n_estimators': 200})

In [34]:
# Report the default forest first, then its grid-searched counterpart.
for fitted_pipe, report_title in (
    (pipe_rf_ii, 'Random Forests Iterative Imputer'),
    (pipe_rf_cv_ii, 'Random Forests Iterative Imputer after Cross Validation'),
):
    evaluate_model(fitted_pipe, title=report_title)

'Random Forests'

Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.999093,0.998152,0.990826,0.994475
test,0.915433,0.2,0.008547,0.016393


'Random Forests after Cross Validation'

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.930946,1.0,0.161468,0.278041
test,0.917548,0.0,0.0,0.0


## Regresión Logística

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
# Logistic regression with library-default settings and a fixed seed.
# No class weighting is configured here, unlike the decision tree above.
logistic_regression = LogisticRegression(random_state=1)

### Simple Imputer

In [45]:
# Logistic regression on mapper output, with missing values replaced by the column mean.
lg_si_steps = [
    ('mapper', mapper),
    ('imputer', SimpleImputer(strategy='mean')),
    ('classifier', logistic_regression),
]
pipe_lg_si = Pipeline(steps=lg_si_steps)
pipe_lg_si.fit(X_train, y_train)

evaluate_model(pipe_lg_si)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.917649,0.0,0.0,0.0
test,0.917548,0.0,0.0,0.0


### Iterative Imputer

In [None]:
# A new logistic regression instance for the iterative-imputation variant;
# this rebinds the `logistic_regression` name used by the previous section.
logistic_regression = LogisticRegression(random_state=1)

lg_ii_steps = [
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', logistic_regression),
]
pipe_lg_ii = Pipeline(steps=lg_ii_steps)
pipe_lg_ii.fit(X_train, y_train)

evaluate_model(pipe_lg_ii)

## K nearest neighbors

In [46]:
from sklearn.neighbors import KNeighborsClassifier

In [47]:
# k-nearest-neighbours classifier that uses 10 neighbours.
# This instance is shared by both KNN pipelines below.
knn = KNeighborsClassifier(n_neighbors=10)

### Simple Imputer

In [48]:
# KNN on mapper output, with missing values replaced by the column mean.
knn_si_steps = [
    ('mapper', mapper),
    ('imputer', SimpleImputer(strategy='mean')),
    ('classifier', knn),
]
pipe_knn_si = Pipeline(steps=knn_si_steps)
pipe_knn_si.fit(X_train, y_train)

evaluate_model(pipe_knn_si, title='Knn Simple Imputer')

'Knn Simple Imputer'

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.9178,1.0,0.001835,0.003663
test,0.917548,0.0,0.0,0.0


### Iterative Imputer

In [None]:
# KNN on mapper output, with missing values filled by iterative imputation.
knn_ii_steps = [
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', knn),
]
pipe_knn_ii = Pipeline(steps=knn_ii_steps)
pipe_knn_ii.fit(X_train, y_train)

evaluate_model(pipe_knn_ii, title='Knn Iterative Imputer')

## Gradient Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient-boosted trees with library-default settings and a fixed seed,
# on mapper output with iterative imputation.
gradient_boosting = GradientBoostingClassifier(random_state=42)

gb_steps = [
    ('mapper', mapper),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', gradient_boosting),
]
pipe_gb = Pipeline(steps=gb_steps)
pipe_gb.fit(X_train, y_train)

evaluate_model(pipe_gb, title='Gradient Boosting')

In [None]:
# Exploration: fit a weight-of-evidence encoder on the whole training frame to
# inspect the encoded values. The frame passed as X still contains `target`.
encoder = ce.WOEEncoder()
encoder.fit(train, train['target'])

train_WOEEncoder = encoder.transform(train)
train_WOEEncoder.head()

In [None]:
# First three training rows whose occupation_type is missing.
train[train.occupation_type.isnull()].head(3)

Imputa los nulls con el valor más negativo (el menos útil para predecir).

In [None]:
# Encoded row with index label 15577.
# NOTE(review): presumably one of the null-occupation rows listed by the previous cell — confirm.
train_WOEEncoder[train_WOEEncoder.index == 15577]

In [None]:
# Smallest encoded value of occupation_type, to compare against the value given to the null rows.
train_WOEEncoder.occupation_type.min()

In [None]:
# NOTE(review): `train_leaveOneOutEnc` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably it came from a since-removed
# leave-one-out encoding cell — restore that cell or delete this one.
train_leaveOneOutEnc.occupation_type.value_counts()

<img src=https://miro.medium.com/max/578/1*5S_5aAHWe0_knDGZUK3W8w.png width="300" height="200">

In [None]:
# Number of training rows per education level.
train.name_education_type.value_counts()

In [None]:
# Training rows whose education level is 'Academic degree'.
train[train.name_education_type == 'Academic degree']