# Setup inicial

In [61]:
import numpy as np
import pandas as pd
from collections import defaultdict

%matplotlib inline
from matplotlib import pyplot as plt

import seaborn as sns

from IPython.display import HTML, display
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

from src.creditapp.data.raw_data_access import get_application_record

# Loading data

In [62]:
df_clients = get_application_record()

df_clients.drop(['id'], axis=1, inplace=True)

print('data loading was successfully...')

data loading was successfully...


<img src="https://av.sc.com/in/content/images/in-credit-card-banner-1600x490px.jpg" />

# Experiments

**Fecha de creación: 13/05/2020**

**Autores: Santiago Porchietto, Agustín Trulli**

El objetivo del presente cuaderno es la realización de multiples experimentos involucrando modelos de Machine Learning en función de determinar cual es el que mejor funciona para esta problemática en particular.

Como se menciona en el análisis anterior, se busca predecir si una persona incurrirá en incumplimiento de sus deudas contraidas mediante tarjeta de crédito. Para esto creamos la columna target (y).

Todas las features que quedaron en el dataset (menos el id) posterior al preprocessing serán utilizadas como entradas (X).

<u>Metricas a utilizar</u>:

* Precission: Para poder determinar que porcentaje de las veces que dijimos positivo acertamos la predicción. Por si sola es insuficiente (Encontramos 2 positivos que realmente lo son (100% de precission), en un dataset que posee 1000 positivos).

* Recall: Para complementar precission, nos va a decir el porcentaje de positivos que encontramos por sobre los reales.

* F1-Score y AuCRoc: Como métricas que combinan las anteriores.

Con nuestro ejemplo en particular, precission es la métrica que más deberíamos mirar, ya que, en función de las políticas que tome el banco contra los deudores, deberíamos estar bastante seguros de que una persona lo es antes de etiquetarla de ello.

<u>Modelos a evaluar</u>:

* Decission Tree 

* Random Forest

* 3

* 4

## Feature Engineering 

PREGUNTAR POR LOS NULLS DE LOS TRABAJOS (CATEGORICAL)

### Income range

In [63]:
# parentesis no incluye el término
# corchete incluye el término

df_clients.loc[::, 'months_employed_range'] = pd.cut(df_clients.months_employed, 
                                                     list(range(0, 205, 6)), include_lowest='True')

df_clients['months_employed_range'] = df_clients['months_employed_range'].astype(str)
df_clients.loc[df_clients.months_employed_range == 'nan', 'months_employed_range'] = np.nan
df_clients.loc[df_clients.months_employed_range == '(-0.001, 6.0]', 'months_employed_range'] = '[0.0, 6.0]'
df_clients.loc[df_clients.months_employed >= 205, 'months_employed_range'] = '(204.0, 524.0]'

### Has Childs 

In [64]:
# Childs
df_clients.loc[::, 'has_childs'] = 1
df_clients.loc[df_clients.cnt_children == 0, 'has_childs'] = 0

### months_employed

In [65]:
# Bajo la suposición de que estas personas nunca trabajaron o se encuentran este periodo de tiempo sin trabajar.
# Se debería consultar con el "cliente/banco", pero tiene sentido.

df_clients["months_employed"].fillna(0, inplace = True)

## Generating test, train & validation sets 

In [66]:
# Test, train & validation sets
train, not_train = train_test_split(df_clients, test_size=0.3, random_state=1)
validation, test = train_test_split(not_train, test_size=0.5, random_state=1)

# Train inputs & outputs
#y_train = train.target
#X_train = train.drop('target', axis=1)

# Test inputs & outputs
#y_test = test.target
#X_test = test.drop('target', axis=1)

# Validation inputs & outputs
#y_validation = validation.target
#X_validation = validation.drop('target', axis=1)

print('data generating was successfully...')

data generating was successfully...


## Mapper

In [67]:
mapper = DataFrameMapper([
    (['code_gender'], [ce.OneHotEncoder()]),
    (['flag_own_car'], [ce.OneHotEncoder()]),
    (['flag_own_realty'], [ce.OneHotEncoder()]),
    (['amt_income_total'], [StandardScaler()]),
    (['name_income_type'], [ce.WOEEncoder()]),
    (['name_education_type'], [ce.WOEEncoder()]),
    (['name_family_status'], [ce.WOEEncoder()]),
    (['name_housing_type'], [ce.WOEEncoder()]),
    (['months_employed'], [StandardScaler()]),
    (['occupation_type'], [ce.WOEEncoder()]),
    (['months_employed_range'], [ce.WOEEncoder()])
])

## Decission Tree

In [68]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import *

In [76]:
tree = DecisionTreeClassifier(random_state=1, 
                              class_weight="balanced", max_depth=10)

pipe = Pipeline([
    ('mapper', mapper),
    ('imputer', SimpleImputer(strategy='mean')),
    ('classifier', tree)
])

pipe.fit(X_train, y_train)

evaluate_model(pipe, title='Decision Tree')

'Decision Tree'

Unnamed: 0,Accuracy,Precision,Recall,F1
train,0.545784,0.137555,0.856881,0.237056
test,0.515152,0.106207,0.65812,0.182898


In [71]:
def evaluate_model(model, set_names=('train', 'test'), title=''):
    if title:
        display(title)
    final_metrics = defaultdict(list)
    
    for i, set_name in enumerate(set_names):
        assert set_name in ['train', 'test', 'validation']
        set_data = globals()[set_name] # <- hack feo...

        y = set_data.target
        y_pred = model.predict(set_data)
        final_metrics['Accuracy'].append(metrics.accuracy_score(y, y_pred))
        final_metrics['Precision'].append(metrics.precision_score(y, y_pred))
        final_metrics['Recall'].append(metrics.recall_score(y, y_pred))
        final_metrics['F1'].append(metrics.f1_score(y, y_pred))

    display(pd.DataFrame(final_metrics, index=set_names))

In [70]:
from sklearn import metrics

In [36]:
encoder = ce.WOEEncoder()
encoder.fit(train, train.target)

train_WOEEncoder = encoder.transform(train)
train_WOEEncoder.head(5)

Unnamed: 0,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,name_income_type,name_education_type,name_family_status,name_housing_type,months_employed,flag_mobil,flag_work_phone,flag_email,cnt_fam_members,age,flag_phone,occupation_type,target,months_employed_range,has_childs
30511,-0.077672,0.103559,-0.006846,0,81000.0,-0.023125,0.000794,-0.018866,-0.016815,90.0,1,0,0,2.0,30,0,0.206567,0,-0.307942,0
8042,-0.077672,-0.065516,-0.006846,0,112500.0,0.159541,-0.01288,-0.018866,-0.016815,63.0,1,1,0,2.0,29,0,0.206567,0,0.116077,0
5538,0.128934,0.103559,-0.006846,0,202500.0,0.159541,-0.01288,-0.018866,-0.016815,43.0,1,0,1,2.0,32,0,0.072717,0,0.465789,0
28245,0.128934,0.103559,-0.006846,2,135000.0,0.159541,-0.01288,-0.018866,-0.016815,131.0,1,1,0,4.0,40,0,0.072717,0,0.185872,1
32820,-0.077672,0.103559,0.013532,5,112500.0,0.159541,-0.01288,-0.018866,-0.016815,45.0,1,1,0,7.0,33,0,-0.61374,0,0.465789,1


In [49]:
train[train.occupation_type.isnull()].head(3)

Unnamed: 0,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,name_income_type,name_education_type,name_family_status,name_housing_type,months_employed,flag_mobil,flag_work_phone,flag_email,cnt_fam_members,age,flag_phone,occupation_type,target,months_employed_range,has_childs
32820,F,Y,N,5,112500.0,Working,Secondary / secondary special,Married,House / apartment,45.0,1,1,0,7.0,33,0,,0,"(42.0, 48.0]",1
19881,F,N,Y,0,67500.0,Pensioner,Secondary / secondary special,Single / not married,House / apartment,,1,0,0,1.0,51,0,,0,,0
15577,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,House / apartment,7.0,1,0,1,1.0,23,0,,0,"(6.0, 12.0]",0


Inputa nulls con el valor mas negativo (el mas useless para predecir)

In [52]:
train_WOEEncoder[train_WOEEncoder.index == 15577]

Unnamed: 0,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,name_income_type,name_education_type,name_family_status,name_housing_type,months_employed,flag_mobil,flag_work_phone,flag_email,cnt_fam_members,age,flag_phone,occupation_type,target,months_employed_range,has_childs
15577,0.128934,0.103559,-0.006846,0,112500.0,0.159541,-0.01288,0.323385,-0.016815,7.0,1,0,1,1.0,23,0,-0.61374,0,0.377318,0


In [56]:
train_WOEEncoder.occupation_type.min()

-0.6137401712381046

In [42]:
train_leaveOneOutEnc.occupation_type.value_counts()

0.046012    1956
0.087603    1210
0.104135     653
0.098333     600
0.089286     560
0.104513     421
0.103053     262
0.076555     209
0.117347     196
0.149254     134
0.117188     128
0.094737      95
0.049180      61
0.153846      39
0.037037      27
0.083333      24
0.111111      18
0.200000      15
0.100000      10
Name: occupation_type, dtype: int64

<img src=https://miro.medium.com/max/578/1*5S_5aAHWe0_knDGZUK3W8w.png width="300" height="200">

In [38]:
train.name_education_type.value_counts()

Secondary / secondary special    4555
Higher education                 1740
Incomplete higher                 241
Lower secondary                    79
Academic degree                     3
Name: name_education_type, dtype: int64

In [33]:
train[train.name_education_type == 'Academic degree']

Unnamed: 0,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,name_income_type,name_education_type,name_family_status,name_housing_type,months_employed,flag_mobil,flag_work_phone,flag_email,cnt_fam_members,age,flag_phone,occupation_type,target,months_employed_range,has_childs
17326,F,N,Y,1,337500.0,Working,Academic degree,Single / not married,House / apartment,92.0,1,0,0,2.0,34,0,,0,"(90.0, 96.0]",1
21308,F,Y,Y,0,450000.0,Pensioner,Academic degree,Civil marriage,House / apartment,,1,0,0,2.0,59,1,,0,,0
11797,F,Y,Y,1,270000.0,Commercial associate,Academic degree,Married,House / apartment,33.0,1,0,0,3.0,33,0,Sales staff,0,"(30.0, 36.0]",1
