In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold,RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report, precision_recall_curve

from scripts import featurizer_xgb

import xgboost as xgb
import sklearn
from xgboost.sklearn import XGBClassifier


%load_ext autoreload
%autoreload 2

### Lectura de set de datos ya depurado.

In [2]:
df = pd.read_csv("../data/dataset-procesado.csv", index_col=False)

In [3]:
df.shape

(263965, 123)

In [4]:
df.head()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,D_43,D_44,B_4,...,R_28,D_139,D_140,D_144,D_145,customer_ID,S_2,D_63,D_64,target
0,0.947242,0.002204,0.008002,1.007516,0.00283,0.453227,0.005992,0.153998,0.001613,0.029088,...,0.000518,0.003183,0.005264,0.004306,0.002719,743d55408505dee752adc6e62b273f4397db6e89f19034...,2017-08-10,CO,O,0
1,0.757648,0.002076,0.008751,0.814083,0.000408,0.1557,0.004156,0.127285,0.129051,0.040101,...,0.001186,1.000928,0.005655,0.007953,0.095926,522c85a354bfec790067b54e9db7353d20d4709ae5dc93...,2017-07-04,CO,O,0
2,0.673978,0.588604,0.295201,0.02465,0.007951,0.125583,0.00876,0.153998,0.005867,0.17595,...,0.000805,0.003558,0.008268,0.00302,0.002798,9296e983c3e4f99b046952397fc4ba950abf1032a9bd6c...,2017-03-30,CO,R,1
3,0.778075,0.00693,0.056721,0.819369,0.004922,0.089214,0.006884,0.041515,0.002841,0.001179,...,0.009053,0.00467,0.000125,0.000819,0.000342,3fa10314686e0e51d8f76235eceab91d70aa3f24a98c3e...,2018-01-17,CL,O,0
4,0.561622,0.032756,0.822028,0.025315,0.00557,1.1474,0.00829,0.153998,0.002991,0.31665,...,0.008689,0.003884,0.003367,0.00901,0.004627,1f81653f1397fe11839b585cb688e2e92ce1805023ba67...,2017-06-21,CO,O,1


###### Aplicacion de un script como featurizador para agilizar la creacion e iteracion de features.

In [5]:
df_featurizado = featurizer_xgb.featurizer(df)

----xgb-featurizer----


In [6]:
df_featurizado.shape

(263965, 154)

In [7]:
df_featurizado.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 263965 entries, 0 to 263964
Data columns (total 154 columns):
 #    Column                   Dtype  
---   ------                   -----  
 0    P_2                      float64
 1    D_39                     float64
 2    B_1                      float64
 3    B_2                      float64
 4    R_1                      float64
 5    S_3                      float64
 6    D_41                     float64
 7    D_43                     float64
 8    D_44                     float64
 9    B_4                      float64
 10   D_45                     float64
 11   B_5                      float64
 12   R_2                      float64
 13   D_46                     float64
 14   D_47                     float64
 15   B_6                      float64
 16   B_7                      float64
 17   D_50                     float64
 18   D_51                     float64
 19   B_9                      float64
 20   R_3                     

Como se puede observar, hay un incremento de columnas que viene por tratar con nuevas features en nuestro set de datos.

In [8]:
X = df_featurizado.drop(axis = 1, columns = ['target'])
y = df_featurizado.target

features = X.columns

In [9]:
X.shape

(263965, 153)

In [10]:
len(y)

263965

###### Train test split para poder diferenciar los sets de datos de entrenamiento de testing, stratifico la columna target ya que hay un desbalance de 70/30 entre labels positivos y negativos.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

### Clasificador 

In [12]:
xgb = XGBClassifier(random_state = 1)

Set de hiperparametros posibles para el random search

In [13]:
param_dist = {'n_estimators':[50, 100, 150],
              'max_depth':[5, 10],              
              'subsample': [0.5,0.8,1],
              'colsample_bytree': [0.5,0.8,1],              
              'learning_rate':[0.2, 0.3]
              }

In [14]:
random_search_xgb = RandomizedSearchCV(xgb,
                                       param_distributions = param_dist,
                                       n_iter = 10,
                                       scoring = 'roc_auc',
                                       cv = 5,
                                       n_jobs = -1,
                                       verbose=1)

In [15]:
%%time
random_search_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: user 5min 51s, sys: 578 ms, total: 5min 52s
Wall time: 26min 55s


In [16]:
params_opt_xgb = random_search_xgb.best_params_

Los mejores hiperparametros sacados del random search para el modelo de xgboost final

In [17]:
params_opt_xgb

{'subsample': 0.8,
 'n_estimators': 100,
 'max_depth': 5,
 'learning_rate': 0.2,
 'colsample_bytree': 0.5}

In [18]:
xgbModelBestParams = XGBClassifier(**params_opt_xgb)

In [19]:
%%time
xgbModelBestParams.fit(X_train, y_train)

CPU times: user 6min 39s, sys: 366 ms, total: 6min 39s
Wall time: 25.5 s


In [20]:
y_pred_temp2 = xgbModelBestParams.predict(X_train)

In [21]:
y_pred_test2 = xgbModelBestParams.predict(X_test)

Metricas de testeo para el set de training

In [22]:
confusion_matrix(y_train, y_pred_temp2)

array([[148839,  11220],
       [ 11922,  39191]])

In [23]:
print("precision = ", precision_score(y_train, y_pred_temp2))
print("recall = ", recall_score(y_train, y_pred_temp2))
print("auc = ", roc_auc_score(y_train, y_pred_temp2))
print("f1 score = ", f1_score(y_train, y_pred_temp2))

precision =  0.7774295292694055
recall =  0.7667520982920196
auc =  0.8483264736769641
f1 score =  0.7720538985855561


Metricas de testeo para el set de testing

In [24]:
confusion_matrix(y_test, y_pred_test2)

array([[36863,  3152],
       [ 3388,  9390]])

In [25]:
print("precision = ", precision_score(y_test, y_pred_test2))
print("recall = ", recall_score(y_test, y_pred_test2))
print("auc = ", roc_auc_score(y_test, y_pred_test2))
print("f1 score = ", f1_score(y_test, y_pred_test2))

precision =  0.7486844203476319
recall =  0.7348567850993896
auc =  0.8280431620111468
f1 score =  0.7417061611374408
