In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold,RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report, precision_recall_curve

from scripts import featurizer_xgb

import xgboost as xgb
import sklearn
from xgboost.sklearn import XGBClassifier


%load_ext autoreload
%autoreload 2

### Lectura de set de datos ya depurado.

In [2]:
df = pd.read_csv("../data/dataset-procesado.csv", index_col=False)

In [3]:
df.shape

(251091, 121)

In [4]:
df.head()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,D_43,D_44,B_4,...,R_28,D_139,D_140,D_144,D_145,customer_ID,S_2,D_63,D_64,target
0,0.947242,0.002204,0.008002,1.007516,0.00283,0.453227,0.005992,0.14548,0.001613,0.029088,...,0.000518,0.003183,0.005264,0.004306,0.002719,743d55408505dee752adc6e62b273f4397db6e89f19034...,2017-08-10,CO,O,0
1,0.757648,0.002076,0.008751,0.814083,0.000408,0.1557,0.004156,0.127285,0.129051,0.040101,...,0.001186,1.000928,0.005655,0.007953,0.095926,522c85a354bfec790067b54e9db7353d20d4709ae5dc93...,2017-07-04,CO,O,0
2,0.673978,0.588604,0.295201,0.02465,0.007951,0.125583,0.00876,0.14548,0.005867,0.17595,...,0.000805,0.003558,0.008268,0.00302,0.002798,9296e983c3e4f99b046952397fc4ba950abf1032a9bd6c...,2017-03-30,CO,R,1
3,0.778075,0.00693,0.056721,0.819369,0.004922,0.089214,0.006884,0.041515,0.002841,0.001179,...,0.009053,0.00467,0.000125,0.000819,0.000342,3fa10314686e0e51d8f76235eceab91d70aa3f24a98c3e...,2018-01-17,CL,O,0
4,0.561622,0.032756,0.822028,0.025315,0.00557,1.1474,0.00829,0.14548,0.002991,0.31665,...,0.008689,0.003884,0.003367,0.00901,0.004627,1f81653f1397fe11839b585cb688e2e92ce1805023ba67...,2017-06-21,CO,O,1


###### Aplicacion de un script como featurizador para agilizar la creacion e iteracion de features.

In [5]:
df_featurizado = featurizer_xgb.featurizer(df)

-------


In [6]:
df_featurizado.shape

(251091, 152)

Como se puede observar, hay un incremento de columnas que viene por tratar con nuevas features en nuestro set de datos.

In [7]:
X = df_featurizado.drop(axis = 1, columns = ['target'])
y = df_featurizado.target

features = X.columns

In [8]:
X.shape

(251091, 151)

In [9]:
len(y)

251091

###### Train test split para poder diferenciar los sets de datos de entrenamiento de testing, stratifico la columna target ya que hay un desbalance de 70/30 entre labels positivos y negativos.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

### Clasificador 

In [11]:
xgb = XGBClassifier()

Set de hiperparametros posibles para el random search

In [12]:
param_dist = {'n_estimators':[50, 100, 150],
              'max_depth':[5, 10],              
              'subsample': [0.5,0.8,1],
              'colsample_bytree': [0.5,0.8,1],              
              'learning_rate':[0.2, 0.3]
              }

In [13]:
random_search_xgb = RandomizedSearchCV(xgb,
                                       param_distributions = param_dist,
                                       n_iter = 10,
                                       scoring = 'roc_auc',
                                       cv = 5,
                                       n_jobs = -1,
                                       verbose=1)

In [14]:
%%time
random_search_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: user 6min 55s, sys: 936 ms, total: 6min 56s
Wall time: 50min 46s


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=None,
                                           gpu_id=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None, max_bin=None,...
                                           min_child_weight=None, missing=nan,
                                           monotone_constra

In [15]:
params_opt_xgb = random_search_xgb.best_params_

Los mejores hiperparametros sacados del random search para el modelo de xgboost final

In [16]:
params_opt_xgb

{'subsample': 1,
 'n_estimators': 100,
 'max_depth': 5,
 'learning_rate': 0.2,
 'colsample_bytree': 0.5}

In [17]:
xgbModelBestParams = XGBClassifier(**params_opt_xgb)

In [18]:
%%time
xgbModelBestParams.fit(X_train, y_train)

CPU times: user 7min 11s, sys: 613 ms, total: 7min 12s
Wall time: 27.9 s


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [19]:
y_pred_temp2 = xgbModelBestParams.predict(X_train)

In [20]:
y_pred_test2 = xgbModelBestParams.predict(X_test)

Metricas de testeo para el set de training

In [21]:
confusion_matrix(y_train, y_pred_temp2)

array([[147860,  10016],
       [ 11365,  31631]])

In [22]:
print("precision = ", precision_score(y_train, y_pred_temp2))
print("recall = ", recall_score(y_train, y_pred_temp2))
print("auc = ", roc_auc_score(y_train, y_pred_temp2))
print("f1 score = ", f1_score(y_train, y_pred_temp2))

precision =  0.7595024851730017
recall =  0.7356730858684529
auc =  0.8361154453639815
f1 score =  0.7473978946870975


Metricas de testeo para el set de testing

In [23]:
confusion_matrix(y_test, y_pred_test2)

array([[36626,  2844],
       [ 3188,  7561]])

In [24]:
print("precision = ", precision_score(y_test, y_pred_test2))
print("recall = ", recall_score(y_test, y_pred_test2))
print("auc = ", roc_auc_score(y_test, y_pred_test2))
print("f1 score = ", f1_score(y_test, y_pred_test2))

precision =  0.7266698702546852
recall =  0.7034142710949856
auc =  0.8156797729936544
f1 score =  0.7148529828873972
