In [25]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold,RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report, precision_recall_curve

from scripts import featurizer_xgb

import xgboost as xgb
import sklearn
from xgboost.sklearn import XGBClassifier


# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
# Re-running this cell on a live kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Lectura del set de datos ya depurado.

In [26]:
# Load the already-cleaned dataset from the project's data directory.
# NOTE(review): the recorded preview shows an "Unnamed: 0" column holding
# 0, 1, 2, ... which looks like a previously saved row index -- confirm that
# the featurizer drops it so it is not used as a model feature.
df = pd.read_csv(
    "../data/dataset-procesado.csv",
    index_col=False,
)

In [27]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(13256, 121)

In [28]:
# Preview the first rows to eyeball column names and value ranges.
df.head()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,D_43,D_44,B_4,...,R_28,D_139,D_140,D_144,D_145,customer_ID,S_2,D_63,D_64,target
0,0.503081,0.12269,0.060852,0.818435,0.005692,0.15086,0.002214,0.150801,0.126017,0.275044,...,0.00868,0.004475,0.001958,0.009255,0.005948,dbc39818725bb5a6693839b2cc0aa6416619e257937f3b...,2017-07-31,CO,U,0
1,0.967004,0.147385,0.025202,1.006426,0.007549,0.112134,0.005367,0.071737,0.006661,0.03455,...,0.008593,0.006886,0.002667,0.009003,0.008484,cc6156327f2bb0f6d7026c8f4bd67925d07e90aa30c3f2...,2018-01-01,CL,O,0
2,0.606902,0.178559,0.200311,0.027147,0.009911,0.128841,0.003677,0.104828,0.257234,0.417678,...,0.006294,0.007027,0.006109,0.001858,0.00115,9a563afbb91edf3b91add14584e2cbead02cadecefa6a9...,2017-03-16,CO,O,1
3,0.499094,0.478672,0.040157,0.818853,0.008475,0.147698,0.003302,0.124946,0.255405,0.39929,...,0.00682,0.000544,0.004316,0.002098,0.0082,f83df258e87389552401e86124b840eb743986f025c230...,2017-09-01,CR,O,0
4,0.492151,0.009685,0.043034,1.005668,0.005704,0.243877,0.007754,0.322158,0.0012,0.047605,...,0.005398,0.009299,0.006047,0.002128,0.00826,5f0c9387f66a49b668e5475fce137cca4acc101d14f8a0...,2017-10-12,CL,R,1


###### Aplicación de un script como featurizador para agilizar la creación e iteración de features.

In [29]:
# Build the model-ready frame with the project's featurizer module.
# The recorded shapes show the column count going from 121 to 152; what the
# featurizer does internally is not visible from this notebook.
df_featurizado = featurizer_xgb.featurizer(df)

-------


In [30]:
# Sanity check: (rows, columns) after featurization.
df_featurizado.shape

(13256, 152)

Como se puede observar, el número de columnas aumenta de 121 a 152 debido a las nuevas features generadas por el featurizador.

In [31]:
# Separate the label vector from the predictor matrix.
y = df_featurizado["target"]
X = df_featurizado.drop(columns=["target"])

# Keep the predictor column names around for later reference.
features = X.columns

In [32]:
# Sanity check: predictors should have one column fewer than the featurized frame.
X.shape

(13256, 151)

In [33]:
# Sanity check: the label vector must have as many entries as X has rows.
len(y)

13256

###### Train/test split para separar los sets de datos de entrenamiento y de testing. Se estratifica por la columna target, ya que hay un desbalance de 70/30 entre labels positivos y negativos.

In [34]:
# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep the same class proportions.
# random_state pins the shuffle: without it every kernel restart produces a
# different split, so the search results and reported metrics are not
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Clasificador 

In [35]:
# Base estimator with default hyper-parameters; the search overrides them.
# NOTE(review): this rebinds `xgb`, which the import cell bound to the xgboost
# module (`import xgboost as xgb`). After this cell the module alias is no
# longer reachable under that name -- consider renaming the estimator.
xgb = XGBClassifier()

Set de hiperparámetros posibles para el random search.

In [36]:
# Discrete candidate values for each hyper-parameter explored by the search.
param_dist = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5, 10],
    subsample=[0.5, 0.8, 1],
    colsample_bytree=[0.5, 0.8, 1],
    learning_rate=[0.2, 0.3],
)

In [37]:
# Randomized search over `param_dist`: 30 sampled candidates, each scored by
# ROC AUC with 5-fold cross-validation, using all available cores.
# random_state pins which 30 candidates are drawn from the grid; without it a
# re-run can explore a different subset and select different "best" params.
random_search_xgb = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

In [None]:
%%time
# Run the hyper-parameter search on the training split only, so the test split
# stays unseen. The recorded log shows 5 folds x 30 candidates = 150 fits.
random_search_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [None]:
# Keep the best hyper-parameter combination found by the search.
params_opt_xgb = random_search_xgb.best_params_

Los mejores hiperparámetros obtenidos del random search, que se usan en el modelo final de XGBoost.

In [None]:
# Display the selected hyper-parameters.
params_opt_xgb

In [None]:
# Build a fresh classifier from the selected hyper-parameters; it is still
# unfitted here and is trained in the following cell.
xgbModelBestParams = XGBClassifier(**params_opt_xgb)

In [None]:
%time
xgbModelBestParams.fit(X_train, y_train)

In [None]:
# Hard class predictions on the training split (despite the "temp" in the
# name, these are the train-set predictions used by the training metrics).
y_pred_temp2 = xgbModelBestParams.predict(X_train)

In [None]:
# Hard class predictions on the held-out test split.
y_pred_test2 = xgbModelBestParams.predict(X_test)

Métricas de evaluación sobre el set de training

In [None]:
# Confusion matrix of the true training labels against the model's hard
# predictions on the training split.
confusion_matrix(y_train, y_pred_temp2)

In [None]:
# ROC AUC is a ranking metric, so it must be scored on the predicted
# probability of the positive class. Feeding it thresholded 0/1 predictions
# collapses the ROC curve to a single operating point and misreports the AUC.
y_score_train = xgbModelBestParams.predict_proba(X_train)[:, 1]

# Precision, recall and F1 are threshold metrics and use the hard predictions.
print("precision = ", precision_score(y_train, y_pred_temp2))
print("recall = ", recall_score(y_train, y_pred_temp2))
print("auc = ", roc_auc_score(y_train, y_score_train))
print("f1 score = ", f1_score(y_train, y_pred_temp2))

Métricas de evaluación sobre el set de testing

In [None]:
# Confusion matrix of the true test labels against the model's hard
# predictions on the held-out test split.
confusion_matrix(y_test, y_pred_test2)

In [None]:
# ROC AUC is a ranking metric, so it must be scored on the predicted
# probability of the positive class. Feeding it thresholded 0/1 predictions
# collapses the ROC curve to a single operating point and misreports the AUC.
y_score_test = xgbModelBestParams.predict_proba(X_test)[:, 1]

# Precision, recall and F1 are threshold metrics and use the hard predictions.
print("precision = ", precision_score(y_test, y_pred_test2))
print("recall = ", recall_score(y_test, y_pred_test2))
print("auc = ", roc_auc_score(y_test, y_score_test))
print("f1 score = ", f1_score(y_test, y_pred_test2))