In [None]:
pip install pycaret

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgbm

In [2]:
import pycaret.classification

In [3]:
# Opción para ver todas las columnas del dataset en el notebook
pd.set_option('display.max_columns', 50)

# Cargamos los datos

In [4]:
# Leemos el dataset con la función de pandas "read_csv"
key = "data/bank-additional-full.csv"
df = pd.read_csv(key, sep=";")

In [5]:
df.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


# División en grupo de test y train

In [6]:
# Reemplazamos la columna y (target) por 1 y 0
df.y = df.y.replace('yes', 1)
df.y = df.y.replace('no', 0)

In [7]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.y, random_state=42)

### Análisis global con PyCaret

In [8]:
from pycaret.classification import *
clf = setup(data = df_train, target ="y" )

Unnamed: 0,Description,Value
0,session_id,8427
1,Target,y
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(32950, 21)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,11
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
#best = compare_models()
best = compare_models(sort = 'f1') #default is 'Accuracy', LO CAMBIO POR F1 QUE ES LA QUE TOMAMOS COMO REFERENCIA

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9148,0.9475,0.5321,0.6437,0.5823,0.5353,0.5385,0.476
xgboost,Extreme Gradient Boosting,0.9126,0.9443,0.5285,0.6299,0.5741,0.5259,0.5288,3.521
catboost,CatBoost Classifier,0.9143,0.9463,0.5181,0.6449,0.5739,0.5269,0.5312,9.512
gbc,Gradient Boosting Classifier,0.9132,0.9442,0.501,0.6447,0.563,0.5157,0.5212,2.33
lda,Linear Discriminant Analysis,0.9075,0.9318,0.4916,0.6055,0.5421,0.4913,0.4949,0.339
knn,K Neighbors Classifier,0.9054,0.8698,0.4769,0.5956,0.5288,0.477,0.4811,0.829
rf,Random Forest Classifier,0.9105,0.9375,0.4345,0.6473,0.5195,0.4723,0.484,1.341
dt,Decision Tree Classifier,0.8884,0.7251,0.515,0.5008,0.5073,0.4444,0.4447,0.139
lr,Logistic Regression,0.9079,0.9284,0.3906,0.6442,0.4855,0.4383,0.4553,3.335
ada,Ada Boost Classifier,0.9071,0.935,0.3809,0.642,0.4775,0.4302,0.4482,0.639


### Continúo división train, test y validation

In [10]:
X = df.drop(columns='y')
y = df.y

In [11]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) 

In [12]:
X_temp.shape, y_temp.shape, X_test.shape, y_test.shape

((32950, 20), (32950,), (8238, 20), (8238,))

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42)

In [14]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((26360, 20), (26360,), (6590, 20), (6590,))

# Pre-procesamiento

In [None]:
#Todas las variables del dataset
variables_categoricas_original = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
variables_numericas_original = ['age', 'duration', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

In [None]:
#Las variables que identificamos como relevantes
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

In [15]:
class SelectColumnsTransformer():
    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        return self

In [None]:
#Aplicamos las transformaciones previas a los conjuntos de Train y Validation
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]     

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             #('standard_scaler', StandardScaler()),
                              ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                  ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                       ('cat', pipeline_categorico, variables_categoricas)
                                      ])


train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.fit_transform(X_v)

# Métricas

Dado el desbalance de casos con que cuenta este dataset, las métricas a tomar en consideración para el análisis son: AUC y F1.

# Testeo por modelos

## Decision Tree

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             #('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('dt', DecisionTreeClassifier(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo.fit(X_t, y_train)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7f9546ae2ed0>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat'...
                                                   'poutcome

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('precision','recall','f1', 'roc_auc'))

{'fit_time': array([0.66887331, 0.66426802, 0.65012574, 0.63357973, 0.64320779]),
 'score_time': array([0.05395627, 0.05164504, 0.05172205, 0.04956985, 0.0503974 ]),
 'test_f1': array([0.32550607, 0.32682927, 0.32518955, 0.30560272, 0.30016313]),
 'test_precision': array([0.31357254, 0.31603774, 0.32546374, 0.30821918, 0.29113924]),
 'test_recall': array([0.33838384, 0.33838384, 0.32491582, 0.3030303 , 0.30976431]),
 'test_roc_auc': array([0.62267988, 0.62309589, 0.62050083, 0.60900116, 0.60736138])}

Análisis de estos resultados sobre el conjunto de validación

In [None]:
train=pipeline_completo.fit_transform(X_t)

In [None]:
dt=DecisionTreeClassifier(random_state=0, class_weight="balanced")
dt.fit(train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [None]:
X_v=X_val[variables_categoricas + variables_numericas]
val=pipeline_completo.fit_transform(X_v)

In [None]:
y_val_pred=dt.predict(val)
print("VALIDACIÓN")
print(classification_report(y_val, y_val_pred))

VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.90      0.91      5848
           1       0.28      0.32      0.30       742

    accuracy                           0.83      6590
   macro avg       0.60      0.61      0.60      6590
weighted avg       0.84      0.83      0.84      6590



## Logistic Regression

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('lr', LogisticRegression(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd582729610>),
                                                                  ('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                  

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([0.53868079, 0.75247121, 0.78675079, 0.81195235, 0.79544616]),
 'score_time': array([0.06840634, 0.08396602, 0.08711243, 0.0852983 , 0.08784676]),
 'test_f1': array([0.38568019, 0.38275699, 0.37743006, 0.38307985, 0.3780037 ]),
 'test_roc_auc': array([0.77777022, 0.77820927, 0.76186962, 0.77274185, 0.76329329])}

## SVM

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('svm', SVC(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd58267b9d0>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat'...
                                                           v

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([60.6817162 , 55.7583847 , 54.49913716, 58.66039801, 56.12915897]),
 'score_time': array([12.16631556, 11.8804481 , 11.91367435, 11.80435562, 11.78719926]),
 'test_f1': array([0.36551724, 0.36698033, 0.35686103, 0.36570429, 0.35428089]),
 'test_roc_auc': array([0.77124098, 0.76489672, 0.75543143, 0.75780878, 0.74833935])}

## Naive Bayes

In [16]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             ("kbins_discretizer", KBinsDiscretizer(n_bins=4, encode="ordinal", strategy="uniform")),   #strategy="uniform"
                             ('bins_cat', OneHotEncoder())
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())
                                 ])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('nb', ComplementNB())])

#The Complement Naive Bayes classifier was designed to correct the “severe assumptions” made by the standard Multinomial Naive Bayes classifier. It is particularly suited for imbalanced data sets.
#En el pre-procesamiento transformé todos los atributos en categóricos, porque es el requerimiento del tipo de modelo

In [None]:
pipeline_modelo

In [17]:
#Cross validation con toda la estimación sobre X_train
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([1.41415572, 0.86170268, 0.84408879, 0.87501144, 0.91733503]),
 'score_time': array([0.07812595, 0.03962302, 0.06250358, 0.06251788, 0.04687977]),
 'test_f1': array([0.36495032, 0.37488948, 0.35423341, 0.36487716, 0.35874835]),
 'test_roc_auc': array([0.76568611, 0.77107742, 0.7508099 , 0.76077488, 0.75201567])}

In [18]:
#Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [19]:
nb=ComplementNB()
nb.fit(train, y_train)

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

In [20]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, nb.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, nb.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.95      0.74      0.83     23390
           1       0.25      0.68      0.36      2970

    accuracy                           0.73     26360
   macro avg       0.60      0.71      0.60     26360
weighted avg       0.87      0.73      0.78     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.95      0.73      0.83      5848
           1       0.24      0.68      0.36       742

    accuracy                           0.73      6590
   macro avg       0.60      0.71      0.59      6590
weighted avg       0.87      0.73      0.77      6590



In [21]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'alpha':[1, 0.5, 0.3, 0.1, 1.2, 1.5],
        'fit_prior':[True, False],
        'norm':[True,False]
       }

nb=ComplementNB()

In [22]:
#Búsqueda de parámetros
cv_nb = GridSearchCV(nb, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
cv_nb.fit(train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=ComplementNB(alpha=1.0, class_prior=None, fit_prior=True,
                                    norm=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [1, 0.5, 0.3, 0.1, 1.2, 1.5],
                         'fit_prior': [True, False], 'norm': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=0)

In [23]:
cv_nb.best_params_

{'alpha': 0.3, 'fit_prior': True, 'norm': True}

In [24]:
#Entrenamiento de la mejor versión encontrada del modelo
nb_best = ComplementNB(alpha=0.3, fit_prior=True, norm=True)
nb_best.fit(train, y_train)

ComplementNB(alpha=0.3, class_prior=None, fit_prior=True, norm=True)

In [25]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, nb_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, nb_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.93      0.91      0.92     23390
           1       0.38      0.43      0.40      2970

    accuracy                           0.86     26360
   macro avg       0.65      0.67      0.66     26360
weighted avg       0.86      0.86      0.86     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      5848
           1       0.37      0.41      0.39       742

    accuracy                           0.85      6590
   macro avg       0.65      0.66      0.65      6590
weighted avg       0.86      0.85      0.86      6590



## Random Forest

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('rf', RandomForestClassifier(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd581d16b90>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat'...
                 RandomForestClassifier(bootstrap=True, ccp_

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([2.65647292, 2.51803756, 2.39808679, 2.47354674, 2.47565603]),
 'score_time': array([0.35200453, 0.33567739, 0.3027122 , 0.30561471, 0.37091136]),
 'test_f1': array([0.32018561, 0.32821724, 0.31153389, 0.32328106, 0.35111111]),
 'test_roc_auc': array([0.76488503, 0.76032845, 0.75850802, 0.76109193, 0.75566086])}

## KNN

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('knn', KNeighborsClassifier())])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd581cd4c50>),
                                                                  ('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                  

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([0.36779571, 0.36578655, 0.33474255, 0.37986541, 0.38184404]),
 'score_time': array([ 9.78636217,  8.91680884,  9.06589484, 10.27293444,  9.09081459]),
 'test_f1': array([0.31336406, 0.33179724, 0.32590856, 0.3087886 , 0.35386338]),
 'test_roc_auc': array([0.70310739, 0.70060157, 0.69511615, 0.70793873, 0.71124455])}

## Modelos Tree Based

### XGBOOST

In [None]:
import xgboost as xgb

In [26]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('xgb', xgb.XGBClassifier(seed=0))])

In [27]:
pipeline_modelo.fit(X_t, y_train)



Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000018268D7B80>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('...
                               learning_rate=0.300000012, ma

In [28]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))



{'fit_time': array([3.11002564, 2.95222878, 2.91297197, 2.9912858 , 2.98509741]),
 'score_time': array([0.08205843, 0.08205891, 0.08205676, 0.08105803, 0.09206629]),
 'test_f1': array([0.36959064, 0.37198622, 0.35824437, 0.3712297 , 0.3681257 ]),
 'test_roc_auc': array([0.77572738, 0.77711489, 0.77857922, 0.78029691, 0.77348679])}

In [29]:
pipeline_modelo[1].feature_importances_    #No se entiende este 

array([0.02291044, 0.02426711, 0.02155347, 0.04411939, 0.09447913,
       0.02118124, 0.02820904, 0.02015119, 0.02030318, 0.02230303,
       0.02138294, 0.0245105 , 0.0220443 , 0.02649062, 0.02912351,
       0.01692515, 0.02376754, 0.02346515, 0.02592699, 0.02173736,
       0.02110064, 0.03140582, 0.02401848, 0.00233567, 0.02450542,
       0.02417863, 0.        , 0.        , 0.01997583, 0.        ,
       0.02702206, 0.        , 0.04010792, 0.        , 0.03302651,
       0.        , 0.19747172], dtype=float32)

In [30]:
#Solo Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [31]:
xgb=xgb.XGBClassifier(seed=0)
xgb.fit(train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=0, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [32]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     23390
           1       0.91      0.43      0.58      2970

    accuracy                           0.93     26360
   macro avg       0.92      0.71      0.77     26360
weighted avg       0.93      0.93      0.92     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5848
           1       0.58      0.27      0.37       742

    accuracy                           0.90      6590
   macro avg       0.75      0.62      0.66      6590
weighted avg       0.88      0.90      0.88      6590



In [38]:
import xgboost as xgb

In [39]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'objective':["binary:logistic","binary:hinge","binary:logitraw"],
        'learning_rate':[ 0.1,0.2,0.3],
        'max_depth':[2,4, 6, 7, 8, 10],
        'alpha':[2, 3, 5, 7],
        "n_estimators":[5, 7, 10]
       }
xg=xgb.XGBClassifier()

In [41]:
#Búsqueda de parámetros
cv_xgb = GridSearchCV(xg, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
cv_xgb.fit(train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     use_label_encoder=True,
                                     validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [2, 3, 5, 7],
                         'learning_rate': [0.1, 0.2,

In [42]:
cv_xgb.best_params_

{'alpha': 2,
 'learning_rate': 0.1,
 'max_depth': 8,
 'n_estimators': 7,
 'objective': 'binary:hinge'}

In [None]:
import xgboost as xgb

In [43]:
#Entrenamiento de la mejor versión encontrada del modelo
xgb_best = xgb.XGBClassifier(seed=0, alpha= 2, learning_rate= 0.1, max_depth= 8, n_estimators=7, objective='binary:hinge')
xgb_best.fit(train, y_train)

XGBClassifier(alpha=2, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=7, n_jobs=4, num_parallel_tree=1,
              objective='binary:hinge', random_state=0, reg_alpha=2,
              reg_lambda=1, scale_pos_weight=None, seed=0, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [44]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.94      0.91      0.92     23390
           1       0.44      0.57      0.50      2970

    accuracy                           0.87     26360
   macro avg       0.69      0.74      0.71     26360
weighted avg       0.89      0.87      0.88     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      5848
           1       0.44      0.58      0.50       742

    accuracy                           0.87      6590
   macro avg       0.69      0.74      0.71      6590
weighted avg       0.89      0.87      0.88      6590



In [73]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'objective':["binary:logistic","binary:hinge","binary:logitraw"],
        'learning_rate':[0.05,0.1,0.15,0.2,0.25,0.3,0.4,0.5],
        'max_depth':[2,3, 4,5, 6, 7, 8, 9, 10, 12, 15],
        'alpha':[0, 0.5, 1, 2, 3, 5],
        'lambda':[0.5, 1, 2, 3, 5],
        "n_estimators":[3, 5, 6, 7, 8, 9, 10, 15],
        "booster":["gbtree","dart"],
        "gamma":[0.5,1,2,5, 7, 8],
        "tree_method":["auto","exact","approx","hist"]
       }
xg=xgb.XGBClassifier()

In [74]:
#Búsqueda de parámetros
rcv_xgb = RandomizedSearchCV(xg, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
rcv_xgb.fit(train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n...
                                        'lambda': [0.5, 1, 2, 3, 5],
                                        'learning_rate': [0.05, 0.1, 0.15, 0.2,
                               

In [75]:
rcv_xgb.best_params_

{'tree_method': 'approx',
 'objective': 'binary:hinge',
 'n_estimators': 15,
 'max_depth': 7,
 'learning_rate': 0.25,
 'lambda': 3,
 'gamma': 8,
 'booster': 'dart',
 'alpha': 0}

In [80]:
#Entrenamiento de la mejor versión encontrada del modelo
xgb_best_r = xgb.XGBClassifier(seed=0, tree_method= 'approx', objective="binary:hinge", n_estimators=3, max_depth= 6,
                               learning_rate= 0.25, reg_lambda=3, gamma= 8, booster="dart", 
                               alpha= 0 )   #el lambda por default es 1
xgb_best_r.fit(train, y_train)

XGBClassifier(alpha=0, base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=8, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.25, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=3, n_jobs=4, num_parallel_tree=1,
              objective='binary:hinge', random_state=0, reg_alpha=0,
              reg_lambda=3, scale_pos_weight=None, seed=0, subsample=1,
              tree_method='approx', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [81]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_best_r.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_best_r.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     23390
           1       0.44      0.54      0.49      2970

    accuracy                           0.87     26360
   macro avg       0.69      0.73      0.71     26360
weighted avg       0.88      0.87      0.88     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      5848
           1       0.46      0.57      0.51       742

    accuracy                           0.88      6590
   macro avg       0.70      0.74      0.72      6590
weighted avg       0.89      0.88      0.88      6590



### LigthGBM

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('lgbm', lgbm(seed=0))])

In [None]:
pipeline_modelo

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc', "precision", "recall"))