In [None]:
pip install pycaret

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgbm
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
import pycaret.classification

In [None]:
# Let the notebook render up to 50 columns so wide frames are shown in full.
pd.options.display.max_columns = 50

# Cargamos los datos

In [None]:
# Read the semicolon-delimited dataset from GitHub with pandas' read_csv.
url = (
    "https://raw.githubusercontent.com/AgusCarchano/Mentorias-grupo1/"
    "master/data/bank-additional-full.csv"
)
df = pd.read_csv(url, delimiter=";")

In [None]:
df.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


# División en grupo de test y train

In [None]:
# Recode the target column y: 'yes' becomes 1 and 'no' becomes 0.
df.y = df.y.replace({'yes': 1, 'no': 0})

In [None]:
# 80/20 split of the whole frame (target column included), stratified on y
# and seeded so the partition is repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["y"],
)

### Análisis global con PyCaret

In [None]:
# NOTE(review): the wildcard import is kept because later cells call PyCaret
# helpers (e.g. compare_models) by bare name; switch to explicit imports once
# the full list of helpers used by the notebook is known.
from pycaret.classification import *

# Pin session_id so the PyCaret experiment is repeatable from run to run.
# When it is omitted PyCaret draws a random id (the recorded run shows 8073),
# so the model comparison cannot be reproduced.
clf = setup(data=df_train, target="y", session_id=42)

Unnamed: 0,Description,Value
0,session_id,8073
1,Target,y
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(32950, 21)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,11
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# Rank PyCaret's candidate models by AUC rather than the default 'Accuracy',
# because AUC is the metric this analysis takes as its reference.
best = compare_models(sort = 'AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9139,0.9468,0.5311,0.6525,0.5849,0.5375,0.5414,0.434
gbc,Gradient Boosting Classifier,0.9143,0.9439,0.5053,0.6643,0.5736,0.527,0.5333,3.998
rf,Random Forest Classifier,0.9101,0.9359,0.4381,0.6612,0.5264,0.479,0.4918,2.76
ada,Ada Boost Classifier,0.908,0.9344,0.4032,0.6597,0.5002,0.4528,0.4696,1.14
lda,Linear Discriminant Analysis,0.9061,0.931,0.5019,0.6078,0.5493,0.4975,0.5006,0.345
lr,Logistic Regression,0.9073,0.9287,0.4096,0.651,0.5022,0.4541,0.4692,4.001
et,Extra Trees Classifier,0.8995,0.9109,0.3485,0.604,0.4418,0.391,0.409,3.206
knn,K Neighbors Classifier,0.9027,0.8637,0.4742,0.5933,0.5265,0.4731,0.4771,1.236
nb,Naive Bayes,0.8704,0.832,0.4704,0.4381,0.4532,0.3799,0.3805,0.062
dt,Decision Tree Classifier,0.8868,0.7253,0.516,0.5047,0.51,0.446,0.4462,0.213


### Continúo división train, test y validation

In [None]:
# Separate the target column from the predictor columns.
y = df["y"]
X = df.drop(columns=["y"])

In [None]:
# Hold out 20% of the rows as the test set, stratified on the target.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_temp.shape, y_temp.shape, X_test.shape, y_test.shape

((32950, 20), (32950,), (8238, 20), (8238,))

In [None]:
# Split the remaining rows again: 80% for training, 20% for validation,
# stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp
)

In [None]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((26360, 20), (26360,), (6590, 20), (6590,))

# Pre-procesamiento

In [None]:
# Every feature column of the dataset, grouped by type.
variables_categoricas_original = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "day_of_week", "poutcome",
]
variables_numericas_original = [
    "age", "duration", "campaign", "pdays", "previous",
    "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
]

In [None]:
# The subset of features identified as relevant for modelling.
variables_categoricas = [
    "job", "marital", "education", "default",
    "housing", "loan", "contact", "poutcome",
]
variables_numericas = [
    "age", "campaign", "previous", "cons.conf.idx", "euribor3m",
]

In [None]:
class SelectColumnsTransformer():
    """Pipeline step that keeps a fixed subset of DataFrame columns.

    The class follows the scikit-learn estimator protocol by hand
    (``get_params`` / ``set_params`` / ``fit`` / ``transform`` /
    ``fit_transform``) so it can be cloned parameter-by-parameter and tuned
    inside a ``Pipeline`` instead of relying on a deep copy.

    Parameters
    ----------
    columns : list of str or None, default=None
        Column labels to keep. ``None`` keeps every column.
    """

    def __init__(self, columns=None):
        # Stored exactly as given: parameter-based cloning expects __init__
        # to keep its arguments untouched.
        self.columns = columns

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is accepted for API compatibility)."""
        return {"columns": self.columns}

    def set_params(self, **params):
        """Update constructor parameters and return ``self``.

        Raises
        ------
        ValueError
            If a parameter other than ``columns`` is given.
        """
        for name, value in params.items():
            if name != "columns":
                raise ValueError(
                    "Invalid parameter %r for SelectColumnsTransformer" % (name,)
                )
            self.columns = value
        return self

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` restricted to ``self.columns``.

        With ``columns=None`` a copy of the whole frame is returned instead
        of failing on ``X[None]``.
        """
        if self.columns is None:
            return X.copy()
        return X[self.columns].copy()

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y, **fit_params).transform(X)

In [None]:
# Apply the preprocessing to the train and validation sets.
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected features
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

# Numeric branch: discretize each feature into 5 quantile bins, encoded as ordinal integers.
pipeline_numerico = Pipeline([
    ('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
    ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile")),
])

# Categorical branch: "unknown" is treated as missing and replaced by the most
# frequent value (the step could be dropped to keep "unknown" as its own
# category), then one-hot encoded. handle_unknown="ignore" encodes a category
# that was not seen during fit as all zeros instead of raising.
pipeline_categorico = Pipeline([
    ('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),
    ('cat', OneHotEncoder(handle_unknown="ignore")),
])

pipeline_completo = ColumnTransformer([
    ('num', pipeline_numerico, variables_numericas),
    ('cat', pipeline_categorico, variables_categoricas),
])

# Fit on the training features only. The validation features must go through
# the already-fitted transformer: re-fitting on them would compute different
# bin edges / category columns and leak validation statistics.
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

# Métricas

Dado el desbalance de casos con que cuenta este dataset, las métricas a tomar en consideración para el análisis son: AUC y F1.

# Testeo por modelos

## Decision Tree

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected features
X_t = X_train[variables_categoricas + variables_numericas]

# Numeric branch: column selection only (no scaling or binning in this pipeline).
pipeline_numerico = Pipeline([
    ('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
])

# Categorical branch: "unknown" is imputed with the most frequent value, then
# one-hot encoded. handle_unknown="ignore" keeps cross-validation and later
# transform() calls from raising when a fold or the validation set contains a
# category that was absent from the data the encoder was fitted on.
pipeline_categorico = Pipeline([
    ('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),
    ('cat', OneHotEncoder(handle_unknown="ignore")),
])

pipeline_completo = ColumnTransformer([
    ('num', pipeline_numerico, variables_numericas),
    ('cat', pipeline_categorico, variables_categoricas),
])

# Preprocessing + decision tree with balanced class weights and a fixed seed.
pipeline_modelo = Pipeline([
    ('preprocess', pipeline_completo),
    ('dt', DecisionTreeClassifier(random_state=0, class_weight="balanced")),
])

In [None]:
pipeline_modelo.fit(X_t, y_train)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7f9546ae2ed0>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat'...
                                                   'poutcome

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('precision','recall','f1', 'roc_auc'))

{'fit_time': array([0.66887331, 0.66426802, 0.65012574, 0.63357973, 0.64320779]),
 'score_time': array([0.05395627, 0.05164504, 0.05172205, 0.04956985, 0.0503974 ]),
 'test_f1': array([0.32550607, 0.32682927, 0.32518955, 0.30560272, 0.30016313]),
 'test_precision': array([0.31357254, 0.31603774, 0.32546374, 0.30821918, 0.29113924]),
 'test_recall': array([0.33838384, 0.33838384, 0.32491582, 0.3030303 , 0.30976431]),
 'test_roc_auc': array([0.62267988, 0.62309589, 0.62050083, 0.60900116, 0.60736138])}

Análisis de estos resultados sobre el conjunto de validación

In [None]:
# Fit the preprocessing transformer on the training features and keep the
# transformed training matrix.
train=pipeline_completo.fit_transform(X_t)

In [None]:
# Stand-alone decision tree (balanced class weights, fixed seed) trained on
# the pre-processed training matrix.
dt = DecisionTreeClassifier(class_weight="balanced", random_state=0)
dt.fit(train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [None]:
X_v = X_val[variables_categoricas + variables_numericas]
# Use transform(), not fit_transform(): the tree was trained on columns
# produced by the transformer fitted on the training data, so the validation
# features must be encoded with that same fitted transformer. Re-fitting here
# would rebuild the one-hot columns from the validation set alone.
val = pipeline_completo.transform(X_v)

In [None]:
# Per-class precision / recall / F1 of the decision tree on the validation set.
y_val_pred = dt.predict(val)
print("VALIDACIÓN", classification_report(y_val, y_val_pred), sep="\n")

VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.90      0.91      5848
           1       0.28      0.32      0.30       742

    accuracy                           0.83      6590
   macro avg       0.60      0.61      0.60      6590
weighted avg       0.84      0.83      0.84      6590



## Logistic Regression

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected features
X_t = X_train[variables_categoricas + variables_numericas]

# Numeric branch: standardize each feature.
pipeline_numerico = Pipeline([
    ('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
    ('standard_scaler', StandardScaler()),
])

# Categorical branch: no imputation here, so "unknown" stays as a category of
# its own. handle_unknown="ignore" encodes a category missing from the fitted
# data as all zeros instead of raising, which matters for rare levels that
# may be absent from a cross-validation training fold.
pipeline_categorico = Pipeline([
    ('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
    ('cat', OneHotEncoder(handle_unknown="ignore")),
])

pipeline_completo = ColumnTransformer([
    ('num', pipeline_numerico, variables_numericas),
    ('cat', pipeline_categorico, variables_categoricas),
])

# Preprocessing + logistic regression with balanced class weights.
pipeline_modelo = Pipeline([
    ('preprocess', pipeline_completo),
    ('lr', LogisticRegression(random_state=0, class_weight="balanced")),
])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd582729610>),
                                                                  ('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                  

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([0.53868079, 0.75247121, 0.78675079, 0.81195235, 0.79544616]),
 'score_time': array([0.06840634, 0.08396602, 0.08711243, 0.0852983 , 0.08784676]),
 'test_f1': array([0.38568019, 0.38275699, 0.37743006, 0.38307985, 0.3780037 ]),
 'test_roc_auc': array([0.77777022, 0.77820927, 0.76186962, 0.77274185, 0.76329329])}

## SVM

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected features
X_t = X_train[variables_categoricas + variables_numericas]

# Numeric branch: standardize each feature. SVMs are not scale invariant, so
# the scaler that was commented out is enabled; without it the raw numeric
# ranges dominate the 0/1 one-hot columns in the kernel distance.
pipeline_numerico = Pipeline([
    ('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
    ('standard_scaler', StandardScaler()),
])

# Categorical branch: no imputation, "unknown" stays as its own category.
# handle_unknown="ignore" encodes categories unseen at fit time as all zeros
# instead of raising inside a cross-validation fold.
pipeline_categorico = Pipeline([
    ('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
    ('cat', OneHotEncoder(handle_unknown="ignore")),
])

pipeline_completo = ColumnTransformer([
    ('num', pipeline_numerico, variables_numericas),
    ('cat', pipeline_categorico, variables_categoricas),
])

# Preprocessing + support vector classifier with balanced class weights.
pipeline_modelo = Pipeline([
    ('preprocess', pipeline_completo),
    ('svm', SVC(random_state=0, class_weight="balanced")),
])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd58267b9d0>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat'...
                                                           v

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([60.6817162 , 55.7583847 , 54.49913716, 58.66039801, 56.12915897]),
 'score_time': array([12.16631556, 11.8804481 , 11.91367435, 11.80435562, 11.78719926]),
 'test_f1': array([0.36551724, 0.36698033, 0.35686103, 0.36570429, 0.35428089]),
 'test_roc_auc': array([0.77124098, 0.76489672, 0.75543143, 0.75780878, 0.74833935])}

## Naive Bayes

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected features
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

# Numeric branch: cut each feature into 4 equal-width bins and one-hot encode
# the bin index. handle_unknown="ignore" covers a bin that is empty in the
# data the encoder was fitted on but populated in a later fold or set.
pipeline_numerico = Pipeline([
    ('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
    ("kbins_discretizer", KBinsDiscretizer(n_bins=4, encode="ordinal", strategy="uniform")),
    ('bins_cat', OneHotEncoder(handle_unknown="ignore")),
])

# Categorical branch: "unknown" is imputed with the most frequent value (the
# step could be dropped to keep "unknown" as its own category), then one-hot
# encoded, again tolerating categories unseen at fit time.
pipeline_categorico = Pipeline([
    ('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),
    ('cat', OneHotEncoder(handle_unknown="ignore")),
])

pipeline_completo = ColumnTransformer([
    ('num', pipeline_numerico, variables_numericas),
    ('cat', pipeline_categorico, variables_categoricas),
])

pipeline_modelo = Pipeline([
    ('preprocess', pipeline_completo),
    ('nb', ComplementNB()),
])

# The Complement Naive Bayes classifier was designed to correct the "severe assumptions" made by the standard Multinomial Naive Bayes classifier. It is particularly suited for imbalanced data sets.
# The preprocessing turns every attribute into a categorical one because this type of model requires it.

In [None]:
pipeline_modelo

In [None]:
#Cross validation con toda la estimación sobre X_train
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([0.55301046, 0.55523276, 0.54159117, 0.55705428, 0.54250813]),
 'score_time': array([0.05808043, 0.05570722, 0.05618405, 0.05498481, 0.0617044 ]),
 'test_f1': array([0.36495032, 0.37488948, 0.35423341, 0.36487716, 0.35874835]),
 'test_roc_auc': array([0.76568611, 0.77107742, 0.7508099 , 0.76077488, 0.75201567])}

In [None]:
# Pre-processing: fit the transformer on the training features only, then
# apply that fitted transformer to the validation features.
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [None]:
# Baseline Complement Naive Bayes with default hyper-parameters, trained on
# the pre-processed training matrix.
nb = ComplementNB()
nb.fit(X=train, y=y_train)

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

In [None]:
# Classification report of the baseline model on train and on validation.
for title, y_true, features in (
    ("MÉTRICAS CONJUNTO DE TRAIN", y_train, train),
    ("MÉTRICAS CONJUNTO DE VALIDACIÓN", y_val, val),
):
    print(title)
    print(classification_report(y_true, nb.predict(features)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.95      0.74      0.83     23390
           1       0.25      0.68      0.36      2970

    accuracy                           0.73     26360
   macro avg       0.60      0.71      0.60     26360
weighted avg       0.87      0.73      0.78     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.95      0.73      0.83      5848
           1       0.24      0.68      0.36       742

    accuracy                           0.73      6590
   macro avg       0.60      0.71      0.59      6590
weighted avg       0.87      0.73      0.77      6590



In [None]:
# Hyper-parameter optimization: search grid for ComplementNB.
params = dict(
    alpha=[1, 0.5, 0.3, 0.1, 1.2, 1.5],
    fit_prior=[True, False],
    norm=[True, False],
)

nb = ComplementNB()

In [None]:
# Exhaustive 5-fold grid search scored on F1, run on all available cores;
# refit=True retrains the best combination on the whole training matrix.
cv_nb = GridSearchCV(
    estimator=nb,
    param_grid=params,
    scoring='f1',
    cv=5,
    refit=True,
    n_jobs=-1,
)
cv_nb.fit(train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=ComplementNB(alpha=1.0, class_prior=None, fit_prior=True,
                                    norm=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [1, 0.5, 0.3, 0.1, 1.2, 1.5],
                         'fit_prior': [True, False], 'norm': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=0)

In [None]:
cv_nb.best_params_

{'alpha': 0.3, 'fit_prior': True, 'norm': True}

In [None]:
# Train the best version of the model found by the grid search.
# The hyper-parameters are read from cv_nb.best_params_ instead of being
# copied by hand, so this cell stays correct if the grid or the data change
# (the recorded run selected alpha=0.3, fit_prior=True, norm=True).
nb_best = ComplementNB(**cv_nb.best_params_)
nb_best.fit(train, y_train)

ComplementNB(alpha=0.3, class_prior=None, fit_prior=True, norm=True)

In [None]:
# Classification report of the tuned model on train and on validation.
for title, y_true, features in (
    ("MÉTRICAS CONJUNTO DE TRAIN", y_train, train),
    ("MÉTRICAS CONJUNTO DE VALIDACIÓN", y_val, val),
):
    print(title)
    print(classification_report(y_true, nb_best.predict(features)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.93      0.91      0.92     23390
           1       0.38      0.43      0.40      2970

    accuracy                           0.86     26360
   macro avg       0.65      0.67      0.66     26360
weighted avg       0.86      0.86      0.86     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      5848
           1       0.37      0.41      0.39       742

    accuracy                           0.85      6590
   macro avg       0.65      0.66      0.65      6590
weighted avg       0.86      0.85      0.86      6590



## Random Forest

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected features
X_t = X_train[variables_categoricas + variables_numericas]

# Numeric branch: column selection only (no scaling or binning in this pipeline).
pipeline_numerico = Pipeline([
    ('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
])

# Categorical branch: no imputation, "unknown" stays as its own category.
# handle_unknown="ignore" encodes categories unseen at fit time as all zeros
# instead of raising inside a cross-validation fold.
pipeline_categorico = Pipeline([
    ('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
    ('cat', OneHotEncoder(handle_unknown="ignore")),
])

pipeline_completo = ColumnTransformer([
    ('num', pipeline_numerico, variables_numericas),
    ('cat', pipeline_categorico, variables_categoricas),
])

# Preprocessing + random forest with balanced class weights and a fixed seed.
pipeline_modelo = Pipeline([
    ('preprocess', pipeline_completo),
    ('rf', RandomForestClassifier(random_state=0, class_weight="balanced")),
])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd581d16b90>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat'...
                 RandomForestClassifier(bootstrap=True, ccp_

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([2.65647292, 2.51803756, 2.39808679, 2.47354674, 2.47565603]),
 'score_time': array([0.35200453, 0.33567739, 0.3027122 , 0.30561471, 0.37091136]),
 'test_f1': array([0.32018561, 0.32821724, 0.31153389, 0.32328106, 0.35111111]),
 'test_roc_auc': array([0.76488503, 0.76032845, 0.75850802, 0.76109193, 0.75566086])}

## KNN

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected features
X_t = X_train[variables_categoricas + variables_numericas]

# Numeric branch: standardize each feature.
pipeline_numerico = Pipeline([
    ('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
    ('standard_scaler', StandardScaler()),
])

# Categorical branch: no imputation, "unknown" stays as its own category.
# handle_unknown="ignore" encodes categories unseen at fit time as all zeros
# instead of raising inside a cross-validation fold.
pipeline_categorico = Pipeline([
    ('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
    ('cat', OneHotEncoder(handle_unknown="ignore")),
])

pipeline_completo = ColumnTransformer([
    ('num', pipeline_numerico, variables_numericas),
    ('cat', pipeline_categorico, variables_categoricas),
])

# Preprocessing + k-nearest-neighbours classifier with default settings.
pipeline_modelo = Pipeline([
    ('preprocess', pipeline_completo),
    ('knn', KNeighborsClassifier()),
])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd581cd4c50>),
                                                                  ('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                  

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([0.36779571, 0.36578655, 0.33474255, 0.37986541, 0.38184404]),
 'score_time': array([ 9.78636217,  8.91680884,  9.06589484, 10.27293444,  9.09081459]),
 'test_f1': array([0.31336406, 0.33179724, 0.32590856, 0.3087886 , 0.35386338]),
 'test_roc_auc': array([0.70310739, 0.70060157, 0.69511615, 0.70793873, 0.71124455])}

## Modelos Tree Based

### XGBOOST

In [None]:
import xgboost as xgb

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected features
X_t = X_train[variables_categoricas + variables_numericas]

# Numeric branch: column selection only (no scaling or binning in this pipeline).
pipeline_numerico = Pipeline([
    ('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
])

# Categorical branch: no imputation, "unknown" stays as its own category.
# handle_unknown="ignore" encodes categories unseen at fit time as all zeros
# instead of raising inside a cross-validation fold.
pipeline_categorico = Pipeline([
    ('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
    ('cat', OneHotEncoder(handle_unknown="ignore")),
])

pipeline_completo = ColumnTransformer([
    ('num', pipeline_numerico, variables_numericas),
    ('cat', pipeline_categorico, variables_categoricas),
])

# The classifier is built from the directly imported XGBClassifier class, so
# re-running this cell works even if the name `xgb` has been rebound later in
# the session.
pipeline_modelo = Pipeline([
    ('preprocess', pipeline_completo),
    ('xgb', XGBClassifier(seed=0)),
])

In [None]:
pipeline_modelo.fit(X_t, y_train)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7f2164aad250>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat'...
                 XGBClassifier(base_score=0.5, booster='gbtr

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([2.4715898 , 2.42720032, 2.41386509, 2.43215537, 2.42058063]),
 'score_time': array([0.08356142, 0.08234549, 0.08738708, 0.08313775, 0.07965589]),
 'test_f1': array([0.27576975, 0.30406291, 0.29842932, 0.30440587, 0.32459426]),
 'test_roc_auc': array([0.80054878, 0.79947077, 0.78987088, 0.79740112, 0.78659637])}

In [None]:
# Feature importances of the fitted XGBoost step (index 1 of the pipeline).
# The array is positional over the preprocessed matrix; the ColumnTransformer
# lists 'num' before 'cat', so presumably the numeric features come first,
# followed by the one-hot columns of the categorical ones (TODO confirm and
# attach feature names).
pipeline_modelo[1].feature_importances_    # original author's note: this output is hard to interpret

array([0.02405806, 0.01746204, 0.0136159 , 0.07061283, 0.22140473,
       0.0020534 , 0.03583425, 0.0099582 , 0.        , 0.        ,
       0.02186744, 0.        , 0.01123115, 0.        , 0.01450737,
       0.0167304 , 0.01276967, 0.00734479, 0.        , 0.01511674,
       0.0107288 , 0.01147832, 0.00606717, 0.01080613, 0.01855494,
       0.        , 0.0118583 , 0.01032898, 0.01180607, 0.04529954,
       0.        , 0.        , 0.01107221, 0.01004251, 0.01168574,
       0.01074075, 0.        , 0.00692597, 0.04348093, 0.        ,
       0.03862974, 0.        , 0.235927  ], dtype=float32)

In [None]:
# Preprocessing only (no model): fit the column transformer on the training
# features and transform them, then apply the already-fitted transformer to
# the validation features.
train = pipeline_completo.fit_transform(X=X_t)
val = pipeline_completo.transform(X=X_v)

In [None]:
# Baseline XGBoost classifier (only the seed is set; every other
# hyperparameter keeps its default), trained on the preprocessed training data.
#
# The class is reached through the fully qualified module name because this
# cell rebinds `xgb` (the alias created by `import xgboost as xgb`) to the
# model instance. Writing `xgb.XGBClassifier(...)` here would raise
# AttributeError as soon as the cell is executed a second time.
# NOTE: the name `xgb` is kept for the model because later cells call
# `xgb.predict(...)` and pass `xgb` to GridSearchCV.
import xgboost

xgb = xgboost.XGBClassifier(seed=0)
xgb.fit(train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Print the classification report of the `xgb` model for the training split
# first and the validation split second.
for _titulo, _X, _y in (
    ("MÉTRICAS CONJUNTO DE TRAIN", train, y_train),
    ("MÉTRICAS CONJUNTO DE VALIDACIÓN", val, y_val),
):
    print(_titulo)
    print(classification_report(_y, xgb.predict(_X)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     23390
           1       0.72      0.21      0.32      2970

    accuracy                           0.90     26360
   macro avg       0.81      0.60      0.64     26360
weighted avg       0.89      0.90      0.88     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      5848
           1       0.67      0.21      0.32       742

    accuracy                           0.90      6590
   macro avg       0.79      0.60      0.63      6590
weighted avg       0.88      0.90      0.87      6590



In [None]:
# Hyperparameter optimisation
# Grid of parameter values explored by the search
params = {
    'objective': ["binary:logistic", "binary:hinge"],
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [2, 4, 6, 7, 8, 10],
    'alpha': [2, 3, 5, 7],
    "n_estimators": [5, 7, 10],
}

# Fresh, unfitted estimator to be tuned.
# At this point `xgb` is already bound to a fitted XGBClassifier (an earlier
# cell rebinds the module alias), so `xgb.XGBClassifier(...)` raises
# AttributeError. Going through the fully qualified module name works no
# matter what `xgb` currently refers to.
import xgboost

xgb = xgboost.XGBClassifier(seed=0)

AttributeError: ignored

In [None]:
# Parameter search: exhaustive grid search over `params` with 5-fold
# cross-validation, ranked by F1 and run on all available cores.
# refit=True retrains the best configuration on the data passed to fit().
cv_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='f1',
    cv=5,
    refit=True,
    n_jobs=-1,
)
cv_xgb.fit(train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=0, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [2, 3, 5, 7],
                         'learning_rate': [0.1, 0.2, 0.3],
                         'max_depth': [2, 4, 6, 7, 8, 10],
                

In [None]:
# Display the hyperparameter combination selected by the grid search.
cv_xgb.best_params_

{'alpha': 2,
 'learning_rate': 0.1,
 'max_depth': 7,
 'n_estimators': 7,
 'objective': 'binary:hinge'}

In [None]:
# Re-import so that `xgb` refers to the xgboost module again: an earlier cell
# rebound the name to an XGBClassifier instance.
import xgboost as xgb

In [None]:
# Train the best version of the model found by the search.
xgb_best = xgb.XGBClassifier(
    objective='binary:hinge',
    n_estimators=7,
    max_depth=7,
    learning_rate=0.1,
    alpha=2,
    seed=0,
)
xgb_best.fit(train, y_train)

XGBClassifier(alpha=2, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=7, n_jobs=1,
              nthread=None, objective='binary:hinge', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Classification report of `xgb_best` on the training split, then on the
# validation split (header line followed by the report).
print(
    "MÉTRICAS CONJUNTO DE TRAIN",
    classification_report(y_train, xgb_best.predict(train)),
    sep="\n",
)
print(
    "MÉTRICAS CONJUNTO DE VALIDACIÓN",
    classification_report(y_val, xgb_best.predict(val)),
    sep="\n",
)

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     23390
           1       0.44      0.58      0.50      2970

    accuracy                           0.87     26360
   macro avg       0.69      0.74      0.71     26360
weighted avg       0.89      0.87      0.88     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      5848
           1       0.43      0.58      0.50       742

    accuracy                           0.87      6590
   macro avg       0.69      0.74      0.71      6590
weighted avg       0.89      0.87      0.88      6590



### LightGBM

In [None]:
# Feature groups used by the LightGBM model.
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Keep only the selected variables
X_t = X_train[variables_categoricas + variables_numericas]

# Numeric branch: the columns are passed through as selected. Scaling and
# binning are left here, disabled, as alternatives.
pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      # strategy="uniform"
                            ])

# Categorical branch: one-hot encode every selected categorical column.
pipeline_categorico = Pipeline([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      # optional: without it, "unknown" is kept as one more category
                                ('cat', OneHotEncoder())])

# Route each feature group to its branch.
pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                       ('cat', pipeline_categorico, variables_categoricas),
                                      ])

# `lgbm` is the lightgbm *module* (`import lightgbm as lgbm`), so calling
# `lgbm(...)` raises "TypeError: 'module' object is not callable". The
# estimator is the scikit-learn compatible class `lgbm.LGBMClassifier`, whose
# seed parameter is `random_state`.
pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('lgbm', lgbm.LGBMClassifier(random_state=0))])

In [None]:
# Display the pipeline object to inspect its steps.
pipeline_modelo

In [None]:
# 5-fold cross-validation of `pipeline_modelo` on the selected training
# features, reporting F1, ROC AUC, precision and recall for every fold.
cross_validate(
    estimator=pipeline_modelo,
    X=X_t,
    y=y_train,
    scoring=('f1', 'roc_auc', "precision", "recall"),
    cv=5,
)