In [None]:
pip install pycaret

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgbm

In [None]:
import pycaret.classification

In [2]:
# Opción para ver todas las columnas del dataset en el notebook
pd.set_option('display.max_columns', 50)

# Cargamos los datos

In [3]:
# Leemos el dataset con la función de pandas "read_csv"
key = "data/bank-additional-full.csv"
df = pd.read_csv(key, sep=";")

In [4]:
#Eliminamos los outliers de Campaign:

q975_campaign=df.campaign.quantile(0.975)
df.drop(df[df.campaign >= q975_campaign].index, inplace = True)

In [5]:
df.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


# División en grupo de test y train

In [6]:
# Reemplazamos la columna y (target) por 1 y 0
df.y = df.y.replace('yes', 1)
df.y = df.y.replace('no', 0)

In [7]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.y, random_state=42)

### Análisis global con PyCaret

In [None]:
from pycaret.classification import *
clf = setup(data = df_train, target ="y" )

In [None]:
#best = compare_models()
best = compare_models(sort = 'f1') #default is 'Accuracy', LO CAMBIO POR F1 QUE ES LA QUE TOMAMOS COMO REFERENCIA

### Continúo división train, test y validation

In [8]:
X = df.drop(columns='y')
y = df.y

In [9]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) 

In [10]:
X_temp.shape, y_temp.shape, X_test.shape, y_test.shape

((32075, 20), (32075,), (8019, 20), (8019,))

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42)

In [12]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((25660, 20), (25660,), (6415, 20), (6415,))

# Pre-procesamiento

In [None]:
#Todas las variables del dataset
variables_categoricas_original = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
variables_numericas_original = ['age', 'duration', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

In [None]:
#Las variables que identificamos como relevantes
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

In [18]:
class SelectColumnsTransformer():
    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        return self

In [None]:
#Aplicamos las transformaciones previas a los conjuntos de Train y Validation
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]     

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             #('standard_scaler', StandardScaler()),
                              ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                  ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                       ('cat', pipeline_categorico, variables_categoricas)
                                      ])


train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.fit_transform(X_v)

# Métricas

Dado el desbalance de casos con que cuenta este dataset, las métricas a tomar en consideración para el análisis son: AUC y F1.

# Testeo por modelos

## Decision Tree

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             #('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('dt', DecisionTreeClassifier(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo.fit(X_t, y_train)

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('precision','recall','f1', 'roc_auc'))

Análisis de estos resultados sobre el conjunto de validación

In [None]:
train=pipeline_completo.fit_transform(X_t)

In [None]:
dt=DecisionTreeClassifier(random_state=0, class_weight="balanced")
dt.fit(train, y_train)

In [None]:
X_v=X_val[variables_categoricas + variables_numericas]
val=pipeline_completo.fit_transform(X_v)

In [None]:
y_val_pred=dt.predict(val)
print("VALIDACIÓN")
print(classification_report(y_val, y_val_pred))

## Logistic Regression

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('lr', LogisticRegression(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

## SVM

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('svm', SVC(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

## Naive Bayes

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             ("kbins_discretizer", KBinsDiscretizer(n_bins=4, encode="ordinal", strategy="uniform")),   #strategy="uniform"
                             ('bins_cat', OneHotEncoder())
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())
                                 ])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('nb', ComplementNB())])

#The Complement Naive Bayes classifier was designed to correct the “severe assumptions” made by the standard Multinomial Naive Bayes classifier. It is particularly suited for imbalanced data sets.
#En el pre-procesamiento transformé todos los atributos en categóricos, porque es el requerimiento del tipo de modelo

In [None]:
pipeline_modelo

In [None]:
#Cross validation con toda la estimación sobre X_train
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

In [None]:
#Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [None]:
nb=ComplementNB()
nb.fit(train, y_train)

In [None]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, nb.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, nb.predict(val)))

In [None]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'alpha':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.3, 1.5],
        'fit_prior':[True, False],
        'norm':[True,False]
       }

nb=ComplementNB()

In [None]:
#Búsqueda de parámetros
cv_nb = GridSearchCV(nb, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
cv_nb.fit(train, y_train)

In [None]:
cv_nb.best_params_

In [None]:
#Entrenamiento de la mejor versión encontrada del modelo
nb_best = ComplementNB(alpha=0.8, fit_prior=True, norm=True)
nb_best.fit(train, y_train)

In [None]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, nb_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, nb_best.predict(val)))

## Random Forest

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('rf', RandomForestClassifier(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

## KNN

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('knn', KNeighborsClassifier())])

In [None]:
pipeline_modelo

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

## Modelos Tree Based

### XGBOOST

In [None]:
import xgboost as xgb

In [30]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('xgb', xgb.XGBClassifier(seed=0))])

In [31]:
pipeline_modelo.fit(X_t, y_train)





Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000094C3E95340>)]),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat',
                                                  Pipeline(steps=[('select_categoric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000094E9C35220>),
                                                                  (...
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importan

In [32]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))





















{'fit_time': array([2.7139256 , 2.4607482 , 2.46525741, 2.43673253, 2.39470148]),
 'score_time': array([0.08105874, 0.07805586, 0.07605433, 0.07905602, 0.07505345]),
 'test_f1': array([0.35813953, 0.35730858, 0.36792453, 0.3685446 , 0.37641723]),
 'test_roc_auc': array([0.77235912, 0.77015776, 0.77837787, 0.78274101, 0.77563957])}

In [None]:
pipeline_modelo[1].feature_importances_    #No se entiende este 

In [33]:
#Solo Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)



In [34]:
xgb=xgb.XGBClassifier(seed=0)
xgb.fit(train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [35]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     23390
           1       0.90      0.42      0.58      2970

    accuracy                           0.93     26360
   macro avg       0.91      0.71      0.77     26360
weighted avg       0.93      0.93      0.92     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.98      0.95      5848
           1       0.62      0.27      0.38       742

    accuracy                           0.90      6590
   macro avg       0.77      0.63      0.66      6590
weighted avg       0.88      0.90      0.88      6590



In [37]:
import xgboost as xgb

In [38]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'objective':["binary:logistic","binary:hinge","binary:logitraw"],
        'learning_rate':[ 0.1,0.2,0.3],
        'max_depth':[2,4, 6, 7, 8, 10],
        'alpha':[2, 3, 5, 7],
        "n_estimators":[5, 7, 10]
       }
xg=xgb.XGBClassifier()

In [39]:
#Búsqueda de parámetros
cv_xgb = GridSearchCV(xg, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
cv_xgb.fit(train, y_train)



GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, va

In [40]:
cv_xgb.best_params_

{'alpha': 7,
 'learning_rate': 0.1,
 'max_depth': 8,
 'n_estimators': 10,
 'objective': 'binary:hinge'}

In [None]:
import xgboost as xgb

In [41]:
#Entrenamiento de la mejor versión encontrada del modelo
xgb_best = xgb.XGBClassifier(seed=0, alpha= 7, learning_rate= 0.1, max_depth= 8, n_estimators=10, objective='binary:hinge')
xgb_best.fit(train, y_train)

XGBClassifier(alpha=7, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10, n_jobs=4, num_parallel_tree=1,
              objective='binary:hinge', random_state=0, reg_alpha=7,
              reg_lambda=1, scale_pos_weight=None, seed=0, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     23390
           1       0.44      0.56      0.49      2970

    accuracy                           0.87     26360
   macro avg       0.69      0.73      0.71     26360
weighted avg       0.89      0.87      0.88     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      5848
           1       0.44      0.57      0.49       742

    accuracy                           0.87      6590
   macro avg       0.69      0.74      0.71      6590
weighted avg       0.89      0.87      0.88      6590



In [None]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'objective':["binary:logistic","binary:hinge","binary:logitraw"],
        'learning_rate':[0.05,0.1,0.15,0.2,0.25,0.3,0.4,0.5],
        'max_depth':[2,3, 4,5, 6, 7, 8, 9, 10, 12, 15],
        'alpha':[0, 0.5, 1, 2, 3, 5, 6, 7, 8, 9, 10],
        'lambda':[0.5, 1, 2, 3, 5],
        "n_estimators":[3, 5, 6, 7, 8, 9, 10, 15],
        "booster":["gbtree","dart"],
        "gamma":[0.5,1,2,5, 7, 8],
        "tree_method":["auto","exact","approx","hist"]
       }
xg=xgb.XGBClassifier()

In [None]:
#Búsqueda de parámetros
rcv_xgb = RandomizedSearchCV(xg, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
rcv_xgb.fit(train, y_train)

In [None]:
rcv_xgb.best_params_

In [None]:
#Entrenamiento de la mejor versión encontrada del modelo
xgb_best_r = xgb.XGBClassifier(seed=0, tree_method= 'approx', objective="binary:hinge", n_estimators=3, max_depth= 6,
                               learning_rate= 0.25, reg_lambda=3, gamma= 8, booster="dart", 
                               alpha= 0 )   #el lambda por default es 1
xgb_best_r.fit(train, y_train)

In [None]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_best_r.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_best_r.predict(val)))

### LigthGBM

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('lgbm', lgbm(seed=0))])

In [None]:
pipeline_modelo

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc', "precision", "recall"))

## Prueba con las variables del PCA

In [None]:
# Leemos el dataset con la función de pandas "read_csv"
key = "data/final_df.csv"
df_pca = pd.read_csv(key, sep=",")

In [None]:
df_pca.head(2)

In [None]:
df_pca.info()

In [None]:
df_pca.columns

In [None]:
X_pca = df_pca.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'age', 'job', 'marital', 'education',
       'default', 'housing', 'loan', 'contact', 'month', 'day_of_week',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'job_num', 'marital_num', 'education_num', 'loan_num', 'housing_num',
       'default_num'])
y_pca = df_pca.y

In [None]:
X_pca_temp, X_pca_test, y_pca_temp, y_pca_test = train_test_split(X_pca, y_pca, test_size=0.2, stratify=y, random_state=42) 

# Probando XGBoost con distintas variables

In [None]:
#Todas las variables del dataset
variables_categoricas_original = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
variables_numericas_original = ['age', 'campaign', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

Todas las variables

In [43]:
#Todas las variables del dataset menos pdays y duration, que se definió que no debían estar
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

In [44]:
# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

In [45]:
#Solo Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [46]:
xgb_var=xgb.XGBClassifier(seed=0)
xgb_var.fit(train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [47]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_var.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_var.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.93      1.00      0.96     23390
           1       0.92      0.45      0.60      2970

    accuracy                           0.93     26360
   macro avg       0.93      0.72      0.78     26360
weighted avg       0.93      0.93      0.92     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5848
           1       0.61      0.27      0.38       742

    accuracy                           0.90      6590
   macro avg       0.76      0.63      0.66      6590
weighted avg       0.88      0.90      0.88      6590



Saco las variables de contexto que estaban muy correlacionadas

In [48]:
#Todas las variables del dataset menos pdays y duration, menos las de contexto que estaban muy correlacionadas
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

In [49]:
# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

In [50]:
#Solo Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [51]:
xgb_var=xgb.XGBClassifier(seed=0)
xgb_var.fit(train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [52]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_var.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_var.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     23390
           1       0.90      0.43      0.58      2970

    accuracy                           0.93     26360
   macro avg       0.91      0.71      0.77     26360
weighted avg       0.93      0.93      0.92     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5848
           1       0.60      0.26      0.37       742

    accuracy                           0.90      6590
   macro avg       0.76      0.62      0.66      6590
weighted avg       0.88      0.90      0.88      6590



Ver otras combinaciones de variables

In [133]:
#Todas las variables del dataset menos pdays y duration, menos las de contexto que estaban muy correlacionadas
#saqué month, luego default (volvió al resultado mejor), saco housing (mejoró), 
#saco previous (empeora)
#saco loan (empeora)
#saco marital (queda igual)
#saco day_of_week (empeora)
#saco education (empeora)
#saco job (empeora)
#saco contact (empeora poquito)
#saco poutcome (empeora poquito)
#saco campaign (empeora poquito)
#saco cons idx (empeora poquito)   

variables_categoricas = ['job', 'education', 'contact','loan', 'day_of_week', 'poutcome']
variables_numericas = ['age', 'campaign','previous', 'cons.conf.idx', 'euribor3m']

In [144]:
# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

In [135]:
#Solo Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [136]:
xgb_var=xgb.XGBClassifier(seed=0)
xgb_var.fit(train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [137]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_var.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_var.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     23390
           1       0.85      0.41      0.55      2970

    accuracy                           0.93     26360
   macro avg       0.89      0.70      0.76     26360
weighted avg       0.92      0.93      0.91     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      5848
           1       0.54      0.25      0.34       742

    accuracy                           0.89      6590
   macro avg       0.73      0.61      0.64      6590
weighted avg       0.87      0.89      0.87      6590



Optimización de parámetros considerando la mejor combinación de variables

In [145]:
variables_categoricas = ['job', 'education', 'contact','loan', 'day_of_week', 'poutcome']
variables_numericas = ['age', 'campaign','previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

In [146]:
#Solo Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [147]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'objective':["binary:logistic","binary:hinge","binary:logitraw"],
        'learning_rate':[ 0.1,0.2,0.3],
        'max_depth':[2,4, 6, 7, 8, 10],
        'alpha':[2, 3, 5, 7],
        "n_estimators":[5, 7, 10]
       }
xg=xgb.XGBClassifier()

In [148]:
#Búsqueda de parámetros
cv_xgb = GridSearchCV(xg, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
cv_xgb.fit(train, y_train)



GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, va

In [149]:
cv_xgb.best_params_   ### no es muy distinto al mejor modelo encontrado con la otra especificación

{'alpha': 7,
 'learning_rate': 0.1,
 'max_depth': 8,
 'n_estimators': 10,
 'objective': 'binary:hinge'}

In [150]:
#Entrenamiento de la mejor versión encontrada del modelo
xgb_best = xgb.XGBClassifier(seed=0, alpha= 7, learning_rate= 0.1, max_depth= 8, n_estimators=10, objective='binary:hinge')
xgb_best.fit(train, y_train)

XGBClassifier(alpha=7, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10, n_jobs=4, num_parallel_tree=1,
              objective='binary:hinge', random_state=0, reg_alpha=7,
              reg_lambda=1, scale_pos_weight=None, seed=0, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [151]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     23390
           1       0.44      0.56      0.49      2970

    accuracy                           0.87     26360
   macro avg       0.69      0.73      0.71     26360
weighted avg       0.89      0.87      0.88     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      5848
           1       0.44      0.56      0.49       742

    accuracy                           0.87      6590
   macro avg       0.69      0.74      0.71      6590
weighted avg       0.89      0.87      0.88      6590



**Feature explainability**

In [154]:
variables_categoricas = ['job', 'education', 'contact','loan', 'day_of_week', 'poutcome']
variables_numericas = ['age', 'campaign','previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('xgb', xgb.XGBClassifier(seed=0, alpha= 7, learning_rate= 0.1, max_depth= 8, n_estimators=10, objective='binary:hinge'))])

In [167]:
pipeline_modelo.fit(X_t, y_train)



Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000094EBADFAC0>)]),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat',
                                                  Pipeline(steps=[('select_categoric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000094EB834DC0>),
                                                                  (...
                               importance_type='gain',
                               interaction_constraints=

In [156]:
val = pipeline_completo.transform(X_v)

In [169]:
train.shape

(26360, 38)

In [None]:
#Obtener los nombres de las variables
# Si realizamos one hot encoding, vamos a tener el problema de que se incrementan el numero de features y necesitamos la nueva lista.
numeric_features = variables_numericas
cat_features = pipeline_modelo.named_steps['preprocess'].transformers_[1][1].get_feature_names(variables_categoricas)

In [None]:
onehot_columns = np.array(cat_features)
numeric_features_list = np.array(numeric_features)
numeric_features_list = np.append(numeric_features_list, onehot_columns)

In [None]:
# Es necesario ordenar las los valores del feature importance (utilizamos argsort para tener el orden de los indices)
sorted_idx = pipeline_modelo[1].feature_importances_.argsort()
plt.barh(numeric_features_list[sorted_idx], pipeline_modelo[1].feature_importances_[sorted_idx])
plt.xlabel("Xgboost Feature Importance")
plt.show()

In [161]:
pip install eli5

Collecting eli5
  Downloading eli5-0.11.0-py2.py3-none-any.whl (106 kB)
Installing collected packages: eli5
Successfully installed eli5-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [162]:
#Probando la otra alternativa
import eli5

In [164]:
onehot_columns = cat_features          #depende de paso anterior que no salió
features_list = list(numeric_features)
features_list.extend(onehot_columns)

In [165]:
eli5.explain_weights(pipeline_modelo[1], top=50, feature_names=features_list)

TypeError: 'NoneType' object is not iterable

## **LGBM**

In [7]:
##Pre-procesamiento sobre los conjuntos de Train y Validation
variables_categoricas = ['job', 'marital','education','housing', 'loan','contact','poutcome']
variables_numericas = ['age', 'campaign','previous','euribor3m','cons.conf.idx' ]

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             #('pca', PCA(n_components=4))
                            ])

pipeline_categorico = Pipeline ([('imputer', SimpleImputer(strategy='most_frequent', missing_values = None)),
                                   ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                       ('cat', pipeline_categorico, variables_categoricas)
                                      ])

train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.fit_transform(X_v)

NameError: name 'X_train' is not defined

In [22]:
import lightgbm as lgbm

In [17]:
lgb_clf = lgbm.LGBMClassifier()
lgb_clf.fit(train, y_train)

LGBMClassifier()

In [18]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, lgb_clf.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, lgb_clf.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     23390
           1       0.78      0.33      0.46      2970

    accuracy                           0.91     26360
   macro avg       0.85      0.66      0.71     26360
weighted avg       0.91      0.91      0.90     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5848
           1       0.62      0.24      0.34       742

    accuracy                           0.90      6590
   macro avg       0.77      0.61      0.64      6590
weighted avg       0.88      0.90      0.88      6590



In [21]:
lgb_clf = lgbm.LGBMClassifier(objective="binary", class_weight="balanced")
lgb_clf.fit(train, y_train)

LGBMClassifier(class_weight='balanced', objective='binary')

In [22]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, lgb_clf.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, lgb_clf.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.96      0.87      0.91     23390
           1       0.41      0.70      0.51      2970

    accuracy                           0.85     26360
   macro avg       0.68      0.79      0.71     26360
weighted avg       0.90      0.85      0.87     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.95      0.85      0.89      5848
           1       0.34      0.64      0.45       742

    accuracy                           0.82      6590
   macro avg       0.65      0.74      0.67      6590
weighted avg       0.88      0.82      0.84      6590



In [23]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'boosting_type':["gbdt","dart","goss","rf"],
        'learning_rate':[ 0.1,0.2,0.3,0.5,0.7,0.8,1],
        'max_depth':[1,2,4, 6, 7, 8, 10],
        'reg_alpha':[0, 2, 3, 5],
        "n_estimators":[5, 7, 10, 50, 100, 200, 500]
       }
lgb_clf = lgbm.LGBMClassifier(objective="binary", class_weight="balanced")

In [24]:
#Búsqueda de parámetros
cv_lgb = GridSearchCV(lgb_clf, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
cv_lgb.fit(train, y_train)

GridSearchCV(cv=5,
             estimator=LGBMClassifier(class_weight='balanced',
                                      objective='binary'),
             n_jobs=-1,
             param_grid={'boosting_type': ['gbdt', 'dart', 'goss', 'rf'],
                         'learning_rate': [0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 1],
                         'max_depth': [1, 2, 4, 6, 7, 8, 10],
                         'n_estimators': [5, 7, 10, 50, 100, 200, 500],
                         'reg_alpha': [0, 2, 3, 5]},
             scoring='f1')

In [25]:
cv_lgb.best_params_

{'boosting_type': 'dart',
 'learning_rate': 0.2,
 'max_depth': 8,
 'n_estimators': 100,
 'reg_alpha': 3}

In [20]:
from sklearn.decomposition import PCA

In [25]:
##Pre-procesamiento sobre los conjuntos de Train y Validation
variables_categoricas = ['job', 'marital','education','housing', 'loan','contact','poutcome']
variables_numericas= ['age', 'campaign', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             ('pca', PCA(n_components=4))
                            ])

pipeline_categorico = Pipeline ([('imputer', SimpleImputer(strategy='most_frequent', missing_values = None)),
                                   ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                       ('cat', pipeline_categorico, variables_categoricas)
                                      ])

train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.fit_transform(X_v)

In [26]:
#Entrenamiento de la mejor versión encontrada del modelo
lgb_best= lgbm.LGBMClassifier(objective="binary", class_weight="balanced", boosting_type= 'dart',
                     learning_rate= 0.2, max_depth= 8, n_estimators= 100, reg_alpha= 3 )
lgb_best.fit(train, y_train)

LGBMClassifier(boosting_type='dart', class_weight='balanced', learning_rate=0.2,
               max_depth=8, objective='binary', reg_alpha=3)

In [27]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, lgb_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, lgb_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.96      0.86      0.91     22715
           1       0.40      0.70      0.51      2945

    accuracy                           0.85     25660
   macro avg       0.68      0.78      0.71     25660
weighted avg       0.89      0.85      0.86     25660

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.74      0.83      5679
           1       0.24      0.64      0.35       736

    accuracy                           0.73      6415
   macro avg       0.59      0.69      0.59      6415
weighted avg       0.86      0.73      0.77      6415



In [21]:
##Pre-procesamiento sobre los conjuntos de Train y Validation
variables_categoricas = ['job', 'marital','education','housing', 'loan','contact','poutcome']
variables_numericas= ['age', 'campaign', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             ('pca', PCA(n_components=4))
                            ])

pipeline_categorico = Pipeline ([('imputer', SimpleImputer(strategy='most_frequent', missing_values = None)),
                                   ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                       ('cat', pipeline_categorico, variables_categoricas)
                                      ])

train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.fit_transform(X_v)

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
#bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)

In [29]:


search_params = {
'base_estimator__n_neighbors' : [4,5,6,7, 10],
'base_estimator__weights': ['uniform', 'distance'],
'base_estimator__algorithm': ['ball_tree', 'kd_tree', 'brute'],
'base_estimator__p': [1,2],
'n_estimators' : [10, 50, 100, 200],
'max_samples' : [ 0.5, 0.6, 0.7,0.8],
'max_features': [0.5, 0.6,0.7,0.8,0.9]
}

bagging_clf = RandomizedSearchCV(BaggingClassifier(KNeighborsClassifier(), random_state = 42), search_params, cv=5, scoring='f1', n_jobs=-1)
bagging_clf.fit(train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=BaggingClassifier(base_estimator=KNeighborsClassifier(),
                                               random_state=42),
                   n_jobs=-1,
                   param_distributions={'base_estimator__algorithm': ['ball_tree',
                                                                      'kd_tree',
                                                                      'brute'],
                                        'base_estimator__n_neighbors': [4, 5, 6,
                                                                        7, 10],
                                        'base_estimator__p': [1, 2],
                                        'base_estimator__weights': ['uniform',
                                                                    'distance'],
                                        'max_features': [0.5, 0.6, 0.7, 0.8,
                                                         0.9],
                    

In [30]:
bagging_clf.best_params_

{'n_estimators': 10,
 'max_samples': 0.8,
 'max_features': 0.8,
 'base_estimator__weights': 'distance',
 'base_estimator__p': 2,
 'base_estimator__n_neighbors': 5,
 'base_estimator__algorithm': 'brute'}

In [24]:
bagging_clf_best= BaggingClassifier(KNeighborsClassifier(), n_estimators= 10, max_samples= 0.8,
                 max_features= 0.8,random_state = 42)
bagging_clf_best.fit(train,y_train)

BaggingClassifier(base_estimator=KNeighborsClassifier(), max_features=0.8,
                  max_samples=0.8, random_state=42)

In [26]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, bagging_clf_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, bagging_clf_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     22715
           1       0.79      0.30      0.44      2945

    accuracy                           0.91     25660
   macro avg       0.85      0.65      0.69     25660
weighted avg       0.90      0.91      0.89     25660

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5679
           1       0.56      0.19      0.28       736

    accuracy                           0.89      6415
   macro avg       0.73      0.58      0.61      6415
weighted avg       0.86      0.89      0.87      6415



In [28]:
bagging_clf_best= BaggingClassifier(KNeighborsClassifier(weights= 'distance', p=2,n_neighbors=5, algorithm="brute"), n_estimators= 10, max_samples= 0.8,
                 max_features= 0.8,random_state = 42)
bagging_clf_best.fit(train,y_train)

BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='brute',
                                                      weights='distance'),
                  max_features=0.8, max_samples=0.8, random_state=42)

In [29]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, bagging_clf_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, bagging_clf_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     22715
           1       0.98      0.81      0.88      2945

    accuracy                           0.98     25660
   macro avg       0.98      0.90      0.94     25660
weighted avg       0.98      0.98      0.97     25660

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5679
           1       0.53      0.20      0.29       736

    accuracy                           0.89      6415
   macro avg       0.72      0.59      0.61      6415
weighted avg       0.86      0.89      0.86      6415

