In [None]:
pip install pycaret

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgbm

In [2]:
import pycaret.classification

In [3]:
# Opción para ver todas las columnas del dataset en el notebook
pd.set_option('display.max_columns', 50)

# Cargamos los datos

In [2]:
# Leemos el dataset con la función de pandas "read_csv"
key = "data/bank-additional-full.csv"
df = pd.read_csv(key, sep=";")

In [3]:
df.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


# División en grupo de test y train

In [4]:
# Reemplazamos la columna y (target) por 1 y 0
df.y = df.y.replace('yes', 1)
df.y = df.y.replace('no', 0)

In [5]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.y, random_state=42)

### Análisis global con PyCaret

In [8]:
from pycaret.classification import *
clf = setup(data = df_train, target ="y" )

Unnamed: 0,Description,Value
0,session_id,8427
1,Target,y
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(32950, 21)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,11
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
#best = compare_models()
best = compare_models(sort = 'f1') #default is 'Accuracy', LO CAMBIO POR F1 QUE ES LA QUE TOMAMOS COMO REFERENCIA

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9148,0.9475,0.5321,0.6437,0.5823,0.5353,0.5385,0.476
xgboost,Extreme Gradient Boosting,0.9126,0.9443,0.5285,0.6299,0.5741,0.5259,0.5288,3.521
catboost,CatBoost Classifier,0.9143,0.9463,0.5181,0.6449,0.5739,0.5269,0.5312,9.512
gbc,Gradient Boosting Classifier,0.9132,0.9442,0.501,0.6447,0.563,0.5157,0.5212,2.33
lda,Linear Discriminant Analysis,0.9075,0.9318,0.4916,0.6055,0.5421,0.4913,0.4949,0.339
knn,K Neighbors Classifier,0.9054,0.8698,0.4769,0.5956,0.5288,0.477,0.4811,0.829
rf,Random Forest Classifier,0.9105,0.9375,0.4345,0.6473,0.5195,0.4723,0.484,1.341
dt,Decision Tree Classifier,0.8884,0.7251,0.515,0.5008,0.5073,0.4444,0.4447,0.139
lr,Logistic Regression,0.9079,0.9284,0.3906,0.6442,0.4855,0.4383,0.4553,3.335
ada,Ada Boost Classifier,0.9071,0.935,0.3809,0.642,0.4775,0.4302,0.4482,0.639


### Continúo división train, test y validation

In [6]:
X = df.drop(columns='y')
y = df.y

In [7]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) 

In [8]:
X_temp.shape, y_temp.shape, X_test.shape, y_test.shape

((32950, 20), (32950,), (8238, 20), (8238,))

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42)

In [11]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((26360, 20), (26360,), (6590, 20), (6590,))

# Pre-procesamiento

In [None]:
#Todas las variables del dataset
variables_categoricas_original = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
variables_numericas_original = ['age', 'duration', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

In [None]:
#Las variables que identificamos como relevantes
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

In [12]:
class SelectColumnsTransformer():
    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        return self

In [None]:
#Aplicamos las transformaciones previas a los conjuntos de Train y Validation
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]     

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             #('standard_scaler', StandardScaler()),
                              ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                  ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                       ('cat', pipeline_categorico, variables_categoricas)
                                      ])


train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.fit_transform(X_v)

# Métricas

Dado el desbalance de casos con que cuenta este dataset, las métricas a tomar en consideración para el análisis son: AUC y F1.

# Testeo por modelos

## Decision Tree

In [13]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             #('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('dt', DecisionTreeClassifier(random_state=0, class_weight="balanced"))])

In [14]:
pipeline_modelo.fit(X_t, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000016772FA490>)]),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat',
                                                  Pipeline(steps=[('select_categoric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000016772FA400>),
                                                                  ('cat',
                                                                   OneHotEncoder())]),
                    

In [15]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('precision','recall','f1', 'roc_auc'))

{'fit_time': array([0.33122706, 0.36825967, 0.21813774, 0.22113824, 0.21715379]),
 'score_time': array([0.08506346, 0.04905415, 0.04905248, 0.05303812, 0.04703593]),
 'test_precision': array([0.32376396, 0.30529595, 0.31764706, 0.3       , 0.26911315]),
 'test_recall': array([0.34175084, 0.32996633, 0.31818182, 0.3030303 , 0.2962963 ]),
 'test_f1': array([0.33251433, 0.3171521 , 0.31791421, 0.30150754, 0.28205128]),
 'test_roc_auc': array([0.62598516, 0.61779635, 0.61627444, 0.60728455, 0.59757472])}

Análisis de estos resultados sobre el conjunto de validación

In [16]:
train=pipeline_completo.fit_transform(X_t)

In [17]:
dt=DecisionTreeClassifier(random_state=0, class_weight="balanced")
dt.fit(train, y_train)

DecisionTreeClassifier(class_weight='balanced', random_state=0)

In [18]:
X_v=X_val[variables_categoricas + variables_numericas]
val=pipeline_completo.fit_transform(X_v)

In [19]:
y_val_pred=dt.predict(val)
print("VALIDACIÓN")
print(classification_report(y_val, y_val_pred))

VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.90      0.91      5848
           1       0.29      0.33      0.31       742

    accuracy                           0.83      6590
   macro avg       0.60      0.61      0.61      6590
weighted avg       0.84      0.83      0.84      6590



## Logistic Regression

In [20]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('lr', LogisticRegression(random_state=0, class_weight="balanced"))])

In [21]:
pipeline_modelo

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x000000167751B5E0>),
                                                                  ('standard_scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat',
                                                  Pipeline(steps=[('select_categoric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x000000161D404220>),
         

In [22]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'fit_time': array([1.67040277, 0.614434  , 0.64744401, 0.67348075, 0.60243106]),
 'score_time': array([0.16711879, 0.05903745, 0.0740521 , 0.0670445 , 0.06805968]),
 'test_f1': array([0.38568019, 0.38275699, 0.37743006, 0.38307985, 0.3780037 ]),
 'test_roc_auc': array([0.77777022, 0.77820927, 0.76186962, 0.77274185, 0.76329329])}

## SVM

In [23]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('svm', SVC(random_state=0, class_weight="balanced"))])

In [24]:
pipeline_modelo

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x000000161F6275E0>)]),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat',
                                                  Pipeline(steps=[('select_categoric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000016772FAD90>),
                                                                  ('cat',
                                                                   OneHotEncoder())]),
                    

In [25]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([71.35735989, 66.5273931 , 73.93122721, 78.48146605, 66.56244445]),
 'score_time': array([13.16337395, 14.52135754, 12.10403323, 12.84412384, 10.2045126 ]),
 'test_f1': array([0.36551724, 0.36698033, 0.35686103, 0.36570429, 0.35428089]),
 'test_roc_auc': array([0.77124098, 0.76489672, 0.75543143, 0.75780878, 0.74833935])}

## Naive Bayes

In [26]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]
X_v = X_val[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             ("kbins_discretizer", KBinsDiscretizer(n_bins=4, encode="ordinal", strategy="uniform")),   #strategy="uniform"
                             ('bins_cat', OneHotEncoder())
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())
                                 ])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('nb', ComplementNB())])

#The Complement Naive Bayes classifier was designed to correct the “severe assumptions” made by the standard Multinomial Naive Bayes classifier. It is particularly suited for imbalanced data sets.
#En el pre-procesamiento transformé todos los atributos en categóricos, porque es el requerimiento del tipo de modelo

In [None]:
pipeline_modelo

In [27]:
#Cross validation con toda la estimación sobre X_train
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([0.28819966, 0.15611768, 0.34522104, 0.34424806, 0.25215054]),
 'score_time': array([0.05403757, 0.19313097, 0.19115925, 0.19313407, 0.05003476]),
 'test_f1': array([0.37112448, 0.38199181, 0.35883171, 0.37016575, 0.37328454]),
 'test_roc_auc': array([0.76807461, 0.77326943, 0.75415585, 0.76461152, 0.75317483])}

In [28]:
#Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [29]:
nb=ComplementNB()
nb.fit(train, y_train)

ComplementNB()

In [30]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, nb.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, nb.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.95      0.75      0.84     23390
           1       0.26      0.68      0.37      2970

    accuracy                           0.74     26360
   macro avg       0.60      0.71      0.60     26360
weighted avg       0.87      0.74      0.78     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.95      0.75      0.83      5848
           1       0.25      0.67      0.37       742

    accuracy                           0.74      6590
   macro avg       0.60      0.71      0.60      6590
weighted avg       0.87      0.74      0.78      6590



In [36]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'alpha':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.3, 1.5],
        'fit_prior':[True, False],
        'norm':[True,False]
       }

nb=ComplementNB()

In [37]:
#Búsqueda de parámetros
cv_nb = GridSearchCV(nb, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
cv_nb.fit(train, y_train)

GridSearchCV(cv=5, estimator=ComplementNB(), n_jobs=-1,
             param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                   1, 1.1, 1.3, 1.5],
                         'fit_prior': [True, False], 'norm': [True, False]},
             scoring='f1')

In [38]:
cv_nb.best_params_

{'alpha': 0.8, 'fit_prior': True, 'norm': True}

In [39]:
#Entrenamiento de la mejor versión encontrada del modelo
nb_best = ComplementNB(alpha=0.8, fit_prior=True, norm=True)
nb_best.fit(train, y_train)

ComplementNB(alpha=0.8, norm=True)

In [40]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, nb_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, nb_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.92      0.92      0.92     23390
           1       0.39      0.40      0.40      2970

    accuracy                           0.86     26360
   macro avg       0.66      0.66      0.66     26360
weighted avg       0.86      0.86      0.86     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      5848
           1       0.39      0.40      0.40       742

    accuracy                           0.86      6590
   macro avg       0.66      0.66      0.66      6590
weighted avg       0.86      0.86      0.86      6590



## Random Forest

In [41]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('rf', RandomForestClassifier(random_state=0, class_weight="balanced"))])

In [None]:
pipeline_modelo

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x7fd581d16b90>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat'...
                 RandomForestClassifier(bootstrap=True, ccp_

In [42]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([4.09590173, 2.42873645, 3.8767612 , 6.14836407, 3.60454082]),
 'score_time': array([0.30023074, 0.30922174, 0.82560349, 0.78356314, 0.30021071]),
 'test_f1': array([0.32018561, 0.32821724, 0.31153389, 0.32328106, 0.35111111]),
 'test_roc_auc': array([0.76488503, 0.76032845, 0.75850802, 0.76109193, 0.75566086])}

## KNN

In [43]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('knn', KNeighborsClassifier())])

In [44]:
pipeline_modelo

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x000000161F66B190>),
                                                                  ('standard_scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('cat',
                                                  Pipeline(steps=[('select_categoric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x000000161F627970>),
         

In [45]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))

{'fit_time': array([0.99973488, 1.16883254, 0.87560606, 0.83959699, 0.84862089]),
 'score_time': array([11.11990619, 14.1170454 , 12.60597467, 11.86042786, 12.34275389]),
 'test_f1': array([0.31336406, 0.33179724, 0.32590856, 0.3087886 , 0.35386338]),
 'test_roc_auc': array([0.70310739, 0.70060157, 0.69511615, 0.70793873, 0.71124455])}

## Modelos Tree Based

### XGBOOST

In [None]:
import xgboost as xgb

In [46]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 #('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('xgb', xgb.XGBClassifier(seed=0))])

In [27]:
pipeline_modelo.fit(X_t, y_train)



Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('select_numeric_columns',
                                                                   <__main__.SelectColumnsTransformer object at 0x00000018268D7B80>)],
                                                           verbose=False),
                                                  ['age', 'campaign',
                                                   'previous', 'cons.conf.idx',
                                                   'euribor3m']),
                                                 ('...
                               learning_rate=0.300000012, ma

In [47]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc'))





















{'fit_time': array([2.40020847, 2.27061367, 2.41471553, 2.26461554, 2.16353726]),
 'score_time': array([0.10107207, 0.07805562, 0.07705498, 0.07705474, 0.07805586]),
 'test_f1': array([0.35813953, 0.35730858, 0.36792453, 0.3685446 , 0.37641723]),
 'test_roc_auc': array([0.77235912, 0.77015776, 0.77837787, 0.78274101, 0.77563957])}

In [29]:
pipeline_modelo[1].feature_importances_    #No se entiende este 

array([0.02291044, 0.02426711, 0.02155347, 0.04411939, 0.09447913,
       0.02118124, 0.02820904, 0.02015119, 0.02030318, 0.02230303,
       0.02138294, 0.0245105 , 0.0220443 , 0.02649062, 0.02912351,
       0.01692515, 0.02376754, 0.02346515, 0.02592699, 0.02173736,
       0.02110064, 0.03140582, 0.02401848, 0.00233567, 0.02450542,
       0.02417863, 0.        , 0.        , 0.01997583, 0.        ,
       0.02702206, 0.        , 0.04010792, 0.        , 0.03302651,
       0.        , 0.19747172], dtype=float32)

In [48]:
#Solo Pre-procesamiento
train = pipeline_completo.fit_transform(X_t)
val = pipeline_completo.transform(X_v)

In [49]:
xgb=xgb.XGBClassifier(seed=0)
xgb.fit(train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [50]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     23390
           1       0.90      0.42      0.58      2970

    accuracy                           0.93     26360
   macro avg       0.91      0.71      0.77     26360
weighted avg       0.93      0.93      0.92     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.91      0.98      0.95      5848
           1       0.62      0.27      0.38       742

    accuracy                           0.90      6590
   macro avg       0.77      0.63      0.66      6590
weighted avg       0.88      0.90      0.88      6590



In [52]:
import xgboost as xgb

In [53]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'objective':["binary:logistic","binary:hinge","binary:logitraw"],
        'learning_rate':[ 0.1,0.2,0.3],
        'max_depth':[2,4, 6, 7, 8, 10],
        'alpha':[2, 3, 5, 7],
        "n_estimators":[5, 7, 10]
       }
xg=xgb.XGBClassifier()

In [54]:
#Búsqueda de parámetros
cv_xgb = GridSearchCV(xg, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
cv_xgb.fit(train, y_train)



GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, va

In [55]:
cv_xgb.best_params_

{'alpha': 7,
 'learning_rate': 0.1,
 'max_depth': 8,
 'n_estimators': 10,
 'objective': 'binary:hinge'}

In [None]:
import xgboost as xgb

In [56]:
#Entrenamiento de la mejor versión encontrada del modelo
xgb_best = xgb.XGBClassifier(seed=0, alpha= 7, learning_rate= 0.1, max_depth= 8, n_estimators=10, objective='binary:hinge')
xgb_best.fit(train, y_train)

XGBClassifier(alpha=7, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10, n_jobs=4, num_parallel_tree=1,
              objective='binary:hinge', random_state=0, reg_alpha=7,
              reg_lambda=1, scale_pos_weight=None, seed=0, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [57]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_best.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_best.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     23390
           1       0.44      0.56      0.49      2970

    accuracy                           0.87     26360
   macro avg       0.69      0.73      0.71     26360
weighted avg       0.89      0.87      0.88     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      5848
           1       0.44      0.57      0.49       742

    accuracy                           0.87      6590
   macro avg       0.69      0.74      0.71      6590
weighted avg       0.89      0.87      0.88      6590



In [63]:
#Optimización de hiperparámetros
#Grilla de parámetros
params={'objective':["binary:logistic","binary:hinge","binary:logitraw"],
        'learning_rate':[0.05,0.1,0.15,0.2,0.25,0.3,0.4,0.5],
        'max_depth':[2,3, 4,5, 6, 7, 8, 9, 10, 12, 15],
        'alpha':[0, 0.5, 1, 2, 3, 5, 6, 7, 8, 9, 10],
        'lambda':[0.5, 1, 2, 3, 5],
        "n_estimators":[3, 5, 6, 7, 8, 9, 10, 15],
        "booster":["gbtree","dart"],
        "gamma":[0.5,1,2,5, 7, 8],
        "tree_method":["auto","exact","approx","hist"]
       }
xg=xgb.XGBClassifier()

In [64]:
#Búsqueda de parámetros
rcv_xgb = RandomizedSearchCV(xg, params, scoring='f1', cv=5,refit=True,n_jobs=-1)     
rcv_xgb.fit(train, y_train)



RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                   param_distributions={'alpha': [0, 0.5, 1, 2, 3, 5, 6, 7, 8,
                                                  9, 10],
                                        'boo

In [65]:
rcv_xgb.best_params_

{'tree_method': 'exact',
 'objective': 'binary:hinge',
 'n_estimators': 6,
 'max_depth': 7,
 'learning_rate': 0.5,
 'lambda': 3,
 'gamma': 5,
 'booster': 'gbtree',
 'alpha': 7}

In [68]:
#Entrenamiento de la mejor versión encontrada del modelo
xgb_best_r = xgb.XGBClassifier(seed=0, tree_method= 'approx', objective="binary:hinge", n_estimators=3, max_depth= 6,
                               learning_rate= 0.25, reg_lambda=3, gamma= 8, booster="dart", 
                               alpha= 0 )   #el lambda por default es 1
xgb_best_r.fit(train, y_train)

XGBClassifier(alpha=0, base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=8, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.25, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=3, n_jobs=4, num_parallel_tree=1,
              objective='binary:hinge', random_state=0, reg_alpha=0,
              reg_lambda=3, scale_pos_weight=None, seed=0, subsample=1,
              tree_method='approx', validate_parameters=1, verbosity=None)

In [69]:
print("MÉTRICAS CONJUNTO DE TRAIN")
print(classification_report(y_train, xgb_best_r.predict(train)))
print("MÉTRICAS CONJUNTO DE VALIDACIÓN")
print(classification_report(y_val, xgb_best_r.predict(val)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     23390
           1       0.44      0.54      0.49      2970

    accuracy                           0.87     26360
   macro avg       0.69      0.73      0.71     26360
weighted avg       0.88      0.87      0.88     26360

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      5848
           1       0.46      0.57      0.51       742

    accuracy                           0.88      6590
   macro avg       0.70      0.74      0.72      6590
weighted avg       0.89      0.88      0.88      6590



### LigthGBM

In [None]:
variables_categoricas = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
variables_numericas = ['age', 'campaign', 'previous', 'cons.conf.idx', 'euribor3m']

# Filtramos las variables que seleccionamos
X_t = X_train[variables_categoricas + variables_numericas]

pipeline_numerico = Pipeline([('select_numeric_columns', SelectColumnsTransformer(variables_numericas)),
                             # ('standard_scaler', StandardScaler()),
                             # ("kbins_discretizer", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"))      #strategy="uniform"
                            ])

pipeline_categorico = Pipeline ([('select_categoric_columns', SelectColumnsTransformer(variables_categoricas)),
                                 # ('imputer', SimpleImputer(strategy='most_frequent', missing_values="unknown")),      #podríamos no ponerlo, y que deje "desconocido" como una categoría más
                                 ('cat', OneHotEncoder())])

pipeline_completo = ColumnTransformer([('num', pipeline_numerico, variables_numericas),
                                   ('cat', pipeline_categorico, variables_categoricas),
                                  ])

pipeline_modelo = Pipeline([('preprocess', pipeline_completo),
                            ('lgbm', lgbm(seed=0))])

In [None]:
pipeline_modelo

In [None]:
cross_validate(pipeline_modelo, X_t, y_train, cv=5, scoring=('f1', 'roc_auc', "precision", "recall"))

## Prueba con las variables del PCA

In [72]:
# Leemos el dataset con la función de pandas "read_csv"
key = "data/final_df.csv"
df_pca = pd.read_csv(key, sep=",")

In [73]:
df_pca.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,age,job,marital,education,default,housing,loan,contact,...,job_num,marital_num,education_num,loan_num,housing_num,default_num,pca1,pca2,pca3,pca4
0,0,0,0.481481,housemaid,married,basic.4y,no,no,no,telephone,...,1,1,1,1,1,1,-0.303521,-0.209731,-0.060447,0.074546
1,1,1,0.493827,services,married,high.school,no,no,no,telephone,...,2,1,2,1,1,1,-0.306797,-0.177784,-0.159761,0.033756


In [78]:
df_pca.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39156 entries, 0 to 39155
Data columns (total 32 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      39156 non-null  int64  
 1   Unnamed: 0.1    39156 non-null  int64  
 2   age             39156 non-null  float64
 3   job             39156 non-null  object 
 4   marital         39156 non-null  object 
 5   education       39156 non-null  object 
 6   default         39156 non-null  object 
 7   housing         39156 non-null  object 
 8   loan            39156 non-null  object 
 9   contact         39156 non-null  object 
 10  month           39156 non-null  object 
 11  day_of_week     39156 non-null  object 
 12  campaign        39156 non-null  float64
 13  pdays           35130 non-null  float64
 14  previous        39156 non-null  float64
 15  poutcome        39156 non-null  object 
 16  emp.var.rate    39156 non-null  float64
 17  cons.price.idx  39156 non-null 

In [75]:
df_pca.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'age', 'job', 'marital', 'education',
       'default', 'housing', 'loan', 'contact', 'month', 'day_of_week',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'job_num', 'marital_num', 'education_num', 'loan_num', 'housing_num',
       'default_num', 'pca1', 'pca2', 'pca3', 'pca4'],
      dtype='object')

In [76]:
X_pca = df_pca.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'age', 'job', 'marital', 'education',
       'default', 'housing', 'loan', 'contact', 'month', 'day_of_week',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'job_num', 'marital_num', 'education_num', 'loan_num', 'housing_num',
       'default_num'])
y_pca = df_pca.y

In [77]:
X_pca_temp, X_pca_test, y_pca_temp, y_pca_test = train_test_split(X_pca, y_pca, test_size=0.2, stratify=y, random_state=42) 

ValueError: Found input variables with inconsistent numbers of samples: [39156, 41188]