In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgbm

In [2]:
# Load the dataset with pandas' `read_csv`.
key = "data/final_df.csv"
df_pca = pd.read_csv(key)  # "," is already read_csv's default separator

In [3]:
# Preview the first two rows of the loaded frame.
df_pca.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,age,job,marital,education,default,housing,loan,contact,...,job_num,marital_num,education_num,loan_num,housing_num,default_num,pca1,pca2,pca3,pca4
0,0,0,0.481481,housemaid,married,basic.4y,no,no,no,telephone,...,1,1,1,1,1,1,-0.303521,-0.209731,-0.060447,0.074546
1,1,1,0.493827,services,married,high.school,no,no,no,telephone,...,2,1,2,1,1,1,-0.306797,-0.177784,-0.159761,0.033756


In [4]:
# List the column labels available in df_pca.
df_pca.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'age', 'job', 'marital', 'education',
       'default', 'housing', 'loan', 'contact', 'month', 'day_of_week',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'job_num', 'marital_num', 'education_num', 'loan_num', 'housing_num',
       'default_num', 'pca1', 'pca2', 'pca3', 'pca4'],
      dtype='object')

In [5]:
# Feature matrix: only the four PCA components.
# They are selected by name instead of dropping every other column, so an
# unexpected extra column in the CSV (for example one more "Unnamed: ..."
# index column) cannot silently end up as a model feature.
pca_columns = ["pca1", "pca2", "pca3", "pca4"]
X_pca = df_pca[pca_columns].copy()

# Target vector.
y_pca = df_pca["y"]

In [7]:
# Hold out 20% of the rows as the test partition, stratified on the target.
# y_pca is the same Series as df_pca.y, so the stratification is unchanged.
X_pca_temp, X_pca_test, y_pca_temp, y_pca_test = train_test_split(
    X_pca,
    y_pca,
    test_size=0.2,
    stratify=y_pca,
    random_state=42,
)

In [8]:
# Shapes of the temp and test partitions (features and target of each).
X_pca_temp.shape, y_pca_temp.shape, X_pca_test.shape, y_pca_test.shape

((31324, 4), (31324,), (7832, 4), (7832,))

In [9]:
# Split the temp partition again: 20% of it becomes the validation set,
# stratified on the target; the remainder is the training set.
X_pca_train, X_pca_val, y_pca_train, y_pca_val = train_test_split(
    X_pca_temp,
    y_pca_temp,
    test_size=0.2,
    stratify=y_pca_temp,
    random_state=42,
)

In [10]:
# Shapes of the train and validation partitions (features and target of each).
X_pca_train.shape, y_pca_train.shape, X_pca_val.shape, y_pca_val.shape

((25059, 4), (25059,), (6265, 4), (6265,))

**MODELOS**

In [None]:
""""""
class SelectColumnsTransformer():
    """Minimal fit/transform object that keeps a subset of columns.

    NOTE(review): the class does not derive from scikit-learn's
    ``BaseEstimator``/``TransformerMixin``, so it offers no ``get_params`` or
    ``fit_transform`` of its own -- confirm that is acceptable wherever it
    is going to be used.

    Parameters
    ----------
    columns : label or list of labels, optional
        Passed to ``X[...]`` in ``transform``. ``None`` (the default) keeps
        every column.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` restricted to ``self.columns``.

        With ``columns=None`` a copy of the whole input is returned; indexing
        with ``None`` would otherwise raise, which made the default
        constructor argument unusable.
        """
        if self.columns is None:
            return X.copy()
        # .copy() so callers can modify the result without touching X.
        return X[self.columns].copy()

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

In [16]:
import xgboost as xgb

In [12]:
# Baseline XGBoost classifier with default hyperparameters.
# The class is referenced directly rather than as `xgb.XGBClassifier`: the
# assignment below rebinds the name `xgb` from the xgboost module to the
# model, so the module-qualified form fails with AttributeError as soon as
# this cell is run a second time.
# NOTE(review): the name `xgb` is kept because the metrics cell calls
# `xgb.predict`; consider renaming the model and that call together.
xgb = XGBClassifier(seed=0)
xgb.fit(X_pca_train, y_pca_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [14]:
# Classification report of the baseline model on the train and validation splits.
for split_title, split_X, split_y in (
    ("MÉTRICAS CONJUNTO DE TRAIN", X_pca_train, y_pca_train),
    ("MÉTRICAS CONJUNTO DE VALIDACIÓN", X_pca_val, y_pca_val),
):
    print(split_title)
    print(classification_report(split_y, xgb.predict(split_X)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     22472
           1       0.86      0.68      0.76      2587

    accuracy                           0.96     25059
   macro avg       0.91      0.84      0.87     25059
weighted avg       0.95      0.96      0.95     25059

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      5618
           1       0.59      0.46      0.52       647

    accuracy                           0.91      6265
   macro avg       0.76      0.71      0.73      6265
weighted avg       0.90      0.91      0.91      6265



Optimización de hiperparámetros

In [17]:
# Hyperparameter optimisation
# Grid of candidate values explored by the search.
params = {
    "objective": ["binary:logistic", "binary:hinge", "binary:logitraw"],
    "learning_rate": [0.1, 0.2, 0.3],
    "max_depth": [2, 4, 6, 7, 8, 10],
    "alpha": [2, 3, 5, 7],
    "n_estimators": [5, 7, 10],
}

# Unfitted estimator handed to the search.
# `XGBClassifier` is used directly because, by this point in a top-to-bottom
# run, the name `xgb` has been rebound to the baseline model and
# `xgb.XGBClassifier` would raise AttributeError.
xg = XGBClassifier()

In [18]:
# Parameter search: exhaustive grid, 5-fold cross-validation scored on F1,
# running on all available cores; refit=True retrains the best configuration
# on the data passed to fit.
cv_xgb = GridSearchCV(
    estimator=xg,
    param_grid=params,
    scoring="f1",
    cv=5,
    refit=True,
    n_jobs=-1,
)
cv_xgb.fit(X_pca_train, y_pca_train)



GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, va

In [19]:
# Hyperparameter combination that obtained the best score in the search.
cv_xgb.best_params_

{'alpha': 7,
 'learning_rate': 0.2,
 'max_depth': 10,
 'n_estimators': 5,
 'objective': 'binary:hinge'}

In [20]:
# Train the best version of the model found by the search.
# The hyperparameters are unpacked from `cv_xgb.best_params_` instead of being
# retyped by hand, so this cell cannot drift out of sync if the grid or the
# data changes and the search selects a different combination.
# `XGBClassifier` is referenced directly because the name `xgb` has been
# rebound to the baseline model earlier in the notebook.
xgb_best = XGBClassifier(seed=0, **cv_xgb.best_params_)
xgb_best.fit(X_pca_train, y_pca_train)

XGBClassifier(alpha=7, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=5, n_jobs=4, num_parallel_tree=1,
              objective='binary:hinge', random_state=0, reg_alpha=7,
              reg_lambda=1, scale_pos_weight=None, seed=0, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
# Classification report of the tuned model on the train and validation splits.
for split_title, split_X, split_y in (
    ("MÉTRICAS CONJUNTO DE TRAIN", X_pca_train, y_pca_train),
    ("MÉTRICAS CONJUNTO DE VALIDACIÓN", X_pca_val, y_pca_val),
):
    print(split_title)
    print(classification_report(split_y, xgb_best.predict(split_X)))

MÉTRICAS CONJUNTO DE TRAIN
              precision    recall  f1-score   support

           0       0.97      0.94      0.95     22472
           1       0.57      0.72      0.63      2587

    accuracy                           0.91     25059
   macro avg       0.77      0.83      0.79     25059
weighted avg       0.93      0.91      0.92     25059

MÉTRICAS CONJUNTO DE VALIDACIÓN
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      5618
           1       0.51      0.65      0.58       647

    accuracy                           0.90      6265
   macro avg       0.74      0.79      0.76      6265
weighted avg       0.91      0.90      0.91      6265

