# Modelización con aplicación de SMOTE

## Importación de librerías

In [1]:
# Librerías para visualización de datos
import matplotlib.pyplot as plt
import seaborn as sns

# Librerías para manipulación y análisis de datos
import numpy as np
import pandas as pd


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline


from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import make_scorer, recall_score, auc

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


from toolbox_DS import *
from toolbox_ML import *


import warnings
warnings.filterwarnings(action="ignore", message=r'.*Use subset.*of np.ndarray is not recommended')


## Carga de datos

Train

In [2]:
train_set = pd.read_csv('./data/train_set.csv')
train_set

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,Response,income_missing,age,customes_seniority,Household_members,Total_amount,Total_purchase,Median_amount_purchase,Total_cmp,Total_%_cmp
0,9400,1958,2n Cycle,Single,85485.0,0,0,2014-06-21,73,630,...,0,0,57,1,1.0,1383,19,72.789474,0,0.0
1,2804,1975,Master,Single,46098.0,1,1,2012-08-18,86,57,...,1,0,40,3,3.0,120,11,10.909091,0,0.0
2,1503,1976,PhD,Together,162397.0,1,1,2013-06-03,31,85,...,0,0,39,2,4.0,107,1,107.000000,0,0.0
3,5491,1951,Master,Together,47352.0,0,1,2013-04-11,70,172,...,0,0,64,2,3.0,319,16,19.937500,0,0.0
4,2894,1985,Graduation,Single,72903.0,0,0,2013-10-29,74,1067,...,1,0,30,2,1.0,2013,21,95.857143,3,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1785,10785,1969,Graduation,Married,44078.0,1,1,2014-06-19,17,24,...,0,0,46,1,4.0,41,7,5.857143,0,0.0
1786,9964,1979,Graduation,Single,61825.0,0,1,2013-08-07,56,162,...,0,0,36,2,2.0,424,15,28.266667,0,0.0
1787,3412,1951,Master,Married,67381.0,0,1,2013-01-15,67,815,...,0,0,64,2,3.0,957,17,56.294118,1,0.2
1788,2811,1963,PhD,Single,48918.0,1,1,2014-04-12,21,52,...,0,0,52,1,3.0,62,7,8.857143,0,0.0


Test

In [3]:
test_set = pd.read_csv('./data/test_set.csv')
test_set.head(5)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,Complain,Response,age,customes_seniority,Household_members,Total_amount,Total_purchase,Median_amount_purchase,Total_cmp,Total_%_cmp
0,5092,1949,PhD,Widow,51569.0,0,1,2013-02-12,39,380,...,0,1,66,2,2.0,467,19,24.578947,1,0.2
1,4432,1976,Graduation,Divorced,31615.0,1,0,2013-03-16,82,2,...,0,0,39,2,2.0,51,7,7.285714,0,0.0
2,803,1968,Graduation,Together,40521.0,1,1,2013-04-05,82,12,...,0,0,47,2,4.0,21,4,5.25,0,0.0
3,10262,1980,Graduation,Married,15072.0,2,0,2013-05-10,96,8,...,0,0,35,2,4.0,53,11,4.818182,0,0.0
4,5847,1969,Graduation,Divorced,69901.0,0,1,2013-05-29,95,312,...,0,0,46,2,2.0,882,23,38.347826,0,0.0


Imputo los cambios que vengo aplicando al dataset.    
Train

In [4]:
# Cambio el índice
train_set.set_index('ID', inplace=True)

# Cambio tipo a datetime
train_set['Dt_Customer'] = pd.to_datetime(train_set['Dt_Customer'])

# Cambiar tipo a categóricas
cols_to_category = ['Education', 'Marital_Status']
train_set[cols_to_category] = train_set[cols_to_category].astype('category')

# Elimino columna 'income_missing'
train_set = train_set.drop(columns=['income_missing', 'Year_Birth','Total_%_cmp','Dt_Customer','Median_amount_purchase'])

# Elimino el outlier de Income
train_set = train_set.loc[train_set['Income'] !=666666]

Y también al test

In [5]:
# Cambio el índice
test_set.set_index('ID', inplace=True)

# Cambio tipo a datetime
test_set['Dt_Customer'] = pd.to_datetime(test_set['Dt_Customer'])

# Cambiar tipo a categóricas
cols_to_category = ['Education', 'Marital_Status']
test_set[cols_to_category] = test_set[cols_to_category].astype('category')

# Elimino columna 'income_missing'
test_set = test_set.drop(columns=['Year_Birth','Total_%_cmp','Dt_Customer','Median_amount_purchase'])

# Elimino el outlier de Income
test_set = test_set.loc[test_set['Income'] !=666666]

## Transformación de variables

In [6]:
X_train = train_set.drop(columns='Response')
y_train = train_set['Response']

X_test = test_set.drop(columns='Response')
y_test = test_set['Response']

In [7]:
features_num = X_train.select_dtypes(['int','float']).columns
features_cat = X_train.select_dtypes(['object', 'category']).columns
print('features_num', features_num)
print('features_cat', features_cat)

features_num Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'age', 'customes_seniority',
       'Household_members', 'Total_amount', 'Total_purchase', 'Total_cmp'],
      dtype='object')
features_cat Index(['Education', 'Marital_Status'], dtype='object')


In [8]:
# Definimos el OrdinalEncoder con el mapeo de 'education_ode'
ordinal_encoder = OrdinalEncoder(categories=[['Basic', '2n Cycle', 'Graduation', 'Master', 'PhD']])

categorical_features_onehot = ['Marital_Status']
categorical_features_ordinal = ['Education']
numerical_features = features_num

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat_onehot', OneHotEncoder(), categorical_features_onehot),
        ('cat_ordinal', Pipeline([
            ('ordinal', ordinal_encoder),
            ('scaler', MinMaxScaler())
        ]), categorical_features_ordinal)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocesor',preprocessor),
    ('algoritmo', RandomForestClassifier())
])

pipeline.fit(X_train,y_train)

X_train_transform = pipeline.named_steps['preprocesor'].transform(X_train)


In [9]:
features_transformed = pipeline.named_steps['preprocesor'].get_feature_names_out()
features_transformed

array(['num__Income', 'num__Kidhome', 'num__Teenhome', 'num__Recency',
       'num__MntWines', 'num__MntFruits', 'num__MntMeatProducts',
       'num__MntFishProducts', 'num__MntSweetProducts',
       'num__MntGoldProds', 'num__NumDealsPurchases',
       'num__NumWebPurchases', 'num__NumCatalogPurchases',
       'num__NumStorePurchases', 'num__NumWebVisitsMonth',
       'num__AcceptedCmp3', 'num__AcceptedCmp4', 'num__AcceptedCmp5',
       'num__AcceptedCmp1', 'num__AcceptedCmp2', 'num__Complain',
       'num__age', 'num__customes_seniority', 'num__Household_members',
       'num__Total_amount', 'num__Total_purchase', 'num__Total_cmp',
       'cat_onehot__Marital_Status_Alone',
       'cat_onehot__Marital_Status_Divorced',
       'cat_onehot__Marital_Status_Married',
       'cat_onehot__Marital_Status_Others',
       'cat_onehot__Marital_Status_Single',
       'cat_onehot__Marital_Status_Together',
       'cat_onehot__Marital_Status_Widow', 'cat_ordinal__Education'],
      dtype=object)

In [10]:
X_train_transform_df = pd.DataFrame(X_train_transform, columns=(features_transformed))
X_train_transform_df

Unnamed: 0,num__Income,num__Kidhome,num__Teenhome,num__Recency,num__MntWines,num__MntFruits,num__MntMeatProducts,num__MntFishProducts,num__MntSweetProducts,num__MntGoldProds,...,num__Total_purchase,num__Total_cmp,cat_onehot__Marital_Status_Alone,cat_onehot__Marital_Status_Divorced,cat_onehot__Marital_Status_Married,cat_onehot__Marital_Status_Others,cat_onehot__Marital_Status_Single,cat_onehot__Marital_Status_Together,cat_onehot__Marital_Status_Widow,cat_ordinal__Education
0,85485.0,0.0,0.0,73.0,630.0,26.0,611.0,44.0,18.0,54.0,...,19.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.25
1,46098.0,1.0,1.0,86.0,57.0,0.0,27.0,0.0,0.0,36.0,...,11.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75
2,162397.0,1.0,1.0,31.0,85.0,1.0,16.0,2.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.00
3,47352.0,0.0,1.0,70.0,172.0,12.0,112.0,8.0,0.0,15.0,...,16.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.75
4,72903.0,0.0,0.0,74.0,1067.0,138.0,750.0,0.0,19.0,39.0,...,21.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1784,44078.0,1.0,1.0,17.0,24.0,1.0,10.0,2.0,0.0,4.0,...,7.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.50
1785,61825.0,0.0,1.0,56.0,162.0,50.0,100.0,55.0,30.0,27.0,...,15.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.50
1786,67381.0,0.0,1.0,67.0,815.0,8.0,53.0,11.0,0.0,70.0,...,17.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.75
1787,48918.0,1.0,1.0,21.0,52.0,0.0,9.0,0.0,0.0,1.0,...,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.00


## SMOTE

In [11]:
# Crear un objeto SMOTE
smote = SMOTE()

## Modelización

### Prueba de modelos con todas las variables

In [12]:
# ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
pipe = ImbPipeline(steps=[
    ('preprocesor', preprocessor),
    ('smote', smote),
    ('algoritmo', RandomForestClassifier())
])


# Definir el grid de hiperparámetros
grid = [
    {'algoritmo': [RandomForestClassifier()],
     'algoritmo__max_depth': [5, 10, 15],
     'algoritmo__n_estimators': [50, 100, 200],
     },
   
    {'algoritmo': [XGBClassifier()],
     'algoritmo__learning_rate': [0.1, 0.3, 0.5],
     'algoritmo__n_estimators': [100, 500, 1000],
     },
    
    {'algoritmo': [LGBMClassifier()],
     'algoritmo__learnin_rate': [0.1, 0.3, 0.5],
     'algoritmo__n_estimators': [100, 500, 1000],
     'algoritmo__objective': ['binary']}
]


# Realizar GridSearchCV con el scorer personalizado
grid_search = GridSearchCV(pipe, 
                           grid, 
                           cv=5, 
                           scoring=make_scorer(recall_score, pos_label=True), 
                           n_jobs=-1)


grid_search.fit(X_train, y_train)

# Obtener el mejor modelo y parámetros
best_model = grid_search.best_estimator_
print("Mejor modelo y parámetros:", grid_search.best_params_)

Mejor modelo y parámetros: {'algoritmo': RandomForestClassifier(), 'algoritmo__max_depth': 5, 'algoritmo__n_estimators': 50}


In [13]:
grid_search.best_score_

0.6057302585604473

In [14]:
df_cv_results = pd.DataFrame(grid_search.cv_results_)
df_cv_results[['param_algoritmo','mean_test_score','std_test_score','rank_test_score']].sort_values('rank_test_score')

Unnamed: 0,param_algoritmo,mean_test_score,std_test_score,rank_test_score
0,RandomForestClassifier(),0.60573,0.080932,1
2,RandomForestClassifier(),0.598253,0.078823,2
1,RandomForestClassifier(),0.587142,0.055937,3
15,"XGBClassifier(base_score=None, booster=None, c...",0.561495,0.041628,4
14,"XGBClassifier(base_score=None, booster=None, c...",0.557582,0.04471,5
11,"XGBClassifier(base_score=None, booster=None, c...",0.557512,0.049766,6
10,"XGBClassifier(base_score=None, booster=None, c...",0.553878,0.04071,7
21,LGBMClassifier(),0.553739,0.040703,8
9,"XGBClassifier(base_score=None, booster=None, c...",0.549965,0.053583,9
26,LGBMClassifier(),0.546331,0.069765,10


Haciendo un pipeline con todas las variables, el mejor sería un RandomForest con una recall de la clase positiva de 0.66

## Ajuste hiperparámetros

### RandomForest

In [15]:
pipe = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('algoritmo', RandomForestClassifier())
])

param = {
    'algoritmo__n_estimators': [400,450,500],
    'algoritmo__max_depth': [5, 6,7],
    'algoritmo__min_samples_split': [3,4,5],
    'algoritmo__min_samples_leaf': [8,10,12],
    'algoritmo__class_weight': ['balanced']
}

grid_rf = GridSearchCV(pipe, 
                       param_grid=param, 
                       cv=5, 
                       scoring=make_scorer(recall_score, pos_label=True), 
                       n_jobs=-1)

grid_rf.fit(X_train, y_train)

# Obtener el mejor modelo y parámetros
best_model_rf = grid_rf.best_estimator_
best_params_rf = grid_rf.best_params_
print("Mejor modelo y parámetros:", best_params_rf)

# Imprimir el mejor resultado del recall para la clase positiva
best_recall_rf = grid_rf.best_score_
print("Mejor resultado del recall para la clase positiva:", best_recall_rf)


Mejor modelo y parámetros: {'algoritmo__class_weight': 'balanced', 'algoritmo__max_depth': 7, 'algoritmo__min_samples_leaf': 10, 'algoritmo__min_samples_split': 5, 'algoritmo__n_estimators': 450}
Mejor resultado del recall para la clase positiva: 0.7102026554856744


In [16]:
# Acceder al modelo dentro del Pipeline
random_forest_model = best_model_rf.named_steps['algoritmo']

# Obtener las importancias de las características
feature_importances = random_forest_model.feature_importances_

if 'preprocesador' in best_model_rf.named_steps:
    preprocessor = best_model_rf.named_steps['preprocesador']
    feature_names = preprocessor.get_feature_names_out()

    importances_series = pd.Series(feature_importances, index=feature_names)
    print(importances_series.sort_values(ascending=False))

num__Total_cmp                         0.139146
num__Recency                           0.089476
num__Total_amount                      0.085251
num__MntGoldProds                      0.068256
num__MntMeatProducts                   0.054168
num__MntWines                          0.048272
num__Income                            0.046356
num__AcceptedCmp3                      0.043796
num__Household_members                 0.039873
num__NumCatalogPurchases               0.038946
num__AcceptedCmp5                      0.038325
num__NumWebVisitsMonth                 0.037541
num__customes_seniority                0.033241
num__NumStorePurchases                 0.028496
num__MntSweetProducts                  0.025296
num__MntFruits                         0.020871
num__Total_purchase                    0.020419
num__AcceptedCmp1                      0.019181
num__NumWebPurchases                   0.016605
num__age                               0.016490
num__MntFishProducts                   0

### LigthGBM

In [17]:
pipe = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('algoritmo', LGBMClassifier())
])

param = {
    'algoritmo__max_depth': [4,5, 6],
    'algoritmo__min_child_samples': [2,4,8],
    'algoritmo__min_child_weight': [4, 8,10],
    'algoritmo__learning_rate': [0.09, 0.1, 0.2],
    'algoritmo__n_estimators': [100, 200, 300],
    'algoritmo__is_unbalance' : [True],
    'algoritmo__objective': ['binary'],
}

grid_lgb = GridSearchCV(pipe, 
                       param_grid=param, 
                       cv=5, 
                       scoring=make_scorer(recall_score, pos_label=True), 
                       n_jobs=-1,
                       verbose= 1)

grid_lgb.fit(X_train, y_train)

# Obtener el mejor modelo y parámetros
best_model_lgb = grid_lgb.best_estimator_
best_params_lgb = grid_lgb.best_params_
print("Mejor modelo y parámetros:", best_params_lgb)

# Imprimir el mejor resultado del recall para la clase positiva
best_recall_lgb = grid_lgb.best_score_
print("Mejor resultado del recall para la clase positiva:", best_recall_lgb)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[LightGBM] [Info] Number of positive: 269, number of negative: 1520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1853
[LightGBM] [Info] Number of data points in the train set: 1789, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.150363 -> initscore=-1.731754
[LightGBM] [Info] Start training from score -1.731754
Mejor modelo y parámetros: {'algoritmo__is_unbalance': True, 'algoritmo__learning_rate': 0.09, 'algoritmo__max_depth': 4, 'algoritmo__min_child_samples': 2, 'algoritmo__min_child_weight': 10, 'algoritmo__n_estimators': 100, 'algoritmo__objective': 'binary'}
Mejor resultado del recall para la clase positiva: 0.7434661076170511


In [18]:
# Acceder al modelo dentro del Pipeline
lgb_model = best_model_lgb.named_steps['algoritmo']

# Obtener las importancias de las características
feature_importances = lgb_model.feature_importances_

if 'preprocesador' in best_model_lgb.named_steps:
    preprocessor = best_model_lgb.named_steps['preprocesador']
    feature_names = preprocessor.get_feature_names_out()

    importances_series = pd.Series(feature_importances, index=feature_names)
    print(importances_series.sort_values(ascending=False))

num__Recency                           145
num__MntMeatProducts                    67
num__MntGoldProds                       62
num__Total_cmp                          58
num__age                                55
num__NumStorePurchases                  54
num__NumWebVisitsMonth                  50
num__MntSweetProducts                   48
num__MntWines                           46
num__Household_members                  44
num__customes_seniority                 44
num__Income                             43
cat_ordinal__Education                  41
num__MntFruits                          31
num__Total_amount                       28
num__MntFishProducts                    24
num__NumCatalogPurchases                24
num__NumWebPurchases                    18
num__Teenhome                           16
num__NumDealsPurchases                  16
num__Total_purchase                     15
cat_onehot__Marital_Status_Married      14
num__AcceptedCmp5                       14
cat_onehot_

### XGBoost

In [20]:
pipe = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('algoritmo', XGBClassifier())
])

param = {
    'algoritmo__n_estimators': [300, 400, 450],
    'algoritmo__max_depth': [4, 5, 6],
    'algoritmo__min_child_weight': [6, 8, 10],  # Equivalente a min_samples_leaf en RandomForest
    'algoritmo__gamma': [0, 0.1, 0.2],  # Parámetro para controlar la regularización
    'algoritmo__subsample': [0.7, 0.8, 0.9],  # Porcentaje de muestras usadas para cada árbol
    'algoritmo__colsample_bytree': [0.7, 0.8, 0.9],  # Porcentaje de características usadas para cada árbol
    
}


grid_xgb = GridSearchCV(pipe, 
                       param_grid=param, 
                       cv=5, 
                       scoring=make_scorer(recall_score, pos_label=True), 
                       n_jobs=-1,
                       verbose= 3)

grid_xgb.fit(X_train, y_train)

# Obtener el mejor modelo y parámetros
best_model_xgb = grid_xgb.best_estimator_
best_params_xgb = grid_xgb.best_params_
print("Mejor modelo y parámetros:", best_params_xgb)

# Imprimir el mejor resultado del recall para la clase positiva
best_recall_xgb = grid_xgb.best_score_
print("Mejor resultado del recall para la clase positiva:", best_recall_xgb)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Mejor modelo y parámetros: {'algoritmo__colsample_bytree': 0.9, 'algoritmo__gamma': 0, 'algoritmo__max_depth': 5, 'algoritmo__min_child_weight': 6, 'algoritmo__n_estimators': 450, 'algoritmo__subsample': 0.9}
Mejor resultado del recall para la clase positiva: 0.5575122292103424


In [None]:
# Acceder al modelo dentro del Pipeline
xgb_model = best_model_xgb.named_steps['algoritmo']

# Obtener las importancias de las características
feature_importances = xgb_model.feature_importances_

if 'preprocesador' in best_model_xgb.named_steps:
    preprocessor = best_model_xgb.named_steps['preprocesador']
    feature_names = preprocessor.get_feature_names_out()

    importances_series = pd.Series(feature_importances, index=feature_names)
    print(importances_series.sort_values(ascending=False))

num__Total_cmp                         0.220172
num__customes_seniority                0.068627
num__Household_members                 0.066530
cat_onehot__Marital_Status_Single      0.038947
num__NumWebVisitsMonth                 0.036663
num__Recency                           0.035630
num__Teenhome                          0.034904
num__AcceptedCmp5                      0.034557
cat_ordinal__Education                 0.032742
num__NumStorePurchases                 0.029930
num__AcceptedCmp3                      0.029727
num__Kidhome                           0.028177
num__AcceptedCmp1                      0.027486
cat_onehot__Marital_Status_Together    0.027044
num__MntMeatProducts                   0.025993
num__NumDealsPurchases                 0.023398
num__MntGoldProds                      0.023347
cat_onehot__Marital_Status_Married     0.022462
num__NumCatalogPurchases               0.021637
num__Income                            0.019905
num__NumWebPurchases                   0

## Evaluación Modelos

#### Classification Report

In [None]:
# RandomForest
print('RandomForest')
y_pred_rf = grid_rf.predict(X_test)

print(classification_report(y_test,y_pred_rf))
print()

# LigthGBM
print('LightGBM')
y_pred_lgb = grid_lgb.predict(X_test)
print(classification_report(y_test,y_pred_lgb))
print()

# XGBoost
print('LightGBM')
y_pred_xgb = grid_xgb.predict(X_test)
print(classification_report(y_test,y_pred_xgb))
print()



RandomForest


NameError: name 'X_test' is not defined

#### Matrix de confusión

In [None]:
# RandomForest
print('RandomForest')
cm_rf = confusion_matrix(y_test, y_pred_rf)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf)
disp_rf.plot()
print()

# LightGBM
print('LightGBM')
cm_lgb = confusion_matrix(y_test, y_pred_lgb)
disp_lgb = ConfusionMatrixDisplay(confusion_matrix=cm_lgb)
disp_lgb.plot()
print()

# XGBoost
print('XGBoost')
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
disp_xgb = ConfusionMatrixDisplay(confusion_matrix=cm_xgb)
disp_xgb.plot()
print()