#### Preparativos

In [1]:
# Instalaciones
!pip install lightgbm



In [3]:
# Importaciones

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

import lightgbm as lgb
from lightgbm import LGBMClassifier

import pickle

In [4]:
# Pandas display limits: no cap on the number of columns or on column width,
# and at most 300 rows per rendered frame.
for _option, _value in {
    'display.max_columns': None,
    'display.max_rows': 300,
    'display.max_colwidth': None,
}.items():
    pd.set_option(_option, _value)

#### Preparación Dataset

In [5]:
# Load the feature-engineered reservations dataset
# (semicolon-separated CSV that uses a comma as the decimal mark).
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
_RESERVAS_CSV = r'D:\DEV\Python\00_TFM_PALLADIUM\02_DATASETS_GENERADOS\Reservas_Feature_Engineered_v1.csv'
df0 = pd.read_csv(_RESERVAS_CSV, sep=';', decimal=',')

  df0 = pd.read_csv(r'D:\DEV\Python\00_TFM_PALLADIUM\02_DATASETS_GENERADOS\Reservas_Feature_Engineered_v1.csv', sep = ';', decimal=',')


#### Preparación de un DF Mixto para LightGBM

In [6]:
# Work on an independent (deep) copy so the raw frame df0 is left untouched.
df_mix = df0.copy(deep=True)

In [7]:
# Configure the modelling frame: drop the columns that must not be used as
# features and cast every remaining column to its modelling dtype.
#
# The frame is rebuilt from df0 with a non-mutating drop (instead of dozens of
# in-place drops on df_mix), so this cell is idempotent: re-running it no longer
# raises KeyError because the columns were already removed by a previous run.

# Columns excluded from the model, grouped by the reason for excluding them.
DROPPED_COLUMNS = [
    # Identifiers.
    'ID_RESERVA', 'ID_HOTEL', 'ID_TIPO', 'ID_CLIENTE', 'ID_MONEDA', 'ID_FIDELIDAD',
    # Raw dates and years: they contaminate extrapolation.
    'LLEGADA', 'LLEGADA_ANO',
    'SALIDA', 'SALIDA_ANO',
    'FECHA_TOMA', 'FECHA_TOMA_ANO',
    # Redundant with the target variable.
    'STATUS',
    # Not available at prediction time (it is impossible to see the future).
    'MOTIVO', 'CHECKIN',
    'FECHA_MOD', 'FECHA_MOD_ANO', 'FECHA_MOD_MES',
    'FECHA_MOD_DIAm', 'FECHA_MOD_DIAs', 'FECHA_MOD_AVANCE',
    'FECHA_CANCELACION', 'FECHA_CANCELACION_ANO', 'FECHA_CANCELACION_MES',
    'FECHA_CANCELACION_DIAm', 'FECHA_CANCELACION_DIAs', 'FECHA_CANCELACION_AVANCE',
    'LT_TOMA_CANCELACION',
    # Not converted; redundant given the total value and/or dependent on COMERCIALIZADORA.
    'VALHAB', 'VALPEN', 'VALSERV', 'VALFIJOS',
    'CMVALHAB', 'CMVALPEN', 'CMCVALSERV',
    # Meaning not clear for now.
    'AUTORIZO',
]

# Columns cast straight to 'category'.
CATEGORY_COLUMNS = [
    'HOTEL', 'DURACION_ESTANCIA', 'REGIMEN', 'TIPO', 'PAX_CAT', 'CLIENTE',
    'MONEDA', 'FIDELIDAD', 'PAIS', 'CONTINENTE', 'SEGMENTO', 'FUENTE_NEGOCIO',
]

# Columns validated as numeric first and then cast to 'category'.
NUMERIC_CATEGORY_COLUMNS = [
    'LLEGADA_MES', 'LLEGADA_DIAm', 'LLEGADA_DIAs',
    'SALIDA_MES', 'SALIDA_DIAm', 'SALIDA_DIAs',
    'TIPO_CLIENTE', 'GRUPO',
    'FECHA_TOMA_MES', 'FECHA_TOMA_DIAm', 'FECHA_TOMA_DIAs',
    'COMERCIALIZADORA', 'GRATIS',
    'CANCELADA',  # target variable
]

# Columns cast to float.
FLOAT_COLUMNS = [
    'LLEGADA_AVANCE', 'SALIDA_AVANCE', 'FECHA_TOMA_AVANCE',
    'VALOR_USD', 'VALOR_USD_PAX', 'VALOR_USD_NOCHE', 'VALOR_USD_PAX_NOCHE',
]

# Columns cast to pandas' nullable integer dtype 'Int64'.
NULLABLE_INT_COLUMNS = [
    'NOCHES', 'USO', 'PAX_NUM', 'ADULTOS', 'NENES', 'BEBES',
    'SUPLETORIA', 'CUNAS', 'LT_TOMA_LLEGADA',
]

# drop() returns a new frame, so df0 is never modified; a missing column still
# raises KeyError, exactly as the individual drops did.
df_mix = df0.drop(columns=DROPPED_COLUMNS)

for _col in CATEGORY_COLUMNS:
    df_mix[_col] = df_mix[_col].astype('category')

# errors='raise' makes any non-numeric value fail loudly instead of becoming NaN.
for _cols, _dtype in (
    (NUMERIC_CATEGORY_COLUMNS, 'category'),
    (FLOAT_COLUMNS, float),
    (NULLABLE_INT_COLUMNS, 'Int64'),
):
    for _col in _cols:
        df_mix[_col] = pd.to_numeric(df_mix[_col], errors='raise').astype(_dtype)

In [8]:
# Separate the target (CANCELADA) from the predictors, then hold out 20% of the
# rows as a test split using a fixed seed.
y_mix = df_mix['CANCELADA']
X_mix = df_mix.drop('CANCELADA', axis=1)

X_mix_train, X_mix_test, y_mix_train, y_mix_test = train_test_split(
    X_mix,
    y_mix,
    random_state=42,
    test_size=0.2,
)

#### Modelo Optimizado

In [9]:
# Cross-validation splitter: 5 shuffled folds with a fixed seed.
kf = KFold(5, random_state=42, shuffle=True)

In [11]:
# Tuned LightGBM model (v2): cross-validated fit followed by a hold-out check.

# Single-point grid: only one hyper-parameter combination is evaluated.
param_grid_lgbm_cv_ohp = dict(
    n_estimators=[3000],
    max_depth=[25],
    learning_rate=[0.35],
    num_leaves=[75],
)

modelo_lgbm_cv_ohp = LGBMClassifier(random_state=42)

# Accuracy-scored search over the folds defined in `kf`, using all CPU cores.
grid_search_lgbm_cv_ohp = GridSearchCV(
    modelo_lgbm_cv_ohp,
    param_grid_lgbm_cv_ohp,
    scoring="accuracy",
    cv=kf,
    verbose=3,
    n_jobs=-1,
)
grid_search_lgbm_cv_ohp.fit(X_mix_train, y_mix_train)
print(f"Best HyperParameters: {grid_search_lgbm_cv_ohp.best_params_}")

# From here on the model name refers to the best estimator found by the search.
modelo_lgbm_cv_ohp = grid_search_lgbm_cv_ohp.best_estimator_

# Evaluation on the held-out test split.
y_mix_pred_test_lgbm_cv_ohp = modelo_lgbm_cv_ohp.predict(X_mix_test)
accuracy_test = accuracy_score(y_mix_test, y_mix_pred_test_lgbm_cv_ohp)
print("Accuracy Test     :", accuracy_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] Number of positive: 346797, number of negative: 564634
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.087809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3106
[LightGBM] [Info] Number of data points in the train set: 911431, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.380497 -> initscore=-0.487438
[LightGBM] [Info] Start training from score -0.487438
Best HyperParameters: {'learning_rate': 0.35, 'max_depth': 25, 'n_estimators': 3000, 'num_leaves': 75}
Accuracy Test     : 0.8141210754066129


#### Pickle

In [12]:
# Persist the fitted model to a pickle file in the current working directory.
_model_pkl = "modelo_lgbm_General.pkl"
with open(_model_pkl, mode="wb") as model_file:
    pickle.dump(modelo_lgbm_cv_ohp, model_file)

In [13]:
# Reload the persisted model from disk to check the pickle round-trip.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
_model_pkl = "modelo_lgbm_General.pkl"
with open(_model_pkl, mode="rb") as model_file:
    modelo_lgbm_General = pickle.load(model_file)

# Usage: modelo_lgbm_General.predict(<feature frame>)

In [14]:
# Score the reloaded model on the held-out test split (this overwrites
# `accuracy_test` with the reloaded model's accuracy).
y_mix_pred_test_lgbm_General = modelo_lgbm_General.predict(X_mix_test)
accuracy_test = accuracy_score(
    y_true=y_mix_test,
    y_pred=y_mix_pred_test_lgbm_General,
)
print("Accuracy Test     :", accuracy_test)

Accuracy Test     : 0.8141210754066129


In [None]:
# Spot on: the reloaded model reproduces exactly the same test accuracy, so the pickle round-trip works.