# EQUIPO 36 | Avance 5: Modelo Final
## Proyecto: Predicción de infestaciones de gorgojo del agave
## Integrantes equipo 36:

| Nombre | Matrícula |
| ------ | --------- |
| André Martins Cordebello | A00572928 |
| Enrique Eduardo Solís Da Costa | A00572678 |
| Delbert Francisco Custodio Vargas | A01795613 |

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.ensemble import VotingClassifier, StackingClassifier
from imblearn.ensemble  import BalancedRandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from bayes_opt import BayesianOptimization
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.neighbors import BallTree
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import joblib
import optuna
import pickle
import time

## Cargamos el dataset final

In [2]:
df = pd.read_excel("data_with_weather_information.xlsx")

In [3]:
df.dtypes

tramp_id                               object
sampling_date                  datetime64[ns]
lat                                   float64
lon                                   float64
municipality                           object
plantation_age                          int64
capture_count                         float64
state                                  object
square_area_imputed                   float64
month                                   int64
year                                    int64
year-month                             object
day_of_year_sin                       float64
day_of_year_cos                       float64
day_of_week_sin                       float64
day_of_week_cos                       float64
week_of_year_sin                      float64
week_of_year_cos                      float64
month_sin                             float64
month_cos                             float64
critical_season                         int64
severity_encoded                  

# Modelos individuales: `LightGBM` y `CatBoost`

In [63]:
# Copiamos el dataframe con la información
train_test_df = df.copy()
train_test_df = train_test_df.sort_values(by='sampling_date').reset_index(drop=True)

# Hacemos un encoding basico para State y Municipalidad
for col in ['state', 'municipality']:
    le = LabelEncoder()
    train_test_df[col] = le.fit_transform(train_test_df[col])

# Generamos la mascara para obtener los datos de antes del 2025 y del 2025 por separado
train_mask = train_test_df['sampling_date'].dt.year < 2025
test_mask  = train_test_df['sampling_date'].dt.year == 2025

# Excluimos la variable objetivo (severity_encoded) y algunos variables o features que ya tenemos contenidos en nuestros
# features creados. `capture_count` no podemos tomarlo en cuenta porque se relaciona directamente con la severidad.
exclude_cols = [
    'severity_encoded','tramp_id', 'capture_count', 
    'month', 'year-month', 'sampling_date', 'municipality', 
    'state'
]

# Cargamos los features a tomar en cuenta (obviamos los features en exclude_cols)
features = [col for col in train_test_df.columns if col not in exclude_cols]

# Generamos nuestro split de entrenamiento y test por medio de las mascaras train_mask y test_mask
X_train, y_train = train_test_df.loc[train_mask, features], train_test_df.loc[train_mask, 'severity_encoded'] # El train dataset es la data historica de 2014 a 2024
X_test,  y_test  = train_test_df.loc[test_mask,  features], train_test_df.loc[test_mask,  'severity_encoded'] # El test dataset es la data a partir de 2025

### Modelo `LightGBM`

In [64]:
LightGBM_X_train = X_train.copy()
LightGBM_X_test  = X_test.copy()
LightGBM_Y_train = y_train.copy()
LightGBM_Y_test  = y_test.copy()

scaler = MinMaxScaler()
LightGBM_X_train[['distance_to_nearest_hotspot']] = scaler.fit_transform(LightGBM_X_train[['distance_to_nearest_hotspot']])
LightGBM_X_test[['distance_to_nearest_hotspot']] = scaler.transform(LightGBM_X_test[['distance_to_nearest_hotspot']])

lgbm_best_params = LGBMClassifier(
    boosting_type = "gbdt",
    objective = "multiclass",
    num_class = 4,
    class_weight = "balanced",
    is_unbalance = False,
    device_type = "gpu",
    min_gain_to_split = 0.001,  
    random_state = 42,
    verbose = -1,
    learning_rate= 0.031205207400998834,
    num_leaves= 110,
    max_depth= 11,
    feature_fraction= 0.91959030797181,
    bagging_fraction= 0.7694621015318531,
    lambda_l1= 1.8616621273598788,
    lambda_l2= 2.6453430076619573,
    min_child_samples= 70,
    n_estimators= 299
)

lgbm_best_params.fit(LightGBM_X_train, LightGBM_Y_train)

y_pred_lgbm_best = lgbm_best_params.predict(LightGBM_X_test, categorical_features=['critical_season'])


print("\n\nResultados para LightGBM después de usar Optuna:\n\n")
print(classification_report(LightGBM_Y_test, y_pred_lgbm_best, digits=3))

print("\n\nMatriz de confusión para LightGBM:\n")
print(confusion_matrix(y_test, y_pred_lgbm_best))
y_pred_proba_lgbm_best_params = lgbm_best_params.predict_proba(X_test)



Resultados para LightGBM después de usar Optuna:


              precision    recall  f1-score   support

           0      0.263     0.330     0.293     24445
           1      0.771     0.707     0.737     82928
           2      0.131     0.153     0.141      2544
           3      0.383     1.000     0.554       110

    accuracy                          0.611    110027
   macro avg      0.387     0.548     0.431    110027
weighted avg      0.643     0.611     0.625    110027



Matriz de confusión para LightGBM:

[[ 8079 16056   296    14]
 [21921 58619  2282   106]
 [  725  1373   389    57]
 [    0     0     0   110]]


In [65]:
# Guardamos el modelo para su futuro uso
with open('ligthgbm.pkl', 'wb') as file:
    pickle.dump(lgbm_best_params, file)

### Modelo `CatBoost`

In [66]:
CatBoost_X_train = X_train.copy()
CatBoost_X_test  = X_test.copy()
CatBoost_Y_train = y_train.copy()
CatBoost_Y_test  = y_test.copy()

scaler = MinMaxScaler()
CatBoost_X_train[['distance_to_nearest_hotspot']] = scaler.fit_transform(CatBoost_X_train[['distance_to_nearest_hotspot']])
CatBoost_X_test[['distance_to_nearest_hotspot']] = scaler.transform(CatBoost_X_test[['distance_to_nearest_hotspot']])


class_counts = CatBoost_Y_train.value_counts().sort_index()
num_classes = len(class_counts)
total = len(CatBoost_Y_train)
class_weights = {i: total / (num_classes * count) for i, count in class_counts.items()}

weights = CatBoost_Y_train.map(class_weights)

cat_model_best = CatBoostClassifier(                 
    loss_function='MultiClass',    
    eval_metric='TotalF1',         
    auto_class_weights='Balanced', 
    random_seed=42,
    task_type='GPU',               
    #verbose=100,
    iterations= 900,
    learning_rate= 0.03169683936081736,
    depth= 8,
    l2_leaf_reg= 1.0126729009882989,
    bagging_temperature= 0.13742770730370382,
    border_count= 238,
    random_strength= 1.490982729702571,
    grow_policy= 'SymmetricTree',
    logging_level= 'Silent'
)

categorical_cols = ['critical_season']

cat_model_best.fit(
    CatBoost_X_train,
    CatBoost_Y_train,
    cat_features=categorical_cols if 'categorical_cols' in locals() else None,
    eval_set=(CatBoost_X_test, CatBoost_Y_test),
    use_best_model=True
)

y_pred_cd_best = cat_model_best.predict(CatBoost_X_test)
y_pred_cd_best = y_pred_cd_best.flatten()

print("Resultados para CatBoost según Optuna:\n")
print(classification_report(CatBoost_Y_test, y_pred_cd_best))


print("\nMatriz de confusión para CatBoost:")
print(confusion_matrix(CatBoost_Y_test, y_pred_cd_best))
y_pred_proba_cv_best_params = cat_model_best.predict_proba(CatBoost_X_test)

Resultados para CatBoost según Optuna:

              precision    recall  f1-score   support

           0       0.31      0.20      0.24     24445
           1       0.76      0.70      0.73     82928
           2       0.07      0.48      0.12      2544
           3       0.33      1.00      0.49       110

    accuracy                           0.58    110027
   macro avg       0.37      0.59      0.40    110027
weighted avg       0.64      0.58      0.61    110027


Matriz de confusión para CatBoost:
[[ 4965 17050  2411    19]
 [11127 57787 13869   145]
 [  161  1105  1214    64]
 [    0     0     0   110]]


In [68]:
# Guardamos el modelo para su futuro uso
with open('catboost.pkl', 'wb') as file:
    pickle.dump(cat_model_best, file)

# Estrategias de ensamble homogeneas

## `LightGBM Bagging`

In [None]:

import lightgbm as lgb
import numpy as np
from sklearn.metrics import f1_score
from sklearn.utils import resample

# Cargamos nuestros train y test sets
Bagging_X_train = X_train.copy()
Bagging_Y_train = y_train.copy()
Bagging_X_test  = X_test.copy()
Bagging_Y_test  = y_test.copy()

#  Cantidad de modelos a generar para el Bagging
n_models = 15
models = []
oof_preds = []
test_preds = []


for i in range(n_models):
    
    # Generamos nuevos sets de entrenamiento y test por cada ciclo
    X_train_sub, y_train_sub = resample(Bagging_X_train, Bagging_Y_train, replace=True, random_state=42 + i)
    
    # Instanciamos LightGBM Classifier
    model = lgb.LGBMClassifier( 
                                boosting_type = "gbdt",
                                objective = "multiclass",
                                num_class = 4,
                                class_weight = "balanced",
                                is_unbalance = False,
                                device_type = "gpu",
                                min_gain_to_split = 0.001,  
                                random_state = 42,
                                verbose = -1,
                                learning_rate= 0.031205207400998834,
                                num_leaves= 110,
                                max_depth= 11,
                                feature_fraction= 0.91959030797181,
                                bagging_fraction= 0.7694621015318531,
                                lambda_l1= 1.8616621273598788,
                                lambda_l2= 2.6453430076619573,
                                min_child_samples= 70,
                                n_estimators= 299
                            )
    
    # Entrenamos sobre los sets de entrenamiento y test por ciclo
    model.fit(X_train_sub, y_train_sub)
    models.append(model)

    # Guardamos las predicciones out-of-fold
    y_pred = model.predict_proba(Bagging_X_test)
    test_preds.append(y_pred)

# Promediamos los resultados
avg_proba = np.mean(test_preds, axis=0)
y_pred_final = np.argmax(avg_proba, axis=1)

# Calculamos el F1 Score
f1_macro = f1_score(Bagging_Y_test, y_pred_final, average="macro")
print(f"F1-macro para Bagging con LightGBM: {f1_macro:.4f}")

# Imprimimos resultados
print("\n\nResultados para LightGBM después de usar Bagging:\n\n")
print(classification_report(Bagging_Y_test, y_pred_final, digits=3))

print("\n\nMatriz de confusión para LightGBM Bagging:\n")
print(confusion_matrix(Bagging_Y_test, y_pred_final))

F1-macro para Bagging con LightGBM: 0.4316


Resultados para LightGBM después de usar Optuna:


              precision    recall  f1-score   support

           0      0.257     0.283     0.269     24445
           1      0.767     0.747     0.757     82928
           2      0.171     0.132     0.149      2544
           3      0.381     1.000     0.551       110

    accuracy                          0.630    110027
   macro avg      0.394     0.541     0.432    110027
weighted avg      0.639     0.630     0.634    110027



Matriz de confusión para LightGBM Bagging:

[[ 6914 17361   155    15]
 [19367 61978  1476   107]
 [  650  1501   336    57]
 [    0     0     0   110]]


## `LightGBM Stacking`

In [74]:
X_train_stack = X_train.copy()
y_train_stack = y_train.copy()
X_test_stack  = X_test.copy()
y_test_stack  = y_test.copy()

lgbm_1 = lgb.LGBMClassifier(
    boosting_type='gbdt', 
    objective='multiclass', 
    num_class=4,            
    class_weight='balanced',
    learning_rate=0.05,
    n_estimators=200,
    max_depth=-1,
    num_leaves=31,
    random_state=42,
    device='gpu',
    is_unbalance=False
)

lgbm_2 = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=4,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=10,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    random_state=42,
    device_type='gpu',
    is_unbalance=False,
    class_weight='balanced'
)

lgbm_3 = lgb.LGBMClassifier(
    boosting_type = "gbdt",
    objective = "multiclass",
    num_class = 4,
    class_weight = "balanced",
    is_unbalance = False,
    device_type = "gpu",
    min_gain_to_split = 0.001,  
    random_state = 42,
    verbose = -1,
    learning_rate= 0.031205207400998834,
    num_leaves= 110,
    max_depth= 11,
    feature_fraction= 0.91959030797181,
    bagging_fraction= 0.7694621015318531,
    lambda_l1= 1.8616621273598788,
    lambda_l2= 2.6453430076619573,
    min_child_samples= 70,
    n_estimators= 299
)

meta_model = LogisticRegression(max_iter=1000, multi_class='multinomial')


stack_model = StackingClassifier(
    estimators=[
        ('lgbm1', lgbm_1),
        ('lgbm2', lgbm_2),
        ('lgbm3', lgbm_3)
    ],
    final_estimator=meta_model,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

stack_model.fit(X_train_stack, y_train_stack)

y_pred = stack_model.predict(X_test_stack)

print("Resultados luego de hacer LightGBM Stacking")
print(classification_report(y_test_stack, y_pred, digits=4))

print("\nMatriz de confusion para LightGBM stacking:\n")
print(confusion_matrix(y_test_stack, y_pred))



Resultados luego de hacer LightGBM Stacking
              precision    recall  f1-score   support

           0     0.2677    0.2617    0.2647     24445
           1     0.7679    0.7950    0.7812     82928
           2     0.0800    0.0008    0.0016      2544
           3     0.3976    0.9182    0.5549       110

    accuracy                         0.6583    110027
   macro avg     0.3783    0.4939    0.4006    110027
weighted avg     0.6405    0.6583    0.6482    110027


Matriz de confusion para LightGBM stacking:

[[ 6397 18034     6     8]
 [16898 65927    11    92]
 [  599  1890     2    53]
 [    0     3     6   101]]


## `CatBoost Bagging`

In [75]:
# Generamos nuestros Train y Test datasets
Bagging_X_train = X_train.copy()
Bagging_Y_train = y_train.copy()
Bagging_X_test  = X_test.copy()
Bagging_Y_test  = y_test.copy()

#  Cantidad de modelos a generar para el Bagging
n_models = 5
models = []
oof_preds = []
test_preds = []

scaler = MinMaxScaler()
Bagging_X_train[['distance_to_nearest_hotspot']] = scaler.fit_transform(Bagging_X_train[['distance_to_nearest_hotspot']])
Bagging_X_test[['distance_to_nearest_hotspot']] = scaler.transform(Bagging_X_test[['distance_to_nearest_hotspot']])


class_counts = Bagging_Y_train.value_counts().sort_index()
num_classes = len(class_counts)
total = len(Bagging_Y_train)
class_weights = {i: total / (num_classes * count) for i, count in class_counts.items()}

weights = Bagging_Y_train.map(class_weights)

categorical_cols = ['critical_season']

for i in range(n_models):
    
    X_train_sub, y_train_sub = resample(Bagging_X_train, Bagging_Y_train, replace=True, random_state=42 + i)

    model = CatBoostClassifier(                 
        loss_function='MultiClass',    
        eval_metric='TotalF1',         
        auto_class_weights='Balanced', 
        random_seed=42,
        task_type='GPU',               
        #verbose=100,
        iterations= 900,
        learning_rate= 0.03169683936081736,
        depth= 8,
        l2_leaf_reg= 1.0126729009882989,
        bagging_temperature= 0.13742770730370382,
        border_count= 238,
        random_strength= 1.490982729702571,
        grow_policy= 'SymmetricTree',
        logging_level= 'Silent'
    )
    
    
    model.fit(
        X_train_sub,
        y_train_sub,
        cat_features=categorical_cols if 'categorical_cols' in locals() else None,
        eval_set=(Bagging_X_test, Bagging_Y_test),
        use_best_model=True,
        verbose=False)
    
    models.append(model)

    # Guardamos las predicciones out-of-fold
    y_pred = model.predict_proba(Bagging_X_test)
    test_preds.append(y_pred)
    
    
avg_proba = np.mean(test_preds, axis=0)
y_pred_final = np.argmax(avg_proba, axis=1)

f1_macro = f1_score(Bagging_Y_test, y_pred_final, average="macro")
print(f"F1-macro para Bagging con CatBoost: {f1_macro:.4f}")


print("\n\nResultados para CatBoost después de usar Bagging:\n\n")
print(classification_report(Bagging_Y_test, y_pred_final, digits=3))

print("\n\nMatriz de confusión para CatBoost Bagging:\n")
print(confusion_matrix(Bagging_Y_test, y_pred_final))

F1-macro para Bagging con CatBoost: 0.3919


Resultados para CatBoost después de usar Bagging:


              precision    recall  f1-score   support

           0      0.286     0.156     0.202     24445
           1      0.752     0.737     0.744     82928
           2      0.076     0.451     0.131      2544
           3      0.325     1.000     0.491       110

    accuracy                          0.601    110027
   macro avg      0.360     0.586     0.392    110027
weighted avg      0.632     0.601     0.609    110027



Matriz de confusión para CatBoost Bagging:

[[ 3821 18976  1629    19]
 [ 9445 61088 12250   145]
 [  113  1220  1147    64]
 [    0     0     0   110]]


## `Stacking CatBoost` 

In [78]:
X_train_stack = X_train.copy()
y_train_stack = y_train.copy()
X_test_stack  = X_test.copy()
y_test_stack  = y_test.copy()


cb_1 = CatBoostClassifier(                 
    loss_function='MultiClass',    
    eval_metric='TotalF1',         
    auto_class_weights='Balanced', 
    random_seed=42,
    task_type='GPU',               
    iterations=900,
    learning_rate=0.03169683936081736,
    depth=8,
    l2_leaf_reg=1.0126729009882989,
    bagging_temperature=0.13742770730370382,
    border_count=238,
    random_strength=1.490982729702571,
    grow_policy='SymmetricTree',
    verbose=0  # evita conflictos con verbose
)

cb_2 = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=10,
    l2_leaf_reg=3,
    loss_function="MultiClass",
    eval_metric="TotalF1",
    auto_class_weights="Balanced",
    task_type="GPU",
    random_seed=100,
    verbose=0
)

cb_3 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.02,
    depth=8,
    l2_leaf_reg=2,
    loss_function="MultiClass",
    eval_metric="TotalF1",
    auto_class_weights="Balanced",
    task_type="GPU",
    random_seed=2025,
    verbose=0
)



meta_model = LogisticRegression(
    max_iter=2000,
    multi_class='multinomial',
    solver='lbfgs'
)


models = [cb_1, cb_2, cb_3]

train_probas = []
test_probas  = []
results = []

for i, model in enumerate(models, 1):
    print(f"\nCatBoost #{i} ...")
    start = time.time()
    
    model.fit(X_train_stack, y_train_stack)
    
    y_train_pred = model.predict_proba(X_train_stack)
    y_test_pred  = model.predict_proba(X_test_stack)
    
    train_probas.append(y_train_pred)
    test_probas.append(y_test_pred)
    
    y_pred_class = np.argmax(y_test_pred, axis=1)
    f1_macro = f1_score(y_test_stack, y_pred_class, average="macro")
    elapsed = time.time() - start
    
    results.append([f"CatBoost_Model_{i}", f1_macro, elapsed])
    print(f"Modelo #{i} Listo — F1-macro: {f1_macro:.4f} | Tiempo: {elapsed:.2f}s")

results_df = pd.DataFrame(results, columns=["Model", "F1-macro", "Training Time (s)"])
print(results_df)

X_meta_train = np.hstack(train_probas)
X_meta_test  = np.hstack(test_probas)

meta_model = LogisticRegression(
    max_iter=2000,
    multi_class='multinomial',
    solver='lbfgs'
)

meta_model.fit(X_meta_train, y_train_stack)

y_pred_stack = meta_model.predict(X_meta_test)

print("\nCatBoost Stacking resultados:")
print(classification_report(y_test_stack, y_pred_stack, digits=4))
print("\nMatriz de confusion:")
print(confusion_matrix(y_test_stack, y_pred_stack))


CatBoost #1 ...
Modelo #1 Listo — F1-macro: 0.4071 | Tiempo: 25.74s

CatBoost #2 ...
Modelo #2 Listo — F1-macro: 0.3695 | Tiempo: 16.61s

CatBoost #3 ...
Modelo #3 Listo — F1-macro: 0.3757 | Tiempo: 13.60s
              Model  F1-macro  Training Time (s)
0  CatBoost_Model_1  0.407078          25.741772
1  CatBoost_Model_2  0.369531          16.608711
2  CatBoost_Model_3  0.375697          13.601240





CatBoost Stacking resultados:
              precision    recall  f1-score   support

           0     0.2302    0.1933    0.2102     24445
           1     0.7579    0.8163    0.7860     82928
           2     0.1515    0.0039    0.0077      2544
           3     0.3717    0.3818    0.3767       110

    accuracy                         0.6587    110027
   macro avg     0.3778    0.3489    0.3451    110027
weighted avg     0.6263    0.6587    0.6397    110027


Matriz de confusion:
[[ 4726 19711     3     5]
 [15156 67697    27    48]
 [  647  1869    10    18]
 [    0    42    26    42]]


# Estrategias de ensamble heterogéneas

## `VotingClassifier`

In [66]:
Voting_X_train = X_train.copy()
Voting_Y_train = y_train.copy()
Voting_X_test  = X_test.copy()
Voting_Y_test  = y_test.copy()

tscv = TimeSeriesSplit(n_splits=3)

def objective(trial):
    w1 = trial.suggest_float("lgb_weight", 0.0, 2.0)
    w2 = trial.suggest_float("cb_weight", 0.0, 2.0)
    f1_scores = []

    for train_idx, val_idx in tscv.split(Voting_X_train):
        X_tr, X_val = Voting_X_train.iloc[train_idx], Voting_X_train.iloc[val_idx]
        y_tr, y_val = Voting_Y_train.iloc[train_idx], Voting_Y_train.iloc[val_idx]

        prob_lgb = lgbm_best_params.predict_proba(X_val)
        prob_cb  = cat_model_best.predict_proba(X_val)

        combined_proba = (w1 * prob_lgb + w2 * prob_cb) / (w1 + w2 + 1e-8)
        y_pred = np.argmax(combined_proba, axis=1)

        f1 = f1_score(y_val, y_pred, average="macro")
        f1_scores.append(f1)

    return np.mean(f1_scores)

study_name = "VotingClassifier_Optim"
storage_name = f"sqlite:///{study_name}.db"

study_VotingClassifier = optuna.create_study(study_name=study_name, storage=storage_name, direction='maximize', load_if_exists=True)
study_VotingClassifier.optimize(objective, n_trials=30, show_progress_bar=False)

best_w1 = study_VotingClassifier.best_params["lgb_weight"]
best_w2 = study_VotingClassifier.best_params["cb_weight"]

prob_lgb_test = lgbm_best_params.predict_proba(Voting_X_test)
prob_cb_test  = cat_model_best.predict_proba(Voting_X_test)
combined_proba_test = (best_w1 * prob_lgb_test + best_w2 * prob_cb_test) / (best_w1 + best_w2 + 1e-8)

y_pred_voting = np.argmax(combined_proba_test, axis=1)

print(f"\n\nMejor F1-Macro: {study_VotingClassifier.best_value:.4f}")
print(f"Pesos óptimos: {study_VotingClassifier.best_params}")

[I 2025-10-21 22:48:32,944] Using an existing study with name 'VotingClassifier_Optim' instead of creating a new one.
[I 2025-10-21 22:48:42,403] Trial 20 finished with value: 0.5792678789619482 and parameters: {'lgb_weight': 0.4403983508656484, 'cb_weight': 0.14649566927715146}. Best is trial 6 with value: 0.5799062965369872.
[I 2025-10-21 22:48:51,674] Trial 21 finished with value: 0.5793159451961619 and parameters: {'lgb_weight': 1.7154732357023226, 'cb_weight': 1.2617068154368596}. Best is trial 6 with value: 0.5799062965369872.
[I 2025-10-21 22:49:00,919] Trial 22 finished with value: 0.5799092167221863 and parameters: {'lgb_weight': 0.6693836902992534, 'cb_weight': 0.011682848753281995}. Best is trial 22 with value: 0.5799092167221863.
[I 2025-10-21 22:49:10,517] Trial 23 finished with value: 0.5798696485244378 and parameters: {'lgb_weight': 0.8110971831590188, 'cb_weight': 0.17634646392270098}. Best is trial 22 with value: 0.5799092167221863.
[I 2025-10-21 22:49:20,130] Trial 24



Mejor F1-Macro: 0.5799
Pesos óptimos: {'lgb_weight': 1.9516866156032004, 'cb_weight': 0.6156930840292797}


In [61]:
# Crear VotingClassifier 
voting_model = VotingClassifier( 
                                estimators=[ ('lgb', lgbm_best_params), ('cb', cat_model_best) ], 
                                voting='soft', 
                                weights=[0.5685868882216454, 0.010730550423117324], 
                                n_jobs=-1 
                            ) 

start = time.time() 

voting_model.fit(Voting_X_train, Voting_Y_train) 

train_time = time.time() - start 

y_pred_voting = voting_model.predict(Voting_X_test) 
y_proba_voting = voting_model.predict_proba(Voting_X_test) 

print(f"Tiempo de entrenamiento: {train_time:.2f} segundos\n") 
print("\nReporte de clasificación (VotingClassifier)") 
print(classification_report(Voting_Y_test, y_pred_voting, digits=3)) 
print("\nMatriz de confusión para VotingClassifier:") 
print(confusion_matrix(Voting_Y_test, y_pred_voting))

Tiempo de entrenamiento: 383.96 segundos


Reporte de clasificación (VotingClassifier)
              precision    recall  f1-score   support

           0      0.257     0.293     0.274     24445
           1      0.767     0.728     0.747     82928
           2      0.124     0.158     0.139      2544
           3      0.383     1.000     0.554       110

    accuracy                          0.618    110027
   macro avg      0.383     0.545     0.428    110027
weighted avg      0.638     0.618     0.628    110027


Matriz de confusión para VotingClassifier:
[[ 7158 16941   332    14]
 [19956 60344  2522   106]
 [  687  1397   403    57]
 [    0     0     0   110]]
