# Librerías utilizadas a lo largo del proyeto

In [1]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.ensemble import VotingClassifier, StackingClassifier
from imblearn.ensemble  import BalancedRandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from bayes_opt import BayesianOptimization
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.neighbors import BallTree
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import joblib
import optuna
import pickle
import time

  from .autonotebook import tqdm as notebook_tqdm


# Utilizando SMOTE en el dataset completo

In [2]:
df = pd.read_excel('data_with_weather_information.xlsx')

SMOTE_df = df.copy()

for col in ['state', 'municipality']:
    le = LabelEncoder()
    SMOTE_df[col] = le.fit_transform(SMOTE_df[col])

train_mask = (SMOTE_df['sampling_date'].dt.year < 2025) & (SMOTE_df['sampling_date'].dt.year >= 2014)
test_mask  = SMOTE_df['sampling_date'].dt.year == 2025

exclude_cols = [
    'severity_encoded','tramp_id', 'capture_count',
    'month', 'year-month', 'sampling_date',
]

features = [col for col in SMOTE_df.columns if col not in exclude_cols]

X_train, y_train = SMOTE_df.loc[train_mask, features], SMOTE_df.loc[train_mask, 'severity_encoded'] # El train dataset es la data historica de 2014 a 2024
X_test,  y_test  = SMOTE_df.loc[test_mask,  features], SMOTE_df.loc[test_mask,  'severity_encoded'] # El test dataset es la data a partir de 2025


categorical_features = [
    X_train.columns.get_loc(col)
    for col in ['municipality', 'state', 'critical_season' ]
]

smote_nc = SMOTENC(
    categorical_features=categorical_features,
    random_state=42,
    sampling_strategy='auto'
)

X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)

In [3]:
print("Antes de SMOTENC:")
print(y_train.value_counts(normalize=True))

print("\nDespues de SMOTENC:")
print(pd.Series(y_resampled).value_counts(normalize=True))

Antes de SMOTENC:
severity_encoded
1    0.607012
0    0.371983
2    0.019409
3    0.001596
Name: proportion, dtype: float64

Despues de SMOTENC:
severity_encoded
0    0.25
2    0.25
1    0.25
3    0.25
Name: proportion, dtype: float64


In [4]:
# Creamos un dataFrame con la data sintetica
X_resampled_df = pd.DataFrame(X_resampled, columns=features)
y_resampled_df = pd.DataFrame(y_resampled, columns=['severity_encoded'])

# Combinamos los feature y target para ver las dimensiones del nuevo trainset
SMOTE_resampled_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)

SMOTE_resampled_df.shape

(1742924, 23)

In [9]:
chunk_size = 800000
output_file = 'SMOTE_resampled_train.xlsx'

with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    for i in range(0, len(SMOTE_resampled_df), chunk_size):
        chunk_df = SMOTE_resampled_df.iloc[i:i + chunk_size]
        sheet_name = f'Sheet_{i // chunk_size + 1}'
        chunk_df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"✅ DataFrame con {len(SMOTE_resampled_df):,} filas exportado en Excel correctamente.")

✅ DataFrame con 1,742,924 filas exportado en Excel correctamente.


## LightGBM + Optuna + SMOTE

In [26]:
# Copiamos el train set luego del resampling
lgbm_df = SMOTE_resampled_df.copy()

# Columnas que no son features
exclude_cols = [
    'severity_encoded', 'tramp_id', 'capture_count', 
    'month', 'year-month', 'sampling_date', 'municipality', 'state'
]

# Seleccionamos solos los features
features = [col for col in lgbm_df.columns if col not in exclude_cols]

X = lgbm_df[features]
y = lgbm_df['severity_encoded']


def objective_lgbm(trial):
    params = {
        "boosting_type": "gbdt",
        "objective": "multiclass",
        "num_class": 4,
        "class_weight": "balanced",
        "is_unbalance": False,
        "device_type": "gpu",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 120),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 50, 200),
        "min_gain_to_split": 0.001,
        "n_estimators": trial.suggest_int("n_estimators", 200, 600),
        "random_state": 42,
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 50, 150),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.001, 0.1),
        "verbose": -1,
        "force_col_wise" : True,
        "bagging_freq": 5,
    }

    model = LGBMClassifier(**params)
    model.fit(X, y)

    # Predicciones en test
    y_pred = model.predict(X_test[features])

    # Evaluación por F1 Macro en el test set
    f1_macro = f1_score(y_test, y_pred, average='macro')


    return f1_macro

study_name = "SMOTE_LightGBM_Optimization"
storage_name = f"sqlite:///{study_name}.db"

lgbm_study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction='maximize',
    load_if_exists=True
)


lgbm_study.optimize(objective_lgbm, n_trials=30)

print("\n\n✅ Mejores parámetros encontrados:")
print(lgbm_study.best_params)
print(f"✅ Mejor F1 Macro en test: {lgbm_study.best_value:.4f}")

[I 2025-11-01 22:04:44,544] Using an existing study with name 'SMOTE_LightGBM_Optimization' instead of creating a new one.
[I 2025-11-01 22:05:19,066] Trial 19 finished with value: 0.4346099706250123 and parameters: {'learning_rate': 0.04373914146931453, 'num_leaves': 62, 'max_depth': 3, 'feature_fraction': 0.8796413997894356, 'bagging_fraction': 0.9859871173148639, 'lambda_l1': 0.053848167315025286, 'lambda_l2': 3.173025241142264, 'min_child_samples': 128, 'n_estimators': 248, 'min_data_in_leaf': 143, 'min_gain_to_split': 0.02116881542156135}. Best is trial 19 with value: 0.4346099706250123.
[I 2025-11-01 22:05:56,206] Trial 20 finished with value: 0.4332683137383515 and parameters: {'learning_rate': 0.03808518361276278, 'num_leaves': 53, 'max_depth': 3, 'feature_fraction': 0.885443492250489, 'bagging_fraction': 0.9864162163803565, 'lambda_l1': 0.1170003174520391, 'lambda_l2': 3.184680570619044, 'min_child_samples': 120, 'n_estimators': 268, 'min_data_in_leaf': 149, 'min_gain_to_split



✅ Mejores parámetros encontrados:
{'learning_rate': 0.06135176595096744, 'num_leaves': 57, 'max_depth': 3, 'feature_fraction': 0.8365377773708654, 'bagging_fraction': 0.9732747345344976, 'lambda_l1': 0.3259595787450307, 'lambda_l2': 3.094387437597022, 'min_child_samples': 115, 'n_estimators': 246, 'min_data_in_leaf': 150, 'min_gain_to_split': 0.00663956054949366}
✅ Mejor F1 Macro en test: 0.4378


In [27]:
lgbm_best_params = lgbm_study.best_params
lgbm_best_params

{'learning_rate': 0.06135176595096744,
 'num_leaves': 57,
 'max_depth': 3,
 'feature_fraction': 0.8365377773708654,
 'bagging_fraction': 0.9732747345344976,
 'lambda_l1': 0.3259595787450307,
 'lambda_l2': 3.094387437597022,
 'min_child_samples': 115,
 'n_estimators': 246,
 'min_data_in_leaf': 150,
 'min_gain_to_split': 0.00663956054949366}

In [30]:
lgbm_best_params = LGBMClassifier(
    boosting_type = "gbdt",
    objective = "multiclass",
    num_class = 4,
    class_weight = "balanced",
    is_unbalance = False,
    device_type = "gpu",
    min_gain_to_split = 0.001,  
    random_state = 42,
    verbose = -1,
    learning_rate= 0.06135176595096744,
    num_leaves= 57,
    max_depth= 3,
    feature_fraction= 0.8365377773708654,
    bagging_fraction= 0.9732747345344976,
    lambda_l1= 0.3259595787450307,
    lambda_l2= 3.094387437597022,
    min_child_samples= 115,
    n_estimators= 246,
    min_data_in_leaf = 150
    
)

lgbm_best_params.fit(X, y)

y_pred_lgbm_best = lgbm_best_params.predict(X_test[features])


print("Resultados para LightGBM después de usar Optuna y SMOTE:\n\n")
print(classification_report(y_test, y_pred_lgbm_best, digits=3))

print("\n\nMatriz de confusión con LightGBM y SMOTE:\n")
print(confusion_matrix(y_test,y_pred_lgbm_best))

y_pred_proba_lgbm_best_params = lgbm_best_params.predict_proba(X_test[features])

Resultados para LightGBM después de usar Optuna y SMOTE:


              precision    recall  f1-score   support

           0      0.252     0.175     0.206     24445
           1      0.759     0.809     0.783     82928
           2      0.160     0.273     0.202      2544
           3      0.395     0.955     0.559       110

    accuracy                          0.656    110027
   macro avg      0.391     0.553     0.437    110027
weighted avg      0.632     0.656     0.641    110027



Matriz de confusión con LightGBM y SMOTE:

[[ 4268 19963   201    13]
 [12258 67128  3442   100]
 [  396  1405   695    48]
 [    0     0     5   105]]


# T-SNE

In [15]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
sampling_date,827856.0,2018-12-29 19:19:48.102278144,2014-01-01 00:00:00,2016-05-19 00:00:00,2017-11-10 00:00:00,2021-12-28 00:00:00,2025-08-31 00:00:00,
lat,827856.0,20.642391,18.987685,20.161786,20.597498,20.884595,23.439659,0.728798
lon,827856.0,-102.755748,-114.108837,-103.721093,-102.65035,-102.184024,-98.662612,1.30121
plantation_age,827856.0,2.428498,0.0,1.0,2.0,4.0,19.0,1.825808
capture_count,827856.0,4.264352,0.0,0.0,2.0,5.0,427.0,8.156295
square_area_imputed,827856.0,9.435819,0.0047,1.88,4.0,9.57,787.82,16.249825
month,827856.0,7.05305,1.0,4.0,7.0,10.0,12.0,3.359216
year,827856.0,2018.446555,2014.0,2016.0,2017.0,2021.0,2025.0,3.652609
day_of_year_sin,827856.0,-0.125555,-0.999991,-0.811539,-0.255353,0.559589,0.999991,0.704395
day_of_year_cos,827856.0,-0.020622,-0.999963,-0.708627,-0.03012,0.664855,1.0,0.698312
