In [1]:
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_absolute_error

# === 1. Load the dataset ===
# 'periodo' is parsed to datetime so it can be compared against the
# validation period used for the split below.
df = pd.read_csv("dataset_base.csv")
df['periodo'] = pd.to_datetime(df['periodo'])

# Sanitize column names: every non-word character becomes an underscore.
df.columns = df.columns.str.replace(r"[^\w]", "_", regex=True)

# === 2. Features and target ===
target = 'tn'
# The identifier, the date and the target itself are not model inputs;
# every other column of the frame is used as a feature.
non_feature_cols = ('product_id', 'periodo', 'tn')
features = [name for name in df.columns if name not in non_feature_cols]

# === 3. Train / validation split ===
# Rows strictly before the validation period are used for training; the rows
# of the validation period itself are held out.
validation_period = '2019-12-01'
df_train = df.loc[df['periodo'] < validation_period]
df_val = df.loc[df['periodo'] == validation_period]

# .copy() gives independent frames so the dtype casts below do not write
# into slices of df.
X_train, y_train = df_train[features].copy(), df_train[target]
X_val, y_val = df_val[features].copy(), df_val[target]

# === 4. Cast object columns to category ===
# The list of object columns is taken once from X_train and applied to both
# frames.
object_columns = X_train.select_dtypes(include='object').columns
for frame in (X_train, X_val):
    for col in object_columns:
        frame[col] = frame[col].astype('category')

# === 5. Optuna objective function ===
def objective(trial):
    """Train one LightGBM regressor for an Optuna trial and score it.

    The hyperparameters are sampled from ``trial``; the model is fitted on
    the module-level ``X_train`` / ``y_train`` and evaluated on the
    module-level ``X_val`` / ``y_val``.

    Args:
        trial: The Optuna trial used to sample each hyperparameter.

    Returns:
        The mean absolute error of the predictions on the validation set.
    """
    # Search space. Names, ranges and the order of the suggestions are part
    # of the persisted study, so they are kept exactly as they are.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float(
            'learning_rate', 0.005, 0.3, log=True
        ),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 512),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 300),
        # Regularizes leaf-wise growth.
        'min_child_weight': trial.suggest_float(
            'min_child_weight', 0.001, 10.0
        ),
        # aka bagging_fraction
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        # aka feature_fraction
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        # L1 regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        # L2 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'max_bin': trial.suggest_int('max_bin', 64, 512),
        # Minimum gain required to perform a split.
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100.0),
    }
    # NOTE: 'boosting_type' is not part of the search space; a categorical
    # suggestion over 'gbdt' / 'dart' is left disabled ('goss' would only be
    # an option when subsample is not used).

    # Settings that are the same for every trial.
    fixed_settings = {'random_state': 42, 'verbosity': -1}

    regressor = lgb.LGBMRegressor(**search_space, **fixed_settings)
    regressor.fit(X_train, y_train)
    return mean_absolute_error(y_val, regressor.predict(X_val))

# === 6. Run the Bayesian search with persistence ===
# The study is stored in a local SQLite file; with load_if_exists=True an
# existing study of the same name is reused instead of raising an error.
study_settings = {
    "direction": "minimize",
    "study_name": "lgbm_datasetbase",
    "storage": "sqlite:///optuna_lgbm_datasetbase.db",
    "load_if_exists": True,
}
study = optuna.create_study(**study_settings)
study.optimize(objective, n_trials=30)

# === 7. Show the results ===
# Value and parameters are both read from the best trial of the study.
best_trial = study.best_trial
print(" Mejor MAE:", best_trial.value)
print(" Mejores hiperparámetros:", best_trial.params)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-14 10:56:20,369] A new study created in RDB with name: lgbm_datasetbase
[I 2025-07-14 11:00:56,650] Trial 0 finished with value: 0.029371363910001284 and parameters: {'n_estimators': 865, 'learning_rate': 0.010393621677634825, 'max_depth': 9, 'num_leaves': 409, 'min_data_in_leaf': 249, 'min_child_weight': 7.994555833371292, 'subsample': 0.6243098399394597, 'subsample_freq': 4, 'colsample_bytree': 0.7630448169559698, 'colsample_bynode': 0.5232727532044896, 'reg_alpha': 1.2613551291675522, 'reg_lambda': 3.842877838913601, 'max_bin': 104, 'min_split_gain': 0.5055658548184295, 'cat_smooth': 70.61527805462639}. Best is trial 0 with value: 0.029371363910001284.
[I 2025-07-14 11:05:06,765] Trial 1 finished with value: 0.01808622141197805 and parameters: {'n_estimators': 718, 'learning_rate': 0.01843666909042403, 'max_depth': 11, 'num_leaves': 353, 'min_data_in_leaf': 60, 'min_child_weight': 2.7721450057144104, 'subsample': 0.8798189

 Mejor MAE: 0.011013104984155293
 Mejores hiperparámetros: {'n_estimators': 834, 'learning_rate': 0.06449926163783713, 'max_depth': 13, 'num_leaves': 197, 'min_data_in_leaf': 208, 'min_child_weight': 3.7932779938198546, 'subsample': 0.7032151245633396, 'subsample_freq': 7, 'colsample_bytree': 0.9893937066314805, 'colsample_bynode': 0.8148358693555268, 'reg_alpha': 4.962755134948597, 'reg_lambda': 3.8191748367071927, 'max_bin': 512, 'min_split_gain': 0.006311109685921704, 'cat_smooth': 49.82693114488869}
