In [None]:
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_absolute_error

# === 1. Load the dataset ===
df = pd.read_csv("top20nuevo.csv")
df['periodo'] = pd.to_datetime(df['periodo'])

# Sanitize column names: every non-word character becomes an underscore.
df.columns = df.columns.str.replace(r"[^\w]", "_", regex=True)

# === 2. Features and target ===
# Every column except the identifier, the period and the target is a feature;
# the original column order is kept.
target = 'tn'
non_feature_cols = ('product_id', 'periodo', target)
features = [name for name in df.columns if name not in non_feature_cols]

# === 3. Split into train and validation ===
# Rows strictly before 2019-12-01 are used for training; rows whose period is
# exactly 2019-12-01 form the validation set.
is_train = df['periodo'] < '2019-12-01'
is_val = df['periodo'] == '2019-12-01'
df_train = df[is_train]
df_val = df[is_val]

# Feature matrices are copied so the casts below do not write into slices of df.
X_train, y_train = df_train[features].copy(), df_train[target]
X_val, y_val = df_val[features].copy(), df_val[target]

# === 4. Cast object columns to category ===
# The column list comes from X_train's object-dtype columns and is applied to
# both frames; each frame is cast on its own.
# NOTE(review): categories are therefore inferred separately for train and
# validation -- confirm the model aligns them at predict time.
object_cols = X_train.select_dtypes(include='object').columns
for frame in (X_train, X_val):
    for name in object_cols:
        frame[name] = frame[name].astype('category')

# === 5. Optuna objective function ===
def objective(trial):
    """Fit a LightGBM regressor with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial object used to sample every tunable hyperparameter.

    Returns:
        The mean absolute error between ``y_val`` and the model's predictions
        on ``X_val``. The model is fitted on the module-level ``X_train`` /
        ``y_train``.
    """
    # Hyperparameters sampled from the trial. The suggest_* calls are evaluated
    # in the order written here.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 512),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 300),
        # Regularizes leaf-wise growth.
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0),
        # aka bagging_fraction
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        # aka feature_fraction
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        # L1 regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        # L2 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'max_bin': trial.suggest_int('max_bin', 64, 512),
        # Minimum gain required to make a split.
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100.0),
        # 'goss' could be added here if subsample is not used.
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
    }
    # Settings that are the same for every trial.
    fixed = {'random_state': 42, 'verbosity': -1}

    regressor = lgb.LGBMRegressor(**sampled, **fixed)
    regressor.fit(X_train, y_train)
    return mean_absolute_error(y_val, regressor.predict(X_val))

# === 6. Run the Bayesian search with persistence ===
# The study is backed by a local SQLite file. With load_if_exists=True an
# existing study of the same name in that storage is reused rather than
# raising, so re-running this cell adds trials to it.
study_settings = {
    "direction": "minimize",
    "study_name": "lgbm_dstn201",
    "storage": "sqlite:///optuna_lgbm_top201nuevo.db",
    "load_if_exists": True,
}
study = optuna.create_study(**study_settings)
study.optimize(objective, n_trials=50)

# === 7. Show results ===
# Fetch the best trial once and report its objective value and parameters.
best_trial = study.best_trial
print("✅ Mejor MAE:", best_trial.value)
print("🧪 Mejores hiperparámetros:", best_trial.params)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-12 21:53:13,406] A new study created in RDB with name: lgbm_dstn201
[I 2025-07-12 22:24:18,028] Trial 0 finished with value: 0.0006900484413865793 and parameters: {'n_estimators': 828, 'learning_rate': 0.12429524985447538, 'max_depth': 15, 'num_leaves': 456, 'min_data_in_leaf': 286, 'min_child_weight': 0.5619194307398462, 'subsample': 0.726053141459008, 'subsample_freq': 7, 'colsample_bytree': 0.7797069254086433, 'colsample_bynode': 0.6383449194049172, 'reg_alpha': 4.072667326486733, 'reg_lambda': 0.34147791579771325, 'max_bin': 495, 'min_split_gain': 0.7124778300368457, 'cat_smooth': 42.47362024156224, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.0006900484413865793.
