In [None]:
pip install optuna

In [None]:
pip install -U scikit-learn


In [None]:
# ------------------------- 1. IMPORTS -------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import holidays
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
import optuna

# Plot styling applied to every figure produced below.
plt.style.use("seaborn-v0_8-whitegrid")

# -------------------- 2. LOAD AND CLEAN ---------------------
# Read the daily-occupancy CSV (parsing "fecha" as datetime), discard the
# first four rows, keep only records dated on or before 2021-01-01, and
# renumber the index from zero.
df = (
    pd.read_csv("/content/ocupacion_diaria_hotel.csv", parse_dates=["fecha"])
    .iloc[4:]
    .loc[lambda frame: frame["fecha"] <= "2021-01-01"]
    .reset_index(drop=True)
)

# -------------------- 3. FEATURE ENGINEERING -----------------
# Calendar features: day of week and month, each also encoded as a
# sine/cosine pair over its period (7 days, 12 months).
df["dow"] = df["fecha"].dt.dayofweek
df["dow_sin"] = np.sin(2 * np.pi * df["dow"] / 7)
df["dow_cos"] = np.cos(2 * np.pi * df["dow"] / 7)
df["month"] = df["fecha"].dt.month
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)

# Binary flags.
df["es_pandemia"] = (df["fecha"] >= "2020-03-01").astype(int)
df["es_viernes_o_sabado"] = df["dow"].isin([4, 5]).astype(int)
df["temporada_alta"] = df["month"].isin([6, 7, 11]).astype(int)

# Build the Mexican holiday calendar for every year actually present in the
# data instead of a fixed [2019, 2020] list: the date filter keeps
# 2021-01-01, which a fixed list leaves unflagged.
holiday_years = sorted({int(y) for y in df["fecha"].dt.year.dropna().unique()})
mx_holidays = holidays.MX(years=holiday_years)
# The calendar is keyed by datetime.date; convert to Timestamps so the
# membership test compares like with like rather than relying on implicit
# date -> datetime64 coercion inside isin().
holiday_stamps = pd.to_datetime(list(mx_holidays.keys()))
df["es_festivo"] = df["fecha"].dt.normalize().isin(holiday_stamps).astype(int)

# Autoregressive features: occupancy 1, 7 and 14 rows back, plus the mean of
# the 7 rows preceding the current one (shift(1) excludes the current row).
for lag in (1, 7, 14):
    df[f"lag_{lag}"] = df["personas"].shift(lag)
df["rolling_mean_7"] = df["personas"].shift(1).rolling(7).mean()

# The first rows have no complete lag history; drop them.
df = df.dropna().reset_index(drop=True)

# ---------------- 4. MODEL VARIABLES --------------------------
# Continuous columns that are MinMax-scaled (the target is scaled too).
features_to_scale = [
    "personas", "lag_1", "lag_7", "lag_14", "rolling_mean_7",
    "dow_sin", "dow_cos", "month_sin", "month_cos",
]

# Columns fed to the model, in the order the model expects them.
feature_cols = [
    "lag_1", "lag_7", "lag_14", "rolling_mean_7",
    "dow_sin", "dow_cos", "month_sin", "month_cos",
    "temporada_alta", "es_festivo",
    "es_viernes_o_sabado", "es_pandemia",
]
target_col = "personas"

# --------------- 5. TEMPORAL TRAIN-TEST SPLIT ----------------
# Everything strictly before FECHA_TEST is training data; the rest is test.
FECHA_TEST = pd.to_datetime("2020-08-10")
mask_train = df["fecha"] < FECHA_TEST
mask_test = ~mask_train

# Fit the scaler on the training rows only, then apply it to the whole frame.
# Fitting on all rows would let test-period minima/maxima leak into training.
# Test values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(df.loc[mask_train, features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])

X_train, y_train = df.loc[mask_train, feature_cols], df.loc[mask_train, target_col]
X_test, y_test = df.loc[mask_test, feature_cols], df.loc[mask_test, target_col]
fechas_test = df.loc[mask_test, "fecha"]

print(f"Tamaño train: {X_train.shape} | Tamaño test: {X_test.shape}")

# -------- 6. BASELINE LIGHTGBM MODEL (FIXED VALUES) ------------
# Reference model with hand-picked hyperparameters, used as the baseline the
# tuned model is compared against.
lgb_base = LGBMRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.08,
    subsample=0.8,
    # LightGBM only applies row subsampling when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
lgb_base.fit(X_train, y_train)
# Predictions are in the scaled target space.
y_pred_base = lgb_base.predict(X_test)

# ----- 7. HELPER TO UNDO THE SCALING, AND METRICS -------------
def inverse_scale(y_scaled: np.ndarray) -> np.ndarray:
    """Map scaled ``personas`` values back to their original units.

    The module-level ``scaler`` was fitted on all of ``features_to_scale``,
    so it only accepts matrices with that many columns. The values are placed
    in the ``personas`` column of a zero-filled matrix, the matrix is
    inverse-transformed, and only that column is returned.

    Args:
        y_scaled: 1-D sequence of scaled target values.

    Returns:
        1-D array with the same length, in the original scale.
    """
    target_idx = features_to_scale.index("personas")
    placeholder = np.zeros((len(y_scaled), len(features_to_scale)))
    placeholder[:, target_idx] = y_scaled
    restored = scaler.inverse_transform(placeholder)
    return restored[:, target_idx]

# Evaluate the baseline in real units (number of people), not scaled units.
y_test_real = inverse_scale(y_test.values)
y_pred_base_real = inverse_scale(y_pred_base)

mae_base = mean_absolute_error(y_test_real, y_pred_base_real)
# mean_squared_error returns the MSE; take the square root so the value
# printed under the "RMSE" label really is the root mean squared error.
rmse_base = np.sqrt(mean_squared_error(y_test_real, y_pred_base_real))
print(f"LightGBM BASE ⇒ MAE: {mae_base:.2f} | RMSE: {rmse_base:.2f}")

# -------- 8. HYPERPARAMETER TUNING WITH OPTUNA ------------
# Settings shared by every trial and by the final model, kept in one place so
# the model that is finally trained matches what was tuned.
LGB_FIXED_PARAMS = {
    # LightGBM ignores `subsample` unless subsample_freq > 0.
    "subsample_freq": 1,
    "random_state": 42,
    "n_jobs": -1,
    "verbosity": -1,
}

# Chronological hold-out carved from the END of the training period. Trials
# are scored on it so the test set is never used to choose hyperparameters;
# otherwise the test metrics reported below would be optimistically biased.
# NOTE(review): assumes X_train rows are in chronological order — confirm.
VAL_FRACTION = 0.2
n_fit = int(len(X_train) * (1 - VAL_FRACTION))
X_fit, y_fit = X_train.iloc[:n_fit], y_train.iloc[:n_fit]
X_val, y_val = X_train.iloc[n_fit:], y_train.iloc[n_fit:]
y_val_real = inverse_scale(y_val.values)


def objective_lgb(trial):
    """Optuna objective: validation MAE (real units) for one parameter set.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean absolute error on the chronological validation hold-out,
        measured after undoing the target scaling.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
    } | LGB_FIXED_PARAMS
    model = LGBMRegressor(**params)
    model.fit(X_fit, y_fit)
    preds_real = inverse_scale(model.predict(X_val))
    return mean_absolute_error(y_val_real, preds_real)


# Seed the sampler so repeated runs explore the same sequence of trials.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_lgb, n_trials=300, show_progress_bar=True)

print("Mejores hiperparámetros:", study.best_params)
# This is the best MAE on the validation hold-out, not on the test set.
print(f"Mejor MAE Optuna       : {study.best_value:.2f}")

# --------- 9. TRAIN THE BEST MODEL AND EVALUATE ---------------
# Refit on the full training period with the selected hyperparameters, then
# score once on the untouched test set.
best_params = study.best_params | LGB_FIXED_PARAMS
lgb_best = LGBMRegressor(**best_params)
lgb_best.fit(X_train, y_train)

y_pred_best_real = inverse_scale(lgb_best.predict(X_test))
mae_best = mean_absolute_error(y_test_real, y_pred_best_real)
# mean_squared_error returns the MSE; take the square root to get the RMSE.
rmse_best = np.sqrt(mean_squared_error(y_test_real, y_pred_best_real))
print(f"LightGBM OPTUNA ⇒ MAE: {mae_best:.2f} | RMSE: {rmse_best:.2f}")

# -------------- 10. PLOT: PREDICTED VS ACTUAL ---------------
# Overlay the tuned model's test-period predictions on the observed series,
# both in real units.
plt.figure(figsize=(15, 6))
plt.plot(fechas_test, y_test_real, linewidth=2, label="Real")
plt.plot(
    fechas_test,
    y_pred_best_real,
    "--",
    alpha=0.8,
    label="Predicho (LightGBM)",
)
plt.title("Ocupación diaria: real vs predicha (LightGBM)")
plt.xlabel("Fecha")
plt.ylabel("Personas hospedadas")
plt.legend()
plt.tight_layout()
plt.show()

# ---------------- 11. LIGHTGBM FORECAST FUNCTION ----------------
def forecast_lightgbm(model, last_df: pd.DataFrame,
                      days: int = 30, n_history: int = 14):
    """Forecast ``days`` days ahead with autoregressive lag features.

    Each step rebuilds the feature row exactly as training did: lags and the
    7-day rolling mean are computed from the occupancy series in real units
    (observed history followed by earlier predictions), then every continuous
    feature is passed through the fitted module-level ``scaler`` before the
    model sees it. The prediction is converted back to real units and appended
    to the history for the next step.

    Args:
        model: Fitted regressor exposing ``predict``; it must have been
            trained on ``feature_cols``.
        last_df: Frame with a ``fecha`` column and a scaled ``personas``
            column. Its trailing rows seed the lag history.
        days: Number of consecutive days to forecast after the last date in
            ``last_df``.
        n_history: Number of trailing rows of ``last_df`` used as history.
            With fewer than 7 (or 14) values available, ``lag_7`` (or
            ``lag_14``) falls back to ``lag_1``.

    Returns:
        Tuple ``(forecast_dates, preds_real)``: the forecast dates as a
        ``DatetimeIndex`` and the predictions in real units as a 1-D array.

    Raises:
        ValueError: If ``last_df`` is empty or ``n_history`` is less than 1.
    """
    if last_df.empty:
        raise ValueError("last_df must contain at least one row of history")
    if n_history < 1:
        raise ValueError("n_history must be at least 1")

    last_date = last_df["fecha"].max()
    forecast_dates = pd.date_range(last_date + pd.Timedelta(days=1),
                                   periods=days)
    if len(forecast_dates) == 0:
        return forecast_dates, np.array([])

    # Occupancy history in real units. Reading the target column directly
    # (rather than the lag_1 feature of past rows) keeps every lag aligned:
    # history[-1] is the most recent observation, i.e. lag_1 for the next day.
    history = list(
        inverse_scale(last_df["personas"].tail(n_history).to_numpy())
    )

    # Loop invariants: holiday calendar for the forecast years and the
    # pandemic cut-off date.
    mx_calendar = holidays.MX(
        years=sorted({int(d.year) for d in forecast_dates})
    )
    pandemic_start = pd.to_datetime("2020-03-01")

    preds_real = []
    for fecha in forecast_dates:
        dow = fecha.dayofweek
        month = fecha.month

        lag_1 = history[-1]
        lag_7 = history[-7] if len(history) >= 7 else lag_1
        lag_14 = history[-14] if len(history) >= 14 else lag_1
        rolling_mean_7 = float(np.mean(history[-7:]))

        # Raw continuous features in the scaler's column order. "personas"
        # is only a placeholder: the scaler needs the column, the model
        # does not use it.
        raw = pd.DataFrame([{
            "personas": 0.0,
            "lag_1": lag_1,
            "lag_7": lag_7,
            "lag_14": lag_14,
            "rolling_mean_7": rolling_mean_7,
            "dow_sin": np.sin(2 * np.pi * dow / 7),
            "dow_cos": np.cos(2 * np.pi * dow / 7),
            "month_sin": np.sin(2 * np.pi * month / 12),
            "month_cos": np.cos(2 * np.pi * month / 12),
        }])[features_to_scale]

        # Apply the same scaling the training features received.
        scaled = pd.DataFrame(scaler.transform(raw),
                              columns=features_to_scale)
        row = scaled.assign(
            temporada_alta=int(month in (6, 7, 11)),
            es_festivo=int(fecha in mx_calendar),
            es_viernes_o_sabado=int(dow in (4, 5)),
            es_pandemia=int(fecha >= pandemic_start),
        )[feature_cols]

        pred_scaled = model.predict(row)[0]
        pred_real = inverse_scale(np.array([pred_scaled]))[0]
        preds_real.append(pred_real)
        # Feed the prediction back so later steps can use it as a lag.
        history.append(pred_real)

    return forecast_dates, np.array(preds_real)

# ---------------- 12. FORECAST AND VISUALIZATION ----------------
# Forecast with the tuned model, using the function's default horizon and
# history length, seeded from the full prepared frame.
forecast_dates, forecast_real = forecast_lightgbm(lgb_best, df)

# Show the observed test period followed by the forecast; the vertical line
# marks the last test date.
plt.figure(figsize=(15, 6))
plt.plot(fechas_test, y_test_real, linewidth=2, label="Real (test)")
plt.plot(
    forecast_dates,
    forecast_real,
    "--",
    linewidth=2,
    label="Forecast 30 días",
)
plt.axvline(
    fechas_test.max(),
    color="grey",
    linestyle="--",
    label="Inicio forecast",
)
plt.title("Forecast de ocupación hotelera (LightGBM)")
plt.xlabel("Fecha")
plt.ylabel("Personas hospedadas")
plt.legend()
plt.tight_layout()
plt.show()
