# 📦 Paso 1: Cargar y preparar datos

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle

# Read the full series dataset; product ids are handled as strings.
df = pd.read_csv("dataset_top90_series_completas.csv", parse_dates=["periodo"])
df["product_id"] = df["product_id"].astype(str)

# Training window: every row dated on or before 2019-10-01.
in_training_window = df["periodo"] <= "2019-10-01"
train_df = df.loc[in_training_window].copy()

# Target is "tn"; the target and the identifier/date columns are not used as features.
exclude = ["tn", "customer_id", "periodo", "product_id"]
columns_to_drop = [col for col in train_df.columns if col in exclude]
y = train_df["tn"]
X = train_df.drop(columns=columns_to_drop)

# Random 80/20 hold-out with a fixed seed.
# NOTE(review): the split is random, not chronological, so validation rows can be
# older than training rows — confirm this is acceptable for a forecasting task.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Per-row training weights, i.e. log(tn + 2): strictly positive for tn >= 0.
sample_weight = np.log1p(y_train + 1)

# Persist the split, the weights and the feature order for the following steps.
feature_names = X.columns.tolist()
payload = (X_train, X_val, y_train, y_val, sample_weight, feature_names)
with open("datos_entrenamiento.pkl", "wb") as f:
    pickle.dump(payload, f)

print("✅ Datos preparados y guardados")

✅ Datos preparados y guardados


# 🔍 Paso 2: Entrenamiento con Optuna + XGBoost

In [2]:
import json
import pickle

import joblib
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Restore the split, sample weights and feature names written by step 1.
with open("datos_entrenamiento.pkl", "rb") as f:
    saved_split = pickle.load(f)
X_train, X_val, y_train, y_val, sample_weight, columnas = saved_split

# Optuna study persisted in a local SQLite file; an existing study with the
# same name is reused instead of raising.
# NOTE(review): the objective defined in this cell never calls trial.report(),
# so the MedianPruner has no intermediate values to act on — confirm whether
# pruning is actually wanted.
storage = "sqlite:///optuna_tn_study_protegidow.db"
tpe_sampler = optuna.samplers.TPESampler()
median_pruner = optuna.pruners.MedianPruner(n_startup_trials=5)
study = optuna.create_study(
    study_name="prediccion_tn2",
    storage=storage,
    direction='minimize',
    sampler=tpe_sampler,
    pruner=median_pruner,
    load_if_exists=True,
)

# Guarded objective: a failing trial is recorded and discarded, never scored.
def objective(trial):
    """Fit an XGBoost regressor with trial-suggested hyperparameters and return validation RMSE.

    Reads ``X_train``, ``y_train``, ``sample_weight``, ``X_val`` and ``y_val``
    from the notebook namespace.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error of the predictions on ``X_val``.

    Raises
    ------
    optuna.TrialPruned
        If anything fails while building, fitting or scoring the model. The
        error text is stored in the trial's ``failed_reason`` user attribute.
        The trial is marked as pruned rather than returning ``inf``, so a
        failed trial can never be picked up by ``study.best_params``.
    """
    try:
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 350),
            'max_depth': trial.suggest_int('max_depth', 3, 7),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
            'tree_method': 'hist',
            'random_state': 42,
            'n_jobs': -1,
            'verbosity': 1
        }
        model = XGBRegressor(**params)
        model.fit(X_train, y_train, sample_weight=sample_weight)
        pred = model.predict(X_val)
        return np.sqrt(mean_squared_error(y_val, pred))
    except Exception as e:
        # Keep the study running, but do not report a fake score: an "inf"
        # result would count as a completed trial and could become the
        # "best" one if every trial failed.
        trial.set_user_attr("failed_reason", str(e))
        raise optuna.TrialPruned(str(e)) from e

# Run 30 trials of the hyperparameter search.
study.optimize(objective, n_trials=30)

# Best sampled hyperparameters plus the fixed settings used during the search.
best_params = {**study.best_params, 'tree_method': 'hist', 'random_state': 42}

# Refit on train + validation together, weighting rows the same way as in training.
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])
model_final = XGBRegressor(**best_params)
model_final.fit(X_full, y_full, sample_weight=np.log1p(y_full + 1))

# Persist the fitted model and the parameters it was built with.
joblib.dump(model_final, "modelo_xgb.pkl")
with open("mejores_parametros.json", "w") as f:
    json.dump(best_params, f)

print("✅ Modelo entrenado y guardado")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-06-27 18:26:25,248] A new study created in RDB with name: prediccion_tn2
[I 2025-06-27 18:26:50,034] Trial 0 finished with value: 0.5327711375604227 and parameters: {'n_estimators': 156, 'max_depth': 7, 'learning_rate': 0.0738073637122238, 'subsample': 0.6177512789924311, 'colsample_bytree': 0.99658319384641, 'reg_alpha': 0.004485314420782371, 'reg_lambda': 1.4114992973578566}. Best is trial 0 with value: 0.5327711375604227.
[I 2025-06-27 18:27:25,431] Trial 1 finished with value: 0.5930630639209056 and parameters: {'n_estimators': 258, 'max_depth': 3, 'learning_rate': 0.09338766516365588, 'subsample': 0.7662707767155423, 'colsample_bytree': 0.9790768838224574, 'reg_alpha': 4.976585525681322, 'reg_lambda': 0.894024889772575}. Best is trial 0 with value: 0.5327711375604227.
[I 2025-06-27 18:27:55,239] Trial 2 finished with value: 0.5228716802047851 and parameters: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.089745570

✅ Modelo entrenado y guardado


# 📈 Paso 3: Predecir febrero 2020

In [3]:
import pickle

import joblib
import pandas as pd

# Load the fitted model and the feature-column order saved with the training data.
# Both files are produced by the earlier steps of this notebook; do not point
# these loads at files from an untrusted source (pickle/joblib execute code on load).
model_final = joblib.load("modelo_xgb.pkl")
with open("datos_entrenamiento.pkl", "rb") as f:
    _, _, _, _, _, columnas = pickle.load(f)

# Reload the base dataset.
df = pd.read_csv("dataset_top90_series_completas.csv", parse_dates=["periodo"])
df["product_id"] = df["product_id"].astype(str)

# Use the December 2019 rows as the template for February 2020:
# relabel the period and keep the first row found for each product.
febrero = df[df["periodo"] == "2019-12-01"].copy()
febrero["periodo"] = pd.to_datetime("2020-02-01")
febrero = febrero.groupby("product_id", as_index=False).first()

# Lags: lag_1/lag_2/lag_3 = mean tn per product in Dec/Nov/Oct 2019.
for i, mes in zip(range(1, 4), ["2019-12-01", "2019-11-01", "2019-10-01"]):
    lag_col = f"lag_{i}"
    lag_df = (
        df[df["periodo"] == mes]
        .groupby("product_id", as_index=False)["tn"]
        .mean()
        .rename(columns={"tn": lag_col})
    )
    # If the dataset already carries a column with this name, merge() would
    # rename both sides to lag_i_x / lag_i_y and the filler below would then
    # silently set lag_i to 0. Drop the stale column so the fresh lag wins.
    febrero = febrero.drop(columns=[lag_col], errors="ignore")
    febrero = febrero.merge(lag_df, on="product_id", how="left")

# Fill in any feature the model expects but the frame lacks, and say so:
# a zero-filled feature is a schema mismatch worth seeing, not hiding.
faltantes = sorted(set(columnas) - set(febrero.columns))
if faltantes:
    print(f"⚠️ Columnas ausentes rellenadas con 0: {faltantes}")
for col in faltantes:
    febrero[col] = 0
febrero_X = febrero[columnas]

# Predict and export one row per product.
febrero["tn_pred"] = model_final.predict(febrero_X)
febrero[["product_id", "tn_pred"]].to_csv("prediccion_febrero_top90xgb.csv", index=False)

print("✅ Predicción exportada como 'prediccion_febrero_top90xgb.csv'")

✅ Predicción exportada como 'prediccion_febrero_top90xgb.csv'
