In [None]:
# 📦 BLOCK 1: Data loading
import pandas as pd

# Read the base dataset, parse `periodo` (given as 'YYYY-MM' text) into
# timestamps, and order the rows by product and then by period.
df = (
    pd.read_csv("dataset_base.csv", low_memory=False)
    .assign(periodo=lambda base: pd.to_datetime(base['periodo'], format='%Y-%m'))
    .sort_values(by=['product_id', 'periodo'])
)

In [None]:
# ⚖️ BLOCK 2: Per-product standard scaling
from sklearn.preprocessing import StandardScaler

cols_a_normalizar = ['tn', 'cust_request_tn']
df_scaled = df.copy()


def _estandarizar_grupo(columna):
    """Standardize one column of one product group with a freshly fitted StandardScaler."""
    como_matriz = columna.values.reshape(-1, 1)  # the scaler expects a 2-D array
    return StandardScaler().fit_transform(como_matriz).flatten()


# Each product gets its own scaler per column, so both columns are rescaled
# independently inside every product.
# NOTE(review): the scaler is fitted on every row of the product, which includes
# the periods that are held out for validation/test further down.
df_scaled[cols_a_normalizar] = (
    df_scaled
    .groupby('product_id')[cols_a_normalizar]
    .transform(_estandarizar_grupo)
)

In [None]:
# 🔣 BLOCK 3: Categorical variable encoding
from sklearn.preprocessing import LabelEncoder

categoricas = ['cat1', 'cat2']
for col in categoricas:
    # Values are cast to text first, so every entry (whatever its original
    # type) is encoded through its string form.
    valores_texto = df_scaled[col].astype(str)
    le = LabelEncoder().fit(valores_texto)
    # Overwrite the column with the integer codes produced by the encoder.
    df_scaled[col] = le.transform(valores_texto)

In [None]:
# 🌎 BLOCK 4: Exogenous variables
# Load the exogenous table and parse its 'YYYY-MM' period column into timestamps.
df_exog = pd.read_csv("variables_exogenas.csv").assign(
    periodo=lambda exog: pd.to_datetime(exog['periodo'], format='%Y-%m')
)

# Left join: every row of the scaled dataset is kept; exogenous columns are
# attached by period. From here on `df` refers to the scaled + merged frame.
df = pd.merge(df_scaled, df_exog, how='left', on='periodo')

In [None]:
# 📊 BLOCK 5: DTW clustering
from tslearn.clustering import TimeSeriesKMeans
from tslearn.utils import to_time_series_dataset

# Build one time series per product: rows are products, columns are periods.
# `tn` values sharing a (period, product) pair are summed; missing pairs become 0.
series = df.pivot_table(index='periodo', columns='product_id', values='tn', aggfunc='sum').fillna(0).T
X = to_time_series_dataset(series.values)
modelo_dtw = TimeSeriesKMeans(n_clusters=50, metric="dtw", random_state=42)

# fit_predict yields one label per row of `series`, in row order. Attach the
# labels to their product ids through the pivot index instead of indexing the
# label array with `product_id - 1`: that positional lookup is only correct when
# the ids are exactly 1..N in pivot order, and otherwise assigns wrong clusters
# or raises IndexError.
cluster_por_producto = pd.Series(modelo_dtw.fit_predict(X), index=series.index)

# Products absent from the pivot (if any) get NaN rather than an arbitrary label.
df['cluster_dtw'] = df['product_id'].map(cluster_por_producto)

In [None]:
# 🔮 BLOCK 6: Prophet-generated variables
# Assumes the per-product Prophet features were already generated and saved
# to an external file.
df_prophet = pd.read_csv("features_prophet.csv").assign(
    periodo=lambda feats: pd.to_datetime(feats['periodo'], format='%Y-%m')
)

# Left join on (product, period): all rows of `df` are kept.
# NOTE(review): `df` is rebound to the merge result, so running this cell twice
# in the same kernel merges the Prophet columns a second time.
df = pd.merge(df, df_prophet, how='left', on=['product_id', 'periodo'])

In [None]:
# 🕒 BLOCK 7: Temporal split
# Train: everything strictly before December 2019.
# Validation: December 2019 only. Test: February 2020 only.
train = df.loc[df['periodo'].lt('2019-12-01')]
val = df.loc[df['periodo'].eq('2019-12-01')]
test = df.loc[df['periodo'].eq('2020-02-01')]

# Every column except the target and the two identifier columns is a feature;
# the original column order is preserved.
features = df.columns[~df.columns.isin(['tn', 'periodo', 'product_id'])].tolist()

X_train = train[features]
y_train = train['tn']
X_val = val[features]
y_val = val['tn']
X_test = test[features]

In [None]:
# 🌲 BLOCK 8: XGBoost training
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# Hyperparameters forwarded as-is to the regressor.
params = {
    'max_depth': 7,
    'learning_rate': 0.05,
    'n_estimators': 400,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.5,
    'reg_lambda': 1.0,
    'random_state': 42
}

# Early stopping is configured on the estimator rather than in fit(): the
# `early_stopping_rounds` argument of fit() was deprecated in XGBoost 1.6 and
# removed in 2.0, where passing it raises TypeError. Requires xgboost >= 1.6.
model = xgb.XGBRegressor(early_stopping_rounds=20, **params)

# The validation month is the evaluation set that drives early stopping.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# NOTE(review): `tn` was standardized per product in block 2, so this MAE is in
# z-score units, and it is measured on the same data used for early stopping.
print("MAE validación diciembre:", mean_absolute_error(y_val, model.predict(X_val)))

In [None]:
# 🧾 BLOCK 9: February 2020 prediction
# The model was trained on `tn` after the per-product standardization of
# block 2, so its raw output is a per-product z-score, not tonnes.
preds = model.predict(X_test)

# Recover the per-product mean and population std (ddof=0, the convention
# StandardScaler uses) of the raw target. They are recomputed from the source
# file because, by this point, `df` only holds the scaled values.
tn_raw = pd.read_csv("dataset_base.csv", usecols=['product_id', 'tn'], low_memory=False)
tn_by_product = tn_raw.groupby('product_id')['tn']
tn_mean = tn_by_product.mean()
tn_std = tn_by_product.std(ddof=0)
# StandardScaler leaves zero-variance series undivided (scale of 1); mirror that
# so the inverse transform matches the forward one.
tn_std = tn_std.where(tn_std > 0, 1.0)

df_test = test[['product_id']].copy()
# Undo the standardization row by row: tn = z * std(product) + mean(product).
df_test['tn_predicho'] = (
    preds * df_test['product_id'].map(tn_std).to_numpy()
    + df_test['product_id'].map(tn_mean).to_numpy()
)
df_test.to_csv("prediccion_febrero2020_xgb.csv", index=False)
print("✅ Exportado a prediccion_febrero2020_xgb.csv")