In [1]:
import pandas as pd
df=pd.read_csv("dataset_top90_series_completas.csv")
df.shape

(795024, 114)

In [None]:
# 💾 Script LightGBM para predecir tn en febrero 2020

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

# 1. Cargar datos
df = pd.read_csv("top_20_corr.csv")
df['periodo'] = pd.to_datetime(df['periodo'])

# 2. Agregar ventas por producto y mes
df_prod = df.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()

# 3. Crear lags
def crear_lags(df, lags=[1, 2, 3]):
    df_lags = df.copy()
    for lag in lags:
        df_lags[f'lag_{lag}'] = df_lags.groupby('product_id')['tn'].shift(lag)
    return df_lags

df_lags = crear_lags(df_prod)

# 4. Agregar variable mes
df_lags['mes'] = df_lags['periodo'].dt.month

# 5. Eliminar filas con NaN por los lags
df_train = df_lags.dropna()

# 6. Separar features y target
# Usá todas excepto tn, periodo y product_id
features = [col for col in df_train.columns if col not in ['tn', 'periodo', 'product_id']]
#features = ['lag_1', 'lag_2', 'lag_3', 'mes']
target = 'tn'

# 7. Separar train y validación
train_data = df_train[df_train['periodo'] < '2019-12-01']
val_data = df_train[df_train['periodo'] == '2019-12-01']

X_train = train_data[features]
y_train = train_data[target]
X_val = val_data[features]
y_val = val_data[target]

# 8. Hiperparámetros de Optuna
mejores_params = {
   'n_estimators': 774,
    'learning_rate': 0.013833513325596666,
    'max_depth': 10,
    'num_leaves': 183,
    'subsample': 0.7698019331794512,
    'colsample_bytree': 0.7228716766181406,
    'random_state': 42,
    'min_data_in_leaf': 320,
    'reg_alpha': 0.5095229446241581,
    'reg_lambda': 1.616069961165865,
    'linear_tree': True
}

modelo = lgb.LGBMRegressor(**mejores_params)
modelo.fit(X_train, y_train)

# 9. Validación
y_pred_val = modelo.predict(X_val)
print("📊 MAE en diciembre 2019:", mean_absolute_error(y_val, y_pred_val))

# 10. Predicción enero 2020
enero_df = df_prod[df_prod['periodo'] == '2019-12-01'].copy()
enero_df['periodo'] = pd.to_datetime('2020-01-01')
enero_df['lag_1'] = df_prod[df_prod['periodo'] == '2019-12-01']['tn'].values
enero_df['lag_2'] = df_prod[df_prod['periodo'] == '2019-11-01']['tn'].values
enero_df['lag_3'] = df_prod[df_prod['periodo'] == '2019-10-01']['tn'].values
enero_df['mes'] = 1
enero_df['tn'] = modelo.predict(enero_df[features])

# 11. Predicción febrero 2020
feb_df = pd.DataFrame()
feb_df['product_id'] = enero_df['product_id']
feb_df['periodo'] = pd.to_datetime('2020-02-01')
feb_df['lag_1'] = enero_df['tn']  # predicción enero
feb_df['lag_2'] = df_prod[df_prod['periodo'] == '2019-12-01']['tn'].values
feb_df['lag_3'] = df_prod[df_prod['periodo'] == '2019-11-01']['tn'].values
feb_df['mes'] = 2
feb_df['tn_predicho'] = modelo.predict(feb_df[features])

# 12. Exportar resultados
resultado = feb_df[['product_id', 'tn_predicho']]
resultado.to_csv("prediccion_feb2020_top20.csv", index=False)
print("✅ Predicción de febrero 2020 guardada en prediccion_feb2020_top20.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 751
[LightGBM] [Info] Number of data points in the train set: 736, number of used features: 4
[LightGBM] [Info] Start training from score 545.043467
📊 MAE en diciembre 2019: 79.51344859292284
✅ Predicción de febrero 2020 guardada en prediccion_feb2020_top40.csv


In [4]:
import pandas as pd

# 1. Cargar predicciones de Autogluon (febrero 2020)
df_pred = pd.read_csv("prediccion_feb2020_top40.csv")  # asumimos que tiene 'product_id' y 'tn_predicha'
total_predicho = df_pred["tn_predicho"].sum()

# 2. Cargar dataset base
df_real = pd.read_csv("top_40_corr.csv")
df_real["periodo"] = pd.to_datetime(df_real["periodo"])

# 3. Agrupar toneladas reales por período
tn_por_periodo = df_real.groupby("periodo")["tn"].sum().reset_index()

# 4. Agregar predicción de febrero 2020 como nueva fila
nueva_fila = pd.DataFrame({"periodo": [pd.Timestamp("2020-02-01")], "tn": [total_predicho]})
comparacion = pd.concat([tn_por_periodo, nueva_fila], ignore_index=True)

# 5. Ordenar por período
comparacion = comparacion.sort_values("periodo")

# 6. Mostrar
print(comparacion.tail(6))  # últimas filas para ver cómo se compara febrero 2020


      periodo            tn
31 2019-08-01   8142.772430
32 2019-09-01  12112.214620
33 2019-10-01  13491.000060
34 2019-11-01  11420.581260
35 2019-12-01   9960.630910
36 2020-02-01  10219.230486
