In [1]:
import pandas as pd
# Load the complete feature dataset into memory and show its (rows, columns) shape.
df=pd.read_csv("datasetwg_features.csv")
df.shape

(9460980, 66)

In [10]:
# Load only the first 10 rows as a quick test, then print them.
df_test = pd.read_csv("datasetwg_features.csv", nrows=10)
print(df_test.head())


   customer_id  product_id  periodo  plan_precios_cuidados  cust_request_qty  \
0        10004       20001  2017-01                    0.0               9.0   
1        10003       20001  2017-01                    0.0              17.0   
2        10002       20001  2017-01                    0.0              17.0   
3        10001       20001  2017-01                    0.0              11.0   
4        10084       20001  2017-01                    0.0               1.0   

   cust_request_tn         tn  stock_final cat1         cat2  ...  \
0        184.72927  184.72927          0.0   HC  ROPA LAVADO  ...   
1        143.49426  143.49426          0.0   HC  ROPA LAVADO  ...   
2         38.68301   35.72806          0.0   HC  ROPA LAVADO  ...   
3         99.43861   99.43861          0.0   HC  ROPA LAVADO  ...   
4          0.22386    0.22386          0.0   HC  ROPA LAVADO  ...   

  tn_min_movil_3 delta1_media_movil_6  tn_std_movil_3  tn_min_movil_6  \
0            NaN               

In [None]:
# 💾 Script LightGBM optimizado para predecir tn en febrero 2020

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

# 1. Read the dataset in chunks and aggregate tn by product and period.
# Only the three columns needed for the aggregation are parsed.
chunks = pd.read_csv("datasetwg_features.csv", chunksize=500_000, usecols=['product_id', 'periodo', 'tn'])

# Collect the per-chunk partial sums in a list and concatenate once at the end:
# growing a DataFrame with pd.concat inside the loop re-copies everything
# accumulated so far on every iteration.
parciales = []
for chunk in chunks:
    chunk['periodo'] = pd.to_datetime(chunk['periodo'])
    parciales.append(chunk.groupby(['product_id', 'periodo'])['tn'].sum().reset_index())

df_prod = pd.concat(parciales, ignore_index=True)

# A (product_id, periodo) pair can appear in more than one chunk, so the
# partial sums are summed again to get exactly one row per pair.
df_prod = df_prod.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()

# 2. Crear lags
def crear_lags(df, lags=(1, 2, 3)):
    """Return a copy of ``df`` with one ``lag_<k>`` column per requested lag.

    Each ``lag_<k>`` column holds the ``tn`` value found ``k`` rows earlier
    within the same ``product_id`` group (NaN when there is no such row).
    The shift is positional, so rows must already be in chronological order
    within each product for the columns to be true time lags.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``product_id`` and ``tn``. It is not modified.
    lags : iterable of int, default (1, 2, 3)
        Row offsets to generate. An immutable tuple is used as the default so
        the default value can never be altered between calls.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``lag_<k>`` columns appended.
    """
    df_lags = df.copy()
    for lag in lags:
        df_lags[f'lag_{lag}'] = df_lags.groupby('product_id')['tn'].shift(lag)
    return df_lags

# Build the lag features on the per-product monthly series.
df_lags = crear_lags(df_prod)

# 3. Add the calendar month as a feature
df_lags['mes'] = df_lags['periodo'].dt.month

# 4. Drop the rows left with NaN by the lag computation
df_train = df_lags.dropna()

# 5. Split column names into features and target:
#    every column except the target and the two identifier columns is a feature.
columnas_excluidas = {'tn', 'periodo', 'product_id'}
features = [col for col in df_train.columns if col not in columnas_excluidas]
target = 'tn'

# 6. Temporal split: December 2019 is held out for validation,
#    everything strictly before it is used for training.
es_entrenamiento = df_train['periodo'] < '2019-12-01'
es_validacion = df_train['periodo'] == '2019-12-01'
train_data = df_train[es_entrenamiento]
val_data = df_train[es_validacion]

X_train, y_train = train_data[features], train_data[target]
X_val, y_val = val_data[features], val_data[target]

# 7. Hyperparameters found with Optuna
# NOTE(review): per the LightGBM docs, row subsampling only takes effect when
# a bagging frequency (subsample_freq) > 0 is also set; none is set here, so
# `subsample` may currently be ignored - confirm before re-tuning.
mejores_params = dict(
    n_estimators=637,
    learning_rate=0.02650784457484953,
    max_depth=11,
    num_leaves=235,
    subsample=0.8343868380897532,
    colsample_bytree=0.8951027444070669,
    random_state=42,
    min_data_in_leaf=41,
    reg_alpha=0.6533888575804235,
    reg_lambda=2.405036107158264,
    # linear_tree=True  (left disabled, as in the original configuration)
)

modelo = lgb.LGBMRegressor(**mejores_params)
modelo.fit(X_train, y_train)

# 8. Validation on the held-out month
y_pred_val = modelo.predict(X_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
print("📊 MAE en diciembre 2019:", mae_val)

# 9. January 2020 prediction
# Observed tn of the last three months, each as a Series indexed by product_id.
dic = {
    'lag_1': df_prod[df_prod['periodo'] == '2019-12-01'].set_index('product_id')['tn'],
    'lag_2': df_prod[df_prod['periodo'] == '2019-11-01'].set_index('product_id')['tn'],
    'lag_3': df_prod[df_prod['periodo'] == '2019-10-01'].set_index('product_id')['tn']
}

# One row per product present in December 2019, re-dated to January 2020.
enero_df = df_prod[df_prod['periodo'] == '2019-12-01'].copy()
enero_df['periodo'] = pd.to_datetime('2020-01-01')
# The lag Series are indexed by product_id, whereas enero_df still carries
# df_prod's row labels. Assigning a Series directly aligns on the index, which
# would fill the lags with NaN (or with another product's value whenever a row
# label happens to equal a product id). Look the values up by product_id.
for nombre_lag, serie_lag in dic.items():
    enero_df[nombre_lag] = enero_df['product_id'].map(serie_lag)
enero_df['mes'] = 1
# From here on 'tn' holds the January forecast instead of the December actuals.
enero_df['tn'] = modelo.predict(enero_df[features])

# 10. February 2020 prediction
feb_df = pd.DataFrame()
feb_df['product_id'] = enero_df['product_id']
feb_df['periodo'] = pd.to_datetime('2020-02-01')
# lag_1 is the January forecast; it shares enero_df's index, so it aligns row by row.
feb_df['lag_1'] = enero_df['tn']
# lag_2 / lag_3 are the observed December / November values, looked up by product_id.
feb_df['lag_2'] = feb_df['product_id'].map(dic['lag_1'])
feb_df['lag_3'] = feb_df['product_id'].map(dic['lag_2'])
feb_df['mes'] = 2
feb_df['tn_predicho'] = modelo.predict(feb_df[features])

# 11. Export results
resultado = feb_df[['product_id', 'tn_predicho']]
resultado.to_csv("prediccion_feb2020_lightgbm_optim.csv", index=False)
print("✅ Predicción de febrero 2020 guardada en prediccion_feb2020_lightgbm_optim.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 778
[LightGBM] [Info] Number of data points in the train set: 24960, number of used features: 4
[LightGBM] [Info] Start training from score 40.316508
📊 MAE en diciembre 2019: 8.595236545364013
✅ Predicción de febrero 2020 guardada en prediccion_feb2020_lightgbm_optim.csv
