In [1]:
# ==============================================
# COMPARACIÓN: Predicción vs Datos Reales - DICIEMBRE 2023
# Para los 86 productos del archivo productos_ranking.csv
# ==============================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


In [3]:
# =========================
# 1. Cargar datos y preparar semanalmente
# =========================
df_ranking = pd.read_csv("productos_ranking.csv")
df = pd.read_excel("../DATA/DataHackathon.xlsx")
articulos_top = df_ranking['Articulo'].tolist()
df = df[df['Articulo'].isin(articulos_top)]

df['Semana'] = df['Creacion Orden de Venta'].dt.isocalendar().week
df['Anio'] = df['Creacion Orden de Venta'].dt.year
df_semanal = df.groupby(['Articulo', 'Anio', 'Semana']).agg({'Cantidad': 'sum'}).reset_index()

le_articulo = LabelEncoder()
df_semanal['Articulo_encoded'] = le_articulo.fit_transform(df_semanal['Articulo'])

# Escaler por producto
escalers = {}
df_semanal['Cantidad_scaled'] = 0.0
for articulo in df_semanal['Articulo'].unique():
    scaler = MinMaxScaler()
    idx = df_semanal['Articulo'] == articulo
    df_semanal.loc[idx, 'Cantidad_scaled'] = scaler.fit_transform(df_semanal.loc[idx, ['Cantidad']])
    escalers[articulo] = scaler


In [4]:
# =========================
# 2. Crear secuencias con relleno
# =========================
def crear_secuencias(df, articulo_id, sequence_length):
    df_art = df[df['Articulo_encoded'] == articulo_id].sort_values(['Anio', 'Semana']).copy()
    df_art['Fecha'] = pd.to_datetime(df_art['Anio'].astype(str) + '-' + df_art['Semana'].astype(str) + '-1', format='%G-%V-%u')
    df_art.set_index('Fecha', inplace=True)
    serie = df_art['Cantidad_scaled'].resample('W').sum().reindex(
        pd.date_range(start=df_art.index.min(), end=df_art.index.max(), freq='W'), fill_value=0
    )
    X, y = [], []
    for i in range(len(serie) - sequence_length):
        X.append(serie[i:i+sequence_length].values)
        y.append(serie[i+sequence_length])
    articulo_array = np.full((len(X), 1), articulo_id)
    return [np.array(X), articulo_array], np.array(y)


In [5]:

# =========================
# 3. Modelo LSTM
# =========================
def crear_modelo(sequence_length, num_articulos):
    input_qty = Input(shape=(sequence_length,))
    input_art = Input(shape=(1,))
    emb = Embedding(num_articulos, 8)(input_art)
    emb = tf.keras.layers.Flatten()(emb)
    x_qty = tf.keras.layers.Reshape((sequence_length, 1))(input_qty)
    lstm_out = LSTM(64)(x_qty)
    concat = Concatenate()([lstm_out, emb])
    x = Dense(64, activation='relu')(concat)
    x = Dropout(0.2)(x)
    output = Dense(1)(x)
    model = Model([input_qty, input_art], output)
    model.compile(optimizer='adam', loss='mse')
    return model

In [6]:

# =========================
# 4. Entrenamiento y comparación contra diciembre 2023
# =========================
resultados = []

for articulo in articulos_top:
    articulo_id = le_articulo.transform([articulo])[0]
    scaler_producto = escalers[articulo]

    # Filtrar datos hasta noviembre 2023 (semana 48 aprox)
    df_hist = df_semanal[
        (df_semanal['Articulo'] == articulo) &
        ((df_semanal['Anio'] < 2023) | ((df_semanal['Anio'] == 2023) & (df_semanal['Semana'] < 49)))
    ]

    X, y = crear_secuencias(df_hist, articulo_id, sequence_length=12)
    if X is None or len(X[0]) < 10:
        continue

    model = crear_modelo(12, len(articulos_top))
    model.fit(X, y, epochs=30, batch_size=16, validation_split=0.2, verbose=0,
              callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

    # Predicción diciembre 2023
    ult_seq = X[0][-1]
    input_art = np.array([[articulo_id]])
    pred_fut = []
    for _ in range(4):
        pred = model.predict([ult_seq.reshape(1, -1, 1), input_art], verbose=0)
        pred_fut.append(pred[0][0])
        ult_seq = np.append(ult_seq[1:], pred[0][0])
    pred_fut = scaler_producto.inverse_transform(np.array(pred_fut).reshape(-1, 1))
    total_pred = np.sum(pred_fut)

    # Datos reales diciembre 2023
    reales = df_semanal[
        (df_semanal['Articulo'] == articulo) & (df_semanal['Anio'] == 2023) & (df_semanal['Semana'] >= 49)
    ]
    reales = reales.groupby(['Articulo'])['Cantidad'].sum().values[0] if not reales.empty else 0

    resultados.append({
        'Articulo': articulo,
        'Real_Dic2023': reales,
        'Pred_Dic2023': total_pred,
        'Error_Abs': abs(reales - total_pred),
        'Error_%': abs(reales - total_pred) / reales * 100 if reales != 0 else np.nan
    })


  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
 

In [8]:
df_resultados = pd.DataFrame(resultados)
print(df_resultados.tail(10))


    Articulo  Real_Dic2023  Pred_Dic2023    Error_Abs     Error_%
75  IVP11576          1050   9945.079102  8895.079102  847.150391
76  IVP07272          2700    545.753479  2154.246521   79.786908
77  IVP07145         12421   6078.796387  6342.203613   51.060330
78  IVP15042             0   1105.321411  1105.321411         NaN
79  IVP07201           135   1161.398071  1026.398071  760.294868
80  IVP11419          1448    713.193176   734.806824   50.746328
81  IVP11253          1570    695.832275   874.167725   55.679473
82  IVP11346           270   2049.179688  1779.179688  658.955440
83  IVP09035             0   1679.379150  1679.379150         NaN
84  IVP11158          1136    806.136230   329.863770   29.037304


In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Ya debes tener cargado:
# - df_productos_ranking
# - df_semanal
# - scaler por producto
# - modelo LSTM entrenado (uno por producto o reentrenado cada vez)

resultados_anuales = []

# Definimos semanas del año 2023 completas (hasta semana 52)
semanas_2023 = list(range(1, 53))
anio_objetivo = 2023


In [13]:
for articulo_nombre in df_ranking['Articulo']:
    articulo_id = le_articulo.transform([articulo_nombre])[0]

    # Filtrar histórico antes de 2023
    df_hist = df_semanal[
        (df_semanal['Articulo_encoded'] == articulo_id) &
        (df_semanal['Anio'] < 2023)
    ]

    # Crear secuencias históricas con la función que rellena semanas vacías
    X, y = crear_secuencias(df_hist, articulo_id=articulo_id, sequence_length=12)

    if X is None or len(X[0]) < 5:
        print(f"El artículo {articulo_nombre} no tiene suficientes datos para entrenar.")
        continue

    model = crear_modelo(sequence_length=12, num_articulos=len(df_ranking))
    model.fit(X, y, epochs=30, batch_size=16, validation_split=0.2, verbose=0,
              callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

    # Predicción semana a semana (52 semanas)
    ultima_secuencia = X[0][-1]  # Última secuencia conocida
    input_articulo = np.array([[articulo_id]])
    predicciones_futuras = []

    for _ in range(52):
        pred = model.predict([ultima_secuencia.reshape(1, -1, 1), input_articulo], verbose=0)
        predicciones_futuras.append(pred[0][0])
        ultima_secuencia = np.append(ultima_secuencia[1:], pred[0][0])

    # Desescalar
    scaler_producto = escalers[articulo_nombre]
    predicciones_futuras = scaler_producto.inverse_transform(np.array(predicciones_futuras).reshape(-1, 1)).flatten()

    # ========================
    # Obtener datos reales del 2023
    df_reales = df_semanal[
        (df_semanal['Articulo_encoded'] == articulo_id) &
        (df_semanal['Anio'] == anio_objetivo) &
        (df_semanal['Semana'].isin(semanas_2023))
    ].sort_values(['Anio', 'Semana'])

    reales = df_reales['Cantidad'].values

    # Padding en caso de que falten semanas
    if len(reales) < 52:
        reales = np.pad(reales, (0, 52 - len(reales)), constant_values=0)

    # ========================
    # Evaluación
    mae = mean_absolute_error(reales, predicciones_futuras)
    mse = mean_squared_error(reales, predicciones_futuras)

    resultados_anuales.append({
        'Articulo': articulo_nombre,
        'MAE_2023': mae,
        'MSE_2023': mse,
        'Real_2023_Total': np.sum(reales),
        'Predicho_2023_Total': np.sum(predicciones_futuras)
    })


  y.append(serie[i+sequence_length])


El artículo IVP15248 no tiene suficientes datos para entrenar.
El artículo IVP15246 no tiene suficientes datos para entrenar.


  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])


El artículo IVP15237 no tiene suficientes datos para entrenar.


  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
  y.append(serie[i+sequence_length])
 

In [14]:
# Mostrar resultado general
resultados_df = pd.DataFrame(resultados_anuales)
print(resultados_df.head())


   Articulo     MAE_2023      MSE_2023  Real_2023_Total  Predicho_2023_Total
0  IVP01021  1256.992188  1.580404e+06                0         65363.593750
1  IVP07378   375.263306  7.283831e+05            11615         10375.769531
2  IVP07099  8892.121094  1.815135e+08           454599        379179.406250
3  IVP11065   666.548828  1.359322e+06            14400         25855.621094
4  IVP11633   727.269897  7.319098e+05            15600         30945.519531
