# Predicción de ventas (Mejorado)

Este notebook reestructura el entrenamiento para:

- Usar **split temporal Train/Val/Test** (sin usar el test como validación).
- Tratar `product_id` y `category_id` como **categóricos** con **Embeddings**.
- Entrenar sobre `log1p(ventas)` para estabilizar picos y mejorar la generalización.
- Usar `Huber` + `ReduceLROnPlateau` + `EarlyStopping`.
- Comparar contra un **baseline** (`lag_1`).


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
import unicodedata

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Reproducibility: a single call seeds Python's `random`, NumPy and TensorFlow,
# so no source of randomness is left unseeded.
SEED = 42
keras.utils.set_random_seed(SEED)

# Directory that holds the input CSV files; change it if the data lives elsewhere.
DATA_DIR = Path("../datasets")


2025-12-12 00:17:47.230003: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-12 00:17:47.244847: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-12 00:17:48.002542: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-12 00:17:50.289836: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

In [2]:
# 1) Load datasets
df_ventas = pd.read_csv(DATA_DIR / "ventas_normalizado.csv")
df_eventos = pd.read_csv(DATA_DIR / "eventos_productos.csv")

# NOTE(review): the sales CSV carries an `Unnamed: 0` column, which looks like a
# previously saved index; consider `index_col=0` once that is confirmed.

# `DataFrame.info()` writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
print(df_ventas.head())
df_ventas.info()
print(df_eventos.head())
df_eventos.info()


   Unnamed: 0       fecha           product_name    category_off  ventas  \
0           0  2022-12-01  nectar de durazno 1 l       juice-box      10   
1           1  2022-12-01  nectar de durazno 1 l       juice-box      10   
2           2  2022-12-01     arroz blanco 500 g  rice-white-dry       3   
3           3  2022-12-01     arroz blanco 500 g  rice-white-dry       3   
4           4  2022-12-01     arroz blanco 500 g  rice-white-dry       3   

   precio  perecedero  en_temporada  temp_inicio_mes  temp_fin_mes  
0   33.72         0.0             0              7.0           9.0  
1   33.72         0.0             0              7.0           9.0  
2   45.84         0.0             1              1.0          12.0  
3   45.84         0.0             1              1.0          12.0  
4   45.84         0.0             1              1.0          12.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840632 entries, 0 to 840631
Data columns (total 10 columns):
 #   Column       

In [3]:
# 2) Normalización de texto para joins robustos
def normalize_text(texto: str) -> str:
    """Return a canonical form of ``texto`` suitable for use as a join key.

    Missing values become the empty string. Anything else is converted to
    ``str``, lower-cased, trimmed, stripped of combining accent marks and has
    every run of whitespace collapsed into a single space.
    """
    if pd.isna(texto):
        return ""
    # NFD splits accented letters into base letter + combining mark ("Mn"),
    # which lets the marks be filtered out below.
    descompuesto = unicodedata.normalize("NFD", str(texto).lower().strip())
    sin_marcas = [ch for ch in descompuesto if unicodedata.category(ch) != "Mn"]
    palabras = "".join(sin_marcas).split()
    return " ".join(palabras)

# Dates
df_ventas["fecha"] = pd.to_datetime(df_ventas["fecha"])
df_eventos["date"] = pd.to_datetime(df_eventos["date"])

# Normalized join keys
df_ventas["product_key"] = df_ventas["product_name"].apply(normalize_text)
df_eventos["product_key"] = df_eventos["producto_relacionado"].apply(normalize_text)

# Collapse events to exactly ONE row per (date, product_key). Without this, two
# different events on the same day for the same product would duplicate the
# matching sales rows in the left merge below. Rows with a missing event name
# are dropped first so they never count as an event; several names on the same
# key are joined into a single label.
df_eventos_reduc = (
    df_eventos[["event", "date", "product_key"]]
    .dropna(subset=["event"])
    .drop_duplicates()
    .groupby(["date", "product_key"])["event"]
    .agg(lambda nombres: " | ".join(sorted(nombres.astype(str))))
    .reset_index()
)

# Merge on (date, product); `validate` makes pandas raise if the right side
# ever has more than one row per key.
df_ventas_evt = df_ventas.merge(
    df_eventos_reduc,
    left_on=["fecha", "product_key"],
    right_on=["date", "product_key"],
    how="left",
    validate="many_to_one",
)
assert len(df_ventas_evt) == len(df_ventas), "the event merge changed the number of sales rows"

# Binary flag: 1 when the sale matched an event, 0 otherwise.
df_ventas_evt["hay_evento"] = df_ventas_evt["event"].notna().astype(int)
df_ventas_evt = df_ventas_evt.drop(columns=["date"])

df_ventas_evt.head(10)


Unnamed: 0.1,Unnamed: 0,fecha,product_name,category_off,ventas,precio,perecedero,en_temporada,temp_inicio_mes,temp_fin_mes,product_key,event,hay_evento
0,0,2022-12-01,nectar de durazno 1 l,juice-box,10,33.72,0.0,0,7.0,9.0,nectar de durazno 1 l,,0
1,1,2022-12-01,nectar de durazno 1 l,juice-box,10,33.72,0.0,0,7.0,9.0,nectar de durazno 1 l,,0
2,2,2022-12-01,arroz blanco 500 g,rice-white-dry,3,45.84,0.0,1,1.0,12.0,arroz blanco 500 g,,0
3,3,2022-12-01,arroz blanco 500 g,rice-white-dry,3,45.84,0.0,1,1.0,12.0,arroz blanco 500 g,,0
4,4,2022-12-01,arroz blanco 500 g,rice-white-dry,3,45.84,0.0,1,1.0,12.0,arroz blanco 500 g,,0
5,5,2022-12-01,arroz blanco 500 g,rice-white-dry,3,45.84,0.0,1,1.0,12.0,arroz blanco 500 g,,0
6,6,2022-12-01,betabel,fruits-vegetables,33,30.22,1.0,1,1.0,12.0,betabel,,0
7,7,2022-12-01,queso gouda 300 g,cheese-hard,8,77.62,1.0,1,1.0,12.0,queso gouda 300 g,,0
8,8,2022-12-01,queso gouda 300 g,cheese-hard,8,77.62,1.0,1,1.0,12.0,queso gouda 300 g,,0
9,9,2022-12-01,queso gouda 300 g,cheese-hard,8,77.62,1.0,1,1.0,12.0,queso gouda 300 g,,0


In [4]:
# 3) Calendar features + integer IDs
fecha = df_ventas_evt["fecha"]
dia_semana = fecha.dt.weekday

# `assign` returns a new frame, so the merged frame is left untouched.
ventas = df_ventas_evt.assign(
    anio=fecha.dt.year,
    mes=fecha.dt.month,
    dia=fecha.dt.day,
    dia_semana=dia_semana,
    es_fin_semana=dia_semana.isin([5, 6]).astype(int),  # weekday 5/6 = Sat/Sun
)

# Cyclical encodings: month over a period of 12, day of week over a period of 7.
for prefijo, columna, periodo in (("mes", "mes", 12), ("dow", "dia_semana", 7)):
    ventas[f"{prefijo}_sin"] = np.sin(2 * np.pi * ventas[columna] / periodo)
    ventas[f"{prefijo}_cos"] = np.cos(2 * np.pi * ventas[columna] / periodo)

# Integer codes for the categorical columns (fed to the embedding layers later).
product_codes, product_uniques = pd.factorize(ventas["product_name"])
category_codes, category_uniques = pd.factorize(ventas["category_off"])
ventas["product_id"] = product_codes
ventas["category_id"] = category_codes

n_products, n_categories = len(product_uniques), len(category_uniques)
print("n_products:", n_products, "n_categories:", n_categories)

# Order rows by product and then by date, ready for the lag features.
ventas = ventas.sort_values(["product_id", "fecha"])
ventas.head()


n_products: 144 n_categories: 57


Unnamed: 0.1,Unnamed: 0,fecha,product_name,category_off,ventas,precio,perecedero,en_temporada,temp_inicio_mes,temp_fin_mes,...,mes,dia,dia_semana,es_fin_semana,mes_sin,mes_cos,dow_sin,dow_cos,product_id,category_id
0,0,2022-12-01,nectar de durazno 1 l,juice-box,10,33.72,0.0,0,7.0,9.0,...,12,1,3,0,-2.449294e-16,1.0,0.433884,-0.900969,0,0
1,1,2022-12-01,nectar de durazno 1 l,juice-box,10,33.72,0.0,0,7.0,9.0,...,12,1,3,0,-2.449294e-16,1.0,0.433884,-0.900969,0,0
315,315,2022-12-01,nectar de durazno 1 l,juice-box,5,39.24,0.0,0,7.0,9.0,...,12,1,3,0,-2.449294e-16,1.0,0.433884,-0.900969,0,0
316,316,2022-12-01,nectar de durazno 1 l,juice-box,5,39.24,0.0,0,7.0,9.0,...,12,1,3,0,-2.449294e-16,1.0,0.433884,-0.900969,0,0
1455,1455,2022-12-02,nectar de durazno 1 l,juice-box,5,33.66,0.0,0,7.0,9.0,...,12,2,4,0,-2.449294e-16,1.0,-0.433884,-0.900969,0,0


In [5]:
# 4) Lags y medias móviles por producto
def crear_ventanas(
    df: pd.DataFrame,
    lags: tuple = (1, 7, 14),
    windows: tuple = (7, 28, 90),
) -> pd.DataFrame:
    """Add per-product lag and rolling-mean features of ``ventas``.

    Parameters
    ----------
    df : DataFrame with at least the columns ``product_id`` and ``ventas``.
        Rows of each product are used in their current order, so the frame
        should already be ordered by date within each product.
    lags : row offsets; each one adds a column ``lag_<k>``.
    windows : window lengths; each one adds a column ``media_<w>`` holding the
        mean of the ``w`` rows *preceding* the current one (the current row is
        excluded by shifting one step first, so the target never leaks in).

    Returns
    -------
    A copy of ``df`` with the new columns. Positions without enough history
    within the product are NaN. The input frame is not modified.

    Notes
    -----
    Both the shift and the rolling mean run inside each product group, so the
    result does not depend on products being stored contiguously.

    NOTE(review): lags count rows, not calendar days. The sample data shows
    several rows for the same product and date, in which case ``lag_1`` can
    come from the same day — confirm this is intended.
    """
    df = df.copy()
    g = df.groupby("product_id")["ventas"]

    for lag in lags:
        df[f"lag_{lag}"] = g.shift(lag)

    for window in windows:
        # `transform` applies shift + rolling to each product separately and
        # returns the values aligned with the original rows.
        df[f"media_{window}"] = g.transform(
            lambda serie, w=window: serie.shift(1).rolling(w).mean()
        )
    return df

ventas = crear_ventanas(ventas)

# Keep only rows where every lag / rolling feature is available.
columnas_ventana = ["lag_1", "lag_7", "lag_14", "media_7", "media_28", "media_90"]
ventas_modelo = ventas.dropna(subset=columnas_ventana).copy()
ventas_modelo.shape, ventas_modelo.head()


((827672, 30),
        Unnamed: 0      fecha           product_name category_off  ventas  \
 17523       17523 2022-12-23  nectar de durazno 1 l    juice-box      12   
 17524       17524 2022-12-23  nectar de durazno 1 l    juice-box      12   
 17717       17717 2022-12-24  nectar de durazno 1 l    juice-box      10   
 17718       17718 2022-12-24  nectar de durazno 1 l    juice-box      10   
 18081       18081 2022-12-24  nectar de durazno 1 l    juice-box      12   
 
        precio  perecedero  en_temporada  temp_inicio_mes  temp_fin_mes  ...  \
 17523   39.51         0.0             0              7.0           9.0  ...   
 17524   39.51         0.0             0              7.0           9.0  ...   
 17717   32.94         0.0             0              7.0           9.0  ...   
 17718   32.94         0.0             0              7.0           9.0  ...   
 18081   40.99         0.0             0              7.0           9.0  ...   
 
         dow_sin   dow_cos  product_id 

## Split temporal (Train / Val / Test)

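- **Test**: registros con fecha igual o posterior a la fecha de corte (`fecha_corte_test`).
- **Val**: los `VAL_DAYS` (60) días inmediatamente anteriores al corte de test; quedan fuera del train y se usan para `EarlyStopping` y `ReduceLROnPlateau`.
- **Train**: todo lo anterior al inicio de la ventana de validación.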

In [6]:
# 5) Column definitions
cat_cols = ["product_id", "category_id"]
num_cols = [
    # product / event attributes
    "perecedero", "precio", "en_temporada", "hay_evento",
    # calendar
    "anio", "mes", "dia_semana", "es_fin_semana",
    # cyclical encodings
    "mes_sin", "mes_cos", "dow_sin", "dow_cos",
    # history of the target
    "lag_1", "lag_7", "lag_14", "media_7", "media_28", "media_90",
]

target_col = "ventas"

# 6) Temporal split
fecha_corte_test = pd.Timestamp("2025-06-01")  # first day of the TEST period
VAL_DAYS = 60  # validation = the last 60 days before the test cutoff
fecha_inicio_val = fecha_corte_test - pd.Timedelta(days=VAL_DAYS)

fechas = ventas_modelo["fecha"]
mask_train = fechas < fecha_inicio_val
mask_val = fechas.ge(fecha_inicio_val) & fechas.lt(fecha_corte_test)
mask_test = fechas >= fecha_corte_test

df_train = ventas_modelo.loc[mask_train].copy()
df_val = ventas_modelo.loc[mask_val].copy()
df_test = ventas_modelo.loc[mask_test].copy()

# Report the size and date span of each partition.
for etiqueta, parte in (("Train:", df_train), ("Val  :", df_val), ("Test :", df_test)):
    print(etiqueta, parte.shape, parte["fecha"].min(), "->", parte["fecha"].max())


Train: (641291, 30) 2022-12-02 00:00:00 -> 2025-04-01 00:00:00
Val  : (46020, 30) 2025-04-02 00:00:00 -> 2025-05-31 00:00:00
Test : (140361, 30) 2025-06-01 00:00:00 -> 2025-11-30 00:00:00


In [7]:
# 7) Preparar matrices (categóricas separadas + numéricas escaladas)

def to_arrays(df: pd.DataFrame):
    """Split ``df`` into the three NumPy arrays used by the model.

    Returns ``(X_cat, X_num, y)``: the ``cat_cols`` columns as int32, the
    ``num_cols`` columns as float32 and the ``target_col`` column as float32.
    The column lists are read from the notebook-level variables.
    """
    categoricas = df.loc[:, cat_cols].astype("int32")
    numericas = df.loc[:, num_cols].astype("float32")
    objetivo = df[target_col].astype("float32")
    return categoricas.to_numpy(), numericas.to_numpy(), objetivo.to_numpy()

# Same conversion for the three partitions, in train / val / test order.
(
    (Xc_train, Xn_train, y_train),
    (Xc_val, Xn_val, y_val),
    (Xc_test, Xn_test, y_test),
) = map(to_arrays, (df_train, df_val, df_test))

# Limpieza (por si hay NaN/infs en num)
def clean_numeric(Xc, Xn, y):
    """Drop every row whose numeric features or target are not finite.

    A row is kept only when all entries of ``Xn`` in that row and the matching
    entry of ``y`` are finite (neither NaN nor +/-inf). The same row selection
    is applied to ``Xc``, ``Xn`` and ``y``, which are returned in that order.
    """
    filas_validas = np.logical_and(np.all(np.isfinite(Xn), axis=1), np.isfinite(y))
    return tuple(arreglo[filas_validas] for arreglo in (Xc, Xn, y))

Xc_train, Xn_train, y_train = clean_numeric(Xc_train, Xn_train, y_train)
Xc_val, Xn_val, y_val = clean_numeric(Xc_val, Xn_val, y_val)
Xc_test, Xn_test, y_test = clean_numeric(Xc_test, Xn_test, y_test)

print("Shapes:", Xc_train.shape, Xn_train.shape, y_train.shape)

# Scale ONLY the numeric block; the statistics come from TRAIN alone and are
# then applied unchanged to validation and test.
scaler = StandardScaler().fit(Xn_train)
Xn_train_s, Xn_val_s, Xn_test_s = (
    scaler.transform(bloque).astype("float32")
    for bloque in (Xn_train, Xn_val, Xn_test)
)

# Target transform: log1p dampens spikes and makes training more stable.
y_train_t, y_val_t, y_test_t = (
    np.log1p(objetivo).astype("float32")
    for objetivo in (y_train, y_val, y_test)
)


Shapes: (641291, 2) (641291, 18) (641291,)


## Modelo con Embeddings

- `product_id` y `category_id` entran como **índices** a embeddings.
- Se concatena con las variables numéricas escaladas.
- Se predice `log1p(ventas)`.

In [8]:
# 8) Build the model (Keras Functional API)
# Two scalar integer inputs feed embedding layers, the scaled numeric block
# enters as-is, everything is concatenated and passed through a small MLP
# that outputs a single value (the layer is named "y_log1p").

# Inputs: the Input names are the dictionary keys used later in fit()/predict().
inp_prod = layers.Input(shape=(), dtype="int32", name="product_id")
inp_cat  = layers.Input(shape=(), dtype="int32", name="category_id")
inp_num  = layers.Input(shape=(Xn_train_s.shape[1],), dtype="float32", name="num_features")

# Embedding sizes (heuristic): ceil(sqrt(cardinality)), capped at 32 / 8.
emb_prod_dim = min(32, int(np.ceil(np.sqrt(n_products))))
emb_cat_dim  = min(8,  int(np.ceil(np.sqrt(n_categories))))

# NOTE(review): the embeddings accept indices in [0, input_dim); this assumes
# product_id / category_id contain no negative codes (e.g. from missing
# names) — confirm.
emb_prod = layers.Embedding(input_dim=n_products, output_dim=emb_prod_dim, name="emb_product")(inp_prod)
emb_cat  = layers.Embedding(input_dim=n_categories, output_dim=emb_cat_dim, name="emb_category")(inp_cat)

emb_prod = layers.Flatten()(emb_prod)
emb_cat  = layers.Flatten()(emb_cat)

# Numeric features first, then the two embedding vectors.
x = layers.Concatenate()([inp_num, emb_prod, emb_cat])

# Dense block: 256 -> 128 -> 64 units with ReLU; dropout after the first two.
x = layers.Dense(256, activation="relu")(x)
x = layers.Dropout(0.25)(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.25)(x)
x = layers.Dense(64, activation="relu")(x)

# Linear output unit.
out = layers.Dense(1, name="y_log1p")(x)

model = keras.Model(inputs=[inp_prod, inp_cat, inp_num], outputs=out)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=3e-4),
    loss=keras.losses.Huber(delta=0.5),  # delta is expressed on the log scale
    metrics=[keras.metrics.MeanAbsoluteError(name="mae_log")]
)

model.summary()


2025-12-12 00:17:59.289188: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [None]:
# 9) Training with callbacks
# Stop once val_loss has not improved for 8 epochs and roll back to the best
# weights; halve the learning rate after 3 epochs without improvement.
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=8, restore_best_weights=True
)
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6, verbose=1
)
callbacks = [early_stop, reduce_lr]

# Keys match the names of the model's Input layers.
train_inputs = {
    "product_id": Xc_train[:, 0],
    "category_id": Xc_train[:, 1],
    "num_features": Xn_train_s,
}
val_inputs = {
    "product_id": Xc_val[:, 0],
    "category_id": Xc_val[:, 1],
    "num_features": Xn_val_s,
}

history = model.fit(
    x=train_inputs,
    y=y_train_t,
    validation_data=(val_inputs, y_val_t),
    epochs=200,
    batch_size=256,
    callbacks=callbacks,
    verbose=1,
)


Epoch 1/200
[1m2506/2506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 9ms/step - loss: 0.0935 - mae_log: 0.3505 - val_loss: 0.0821 - val_mae_log: 0.3267 - learning_rate: 3.0000e-04
Epoch 2/200
[1m2506/2506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 9ms/step - loss: 0.0634 - mae_log: 0.2670 - val_loss: 0.0739 - val_mae_log: 0.3030 - learning_rate: 3.0000e-04
Epoch 3/200
[1m2506/2506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 10ms/step - loss: 0.0580 - mae_log: 0.2498 - val_loss: 0.0691 - val_mae_log: 0.2876 - learning_rate: 3.0000e-04
Epoch 4/200
[1m2506/2506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - loss: 0.0552 - mae_log: 0.2406 - val_loss: 0.0631 - val_mae_log: 0.2675 - learning_rate: 3.0000e-04
Epoch 5/200
[1m2506/2506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 9ms/step - loss: 0.0532 - mae_log: 0.2337 - val_loss: 0.0617 - val_mae_log: 0.2638 - learning_rate: 3.0000e-04
Epoch 6/200
[1m2506/2506[0m [32m━━━━

In [None]:
# 10) Training curves: train vs validation loss per epoch
fig, ax = plt.subplots(figsize=(10, 5))
for clave, etiqueta in (("loss", "Train loss"), ("val_loss", "Val loss")):
    ax.plot(history.history[clave], label=etiqueta)
ax.set_xlabel("Épocas")
ax.set_ylabel("Loss (Huber sobre log1p)")
ax.set_title("Curva de entrenamiento")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# 11) Evaluation on TEST (original sales scale)
test_inputs = {
    "product_id": Xc_test[:, 0],
    "category_id": Xc_test[:, 1],
    "num_features": Xn_test_s,
}
y_pred_log = model.predict(test_inputs, verbose=0).ravel()

# Undo the log1p transform, then floor at zero.
y_pred = np.clip(np.expm1(y_pred_log), 0, None)

mae_nn = mean_absolute_error(y_test, y_pred)
mape_nn = mean_absolute_percentage_error(y_test, y_pred)

print("NN (Embeddings) MAE:", mae_nn)
print("NN (Embeddings) MAPE:", mape_nn)


In [None]:
# 12) Baseline on TEST: predict each row with `lag_1`, i.e. the previous
# observation of the same product. This is the previous ROW, which is only
# "yesterday's sale" when there is one row per product and day.

# The NN metrics were computed on arrays from which rows with non-finite
# numeric features or target had been removed. The same row filter is rebuilt
# here so the baseline is scored on exactly the same rows as the model.
Xn_test_all = df_test[num_cols].astype("float32").to_numpy()
y_test_all = df_test[target_col].astype("float32").to_numpy()
mask_eval = np.isfinite(Xn_test_all).all(axis=1) & np.isfinite(y_test_all)

baseline_pred = df_test.loc[mask_eval, "lag_1"].to_numpy()
baseline_true = df_test.loc[mask_eval, "ventas"].to_numpy()
assert len(baseline_true) == len(y_test), "baseline and NN are evaluated on different rows"

mae_base = mean_absolute_error(baseline_true, baseline_pred)
mape_base = mean_absolute_percentage_error(baseline_true, baseline_pred)

print("Baseline lag_1 MAE:", mae_base)
print("Baseline lag_1 MAPE:", mape_base)
# Relative MAE reduction of the NN with respect to the baseline, in percent.
print("\nMejora MAE (%):", (mae_base - mae_nn)/mae_base*100)


In [None]:
# 13) Actual vs predicted (scatter) with the y = x reference line
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_test, y_pred, alpha=0.3, s=10)

max_val = max(y_test.max(), y_pred.max())
diagonal = [0, max_val]
ax.plot(diagonal, diagonal, color="red", linewidth=2)

ax.set_xlabel("Ventas reales")
ax.set_ylabel("Ventas predichas")
ax.set_title("Real vs Predicho - NN con Embeddings")
ax.grid(True)
plt.show()


## Diagnóstico por producto (opcional)

Grafica las ventas reales vs predichas para un producto específico en el set de TEST.

In [None]:
# 14) Per-product series (pick a product that exists in the data)
nombre_producto = "aguacate hass"  # change to a product from your dataset

coincide = ventas_modelo["product_name"] == nombre_producto
if not coincide.any():
    print("Producto no encontrado. Ejemplos:", ventas_modelo["product_name"].dropna().unique()[:10])
else:
    pid = ventas_modelo.loc[coincide, "product_id"].iloc[0]

    df_test_prod = df_test.loc[df_test["product_id"] == pid].copy()
    if df_test_prod.empty:
        print("No hay registros de TEST para ese producto.")
    else:
        # Build the arrays for this subset; the fitted scaler and the trained
        # model are reused as they are (no re-fitting, no re-training).
        Xc_p = df_test_prod[cat_cols].astype("int32").to_numpy()
        Xn_p = scaler.transform(
            df_test_prod[num_cols].astype("float32").to_numpy()
        ).astype("float32")
        y_p = df_test_prod["ventas"].to_numpy()

        entradas_p = {
            "product_id": Xc_p[:, 0],
            "category_id": Xc_p[:, 1],
            "num_features": Xn_p,
        }
        y_pred_log_p = model.predict(entradas_p, verbose=0).ravel()
        # Back to the sales scale, floored at zero.
        y_pred_p = np.clip(np.expm1(y_pred_log_p), 0, None)

        fig, ax = plt.subplots(figsize=(12, 5))
        fechas_p = df_test_prod["fecha"]
        ax.plot(fechas_p, y_p, label="Real", linewidth=2)
        ax.plot(fechas_p, y_pred_p, label="Predicho", linewidth=2)
        ax.set_title(f"Ventas reales vs predichas - {nombre_producto}")
        ax.set_xlabel("Fecha")
        ax.set_ylabel("Ventas")
        ax.legend()
        ax.grid(True)
        plt.show()
