# CatBoost pipeline (basado en Modelo_prediccion)

Este notebook carga las funciones de `DataLoader.py`, prepara los datos respetando validación temporal, entrena un `CatBoostRegressor`, evalúa el modelo, calcula elasticidades y exporta las predicciones en formato Kaggle.

In [1]:
# Imports y funciones
import warnings
warnings.filterwarnings('ignore')
from DataLoader import importar_ventas, preparar_test, agregar_fourier, crear_archivo_kaggle, leer_csv
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import os
print('Imports OK')

Imports OK


In [2]:
# 1) Load the data (uses the DataLoader helper functions)

# Historical sales; later cells split this frame into train and validation.
ventas = importar_ventas()

# NOTE(review): the recorded output of this cell is
#   TypeError: preparar_test() missing 2 required positional arguments:
#   'ventas' and 'FEATURES'
# so this call does not match the current DataLoader signature and `test` is
# not assigned when the notebook is run top to bottom on a fresh kernel.
# Presumably the later cells only worked because `test` was still in memory
# from an earlier execution (the counter jumps from In [2] to In [14]).
# TODO: pass `ventas` and the feature list that preparar_test expects —
# confirm in DataLoader.py which columns FEATURES must contain.
test = preparar_test('ids_test.csv')

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

TypeError: preparar_test() missing 2 required positional arguments: 'ventas' and 'FEATURES'

In [14]:
# 2) Add Fourier terms (k=2) to the sales history and to the test set.
# Both frames go through the same call so they end up with the same columns.
for frame in (ventas, test):
    agregar_fourier(frame, k=2)

print('Fourier agregado')

Fourier agregado


In [15]:
# 3) Feature definition
# Categorical columns: CatBoost handles these natively, no manual encoding.
cat_features = [
    'subgroup_cod',
    'cluster',
]

# Numeric columns: price bounds, the calendar year and the first-harmonic
# Fourier terms (annual and monthly, sine and cosine).
num_features = ['max_price', 'min_price', 'year'] + [
    f'{period}_{wave}_1'
    for period in ('anual', 'mensual')
    for wave in ('sin', 'cos')
]

# Column order used for every model input: categorical first, then numeric.
features = [*cat_features, *num_features]
print('Features definidos:', features)

Features definidos: ['subgroup_cod', 'cluster', 'max_price', 'min_price', 'year', 'anual_sin_1', 'anual_cos_1', 'mensual_sin_1', 'mensual_cos_1']


In [16]:
# 4) Temporal split: validate only on the most recent dates to avoid leakage
if ventas is None:
    train = valid = None
else:
    # Everything from `val_start` (4 weeks before the last date) onwards is
    # held out for validation; strictly earlier rows are used for training.
    max_date = ventas['date'].max()
    val_start = max_date - pd.Timedelta(weeks=4)
    train = ventas.loc[ventas['date'] < val_start].copy()
    valid = ventas.loc[ventas['date'] >= val_start].copy()
    for etiqueta, parte in (('Train', train), ('Valid', valid)):
        print(f'{etiqueta} dates:', parte['date'].min(), '->', parte['date'].max())

Train dates: 2021-01-01 00:00:00 -> 2023-12-02 00:00:00
Valid dates: 2023-12-03 00:00:00 -> 2023-12-31 00:00:00


In [17]:
# 5) Build the CatBoost input matrices
if train is None or valid is None or len(valid) == 0:
    # Nothing to train or validate on: downstream cells check for None.
    train_pool = valid_pool = None
    print('No se pudieron crear pools por falta de datos')
else:
    X_train, y_train = train[features], train['demand']
    X_valid, y_valid = valid[features], valid['demand']

    # Pools carry the data, the target and which columns are categorical.
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features)
    print('Pools creados.')

Pools creados.


In [18]:
# 6) Train CatBoost
if train_pool is None or valid_pool is None:
    model = None
    print('No se entrenó el modelo')
else:
    catboost_params = dict(
        iterations=2000,            # upper bound on the number of trees
        learning_rate=0.05,         # how strongly each iteration adjusts the model
        depth=8,                    # maximum depth of each tree
        loss_function='RMSE',       # objective optimised during training
        eval_metric='RMSE',         # metric tracked on the validation set while training
        random_seed=42,             # fixed seed for reproducibility
        early_stopping_rounds=100,  # stop once the validation metric has not improved for 100 iterations
        verbose=100,                # log progress every 100 iterations
    )
    model = CatBoostRegressor(**catboost_params)
    model.fit(train_pool, eval_set=valid_pool)

    # Persist the fitted model next to the notebook.
    model.save_model('catboost_model.cbm')
    print('Modelo entrenado y guardado: catboost_model.cbm')

0:	learn: 3.5553744	test: 3.2847950	best: 3.2847950 (0)	total: 1.64s	remaining: 54m 29s
100:	learn: 2.9293682	test: 2.7957041	best: 2.7957041 (100)	total: 2m 35s	remaining: 48m 46s
200:	learn: 2.7907077	test: 2.6686494	best: 2.6686494 (200)	total: 5m 26s	remaining: 48m 42s
300:	learn: 2.7156669	test: 2.6107117	best: 2.6107117 (300)	total: 8m 33s	remaining: 48m 16s
400:	learn: 2.6673578	test: 2.5505584	best: 2.5505584 (400)	total: 10m 57s	remaining: 43m 41s
500:	learn: 2.6344509	test: 2.5019560	best: 2.5019560 (500)	total: 13m 33s	remaining: 40m 33s
600:	learn: 2.6111725	test: 2.4691150	best: 2.4691150 (600)	total: 15m 59s	remaining: 37m 12s
700:	learn: 2.5929627	test: 2.4519621	best: 2.4519621 (700)	total: 18m 33s	remaining: 34m 23s
800:	learn: 2.5790419	test: 2.4383921	best: 2.4383921 (800)	total: 21m 6s	remaining: 31m 36s
900:	learn: 2.5678437	test: 2.4306163	best: 2.4306163 (900)	total: 23m 38s	remaining: 28m 50s
1000:	learn: 2.5581518	test: 2.4251760	best: 2.4251760 (1000)	total: 2

In [19]:
# 7) Evaluation on the validation set
if model is None or valid is None:
    print('No hay evaluación realizada')
else:
    y_pred_val = model.predict(valid[features])
    # Both metrics compare the observed demand with the same predictions.
    rmse, mae = (
        metric(valid['demand'], y_pred_val)
        for metric in (root_mean_squared_error, mean_absolute_error)
    )
    print(f'Valid RMSE: {rmse:.4f}, MAE: {mae:.4f}')

Valid RMSE: 2.3972, MAE: 1.7043


In [20]:
# 8) Predict on the prepared test set and build the Kaggle submission
if model is not None and test is not None:
    # Make the cell safe to re-run: a previous execution leaves the merge
    # helper columns on `test`. Merging again would then produce suffixed
    # duplicates (`precio_hist_x` / `precio_hist_y`, ...) and the lookup of
    # `test['precio_hist']` below would raise KeyError.
    test = test.drop(columns=['fecha_busqueda', 'fecha_ref', 'precio_hist'], errors='ignore')

    # Make sure every numeric feature exists so the column selection below
    # cannot fail; missing ones are created empty and filled later.
    for c in num_features:
        if c not in test.columns:
            test[c] = np.nan

    # Fill missing prices with the last known price of the same
    # store/subgroup, looked up as of one week before the test date.
    test['date'] = pd.to_datetime(test['date'])
    ventas_for_merge = ventas[['date','store_cod','subgroup_cod','mean_price']].copy()
    ventas_for_merge['date'] = pd.to_datetime(ventas_for_merge['date'])
    ventas_for_merge = ventas_for_merge.sort_values('date')
    test['fecha_busqueda'] = test['date'] - pd.DateOffset(weeks=1)
    # merge_asof needs both sides sorted by their time key; 'backward' picks
    # the most recent historical row at or before `fecha_busqueda`.
    test = pd.merge_asof(
        test.sort_values('fecha_busqueda'),
        ventas_for_merge.rename(columns={'mean_price':'precio_hist','date':'fecha_ref'}).sort_values('fecha_ref'),
        left_on='fecha_busqueda',
        right_on='fecha_ref',
        by=['store_cod','subgroup_cod'],
        direction='backward'
    )

    # Prefer the historical price, then whatever the test row already had, then 0.
    test['mean_price'] = test['precio_hist'].fillna(test['mean_price']).fillna(0)

    # Restore a deterministic row order (merge_asof left it sorted by lookup date) and predict.
    test = test.sort_values(['subgroup_cod','store_cod','date']).reset_index(drop=True)
    X_test = test[features].fillna(0)
    test_pool = Pool(X_test, cat_features=cat_features)
    y_test_pred = model.predict(test_pool)
    pred_df = pd.DataFrame({
        'store_subgroup_date_id': test['store_subgroup_date_id'],
        'demand': y_test_pred
    })
    kaggle = crear_archivo_kaggle(pred_df)
    print('Predicciones realizadas y archivo kaggle creado (prediccion_kaggle.csv)')
else:
    print('No se generaron predicciones para test')

Predicciones realizadas y archivo kaggle creado (prediccion_kaggle.csv)


In [None]:
# 10) Approximate price elasticity per row
import json


def computar_elasticidad(model, row, features, cat_features, price_col='mean_price', deltas=(-0.1, 0.1)):
    """Estimate the price elasticity of the predicted demand for one row.

    The model is queried once with the row as given and once per relative
    price change in ``deltas``. Each elasticity is computed as
    ``((q_new - q_base) / q_base) / delta``.

    Parameters
    ----------
    model
        Fitted model whose ``predict`` accepts a CatBoost ``Pool``.
    row : dict or pandas.Series
        One observation holding every column in ``features`` and every
        price column. Both mappings and Series are accepted.
    features : list of str
        Model input columns, in training order.
    cat_features : list of str
        Subset of ``features`` treated as categorical.
    price_col : str or sequence of str
        Column(s) multiplied by ``1 + delta``. At least one of them must be
        a model feature, otherwise the perturbation cannot reach the model.
    deltas : iterable of float
        Relative price changes to simulate (e.g. ``-0.1`` for -10 %).

    Returns
    -------
    dict
        ``delta -> {'price', 'q_pred', 'elasticity'}``. ``'price'`` is the
        perturbed value when ``price_col`` is a string, or a
        ``{column: value}`` dict when several columns were given.
        ``'elasticity'`` is NaN when the base prediction is exactly 0.

    Raises
    ------
    ValueError
        If none of the price columns is part of ``features``.
    """
    price_cols = [price_col] if isinstance(price_col, str) else list(price_col)
    if not any(c in features for c in price_cols):
        # Without this check every elasticity would silently come out as 0,
        # because the changed column is never passed to the model.
        raise ValueError(
            f'None of the price columns {price_cols} is a model feature; '
            'the perturbation would not change the prediction.'
        )

    # dict() works for both mappings and Series and gives us a private copy.
    base = dict(row)

    def _predict(values):
        # One-row frame built column by column so each column keeps its own dtype.
        X = pd.DataFrame({c: [values[c]] for c in features}).fillna(0)
        return float(model.predict(Pool(X, cat_features=cat_features))[0])

    q0 = _predict(base)
    outs = {}
    for d in deltas:
        scenario = dict(base)
        for c in price_cols:
            scenario[c] = base[c] * (1 + d)
        q2 = _predict(scenario)
        elast = ((q2 - q0) / q0) / d if q0 != 0 else np.nan
        if isinstance(price_col, str):
            price = scenario[price_col]
        else:
            price = {c: scenario[c] for c in price_cols}
        outs[d] = {'price': price, 'q_pred': q2, 'elasticity': elast}
    return outs


# Price columns the model actually consumes. `mean_price` is not among the
# model features, so perturbing it would leave every prediction unchanged.
# Assumes the `*_price` features move together with the product price — TODO confirm.
price_cols = [c for c in features if c.endswith('_price')]

# Compute the elasticity for the first 20 test rows and store them
elastic_results = []
if model is not None and test is not None:
    for _, r in test.head(20).iterrows():
        er = computar_elasticidad(model, r, features, cat_features,
                                  price_col=price_cols, deltas=[-0.05, -0.1, 0.05, 0.1])
        elastic_results.append({'id': r['store_subgroup_date_id'], 'elastic': er})
    # Save; default=str keeps the dump from failing on values json cannot
    # encode natively (e.g. numpy integer ids).
    with open('elastic_results_sample.json', 'w', encoding='utf-8') as f:
        json.dump(elastic_results, f, indent=2, default=str)
    print('Elasticidades calculadas (muestra) y guardadas en elastic_results_sample.json')
else:
    print('No se calcularon elasticidades')