In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib
import warnings
import time
warnings.filterwarnings('ignore')

# Importar transformadores personalizados
import os, sys
sys.path.append(r"C:\Users\hsuna\Desktop\Proyecto Final Product Development\proyecto final v2\repo-seriestemporales-g4-pd\notebooks")

from custom_transformers import (
    TemporalFeatures, LagCreator, OutlierTreatment, 
    LogTransformation, CategoricalEncoder, ToNumericTransformer
)

print("="*80)
print("ENTRENAMIENTO Y SELECCI√ìN DE MODELOS")
print("="*80)

# ==================================================================================
# CARGA DE DATOS
# ==================================================================================

print("\nCargando datos...")
df = pd.read_csv(r'C:\Users\hsuna\Desktop\Proyecto Final Product Development\proyecto final v2\repo-seriestemporales-g4-pd\data\raw\ventas.csv')

# Convertir y ordenar por fecha
df['Fecha_Venta'] = pd.to_datetime(df['Fecha_Venta'], dayfirst=True, errors='coerce', format='mixed')
df = df.sort_values(['Codigo_Sucursal', 'Codigo_Producto', 'Fecha_Venta'])
df = df.reset_index(drop=True)

print(f"Total de registros: {len(df):,}")
print(f"Fecha m√≠nima: {df['Fecha_Venta'].min()}")
print(f"Fecha m√°xima: {df['Fecha_Venta'].max()}")

# Verificar categor√≠as √∫nicas
print(f"\nSucursales √∫nicas: {df['Codigo_Sucursal'].nunique()}")
print(f"Productos √∫nicos: {df['Codigo_Producto'].nunique()}")

# ==================================================================================
# DIVISI√ìN TEMPORAL DE DATOS (80% TRAIN - 20% VALIDACI√ìN)
# ==================================================================================

split_idx = int(len(df) * 0.8)
train_data = df.iloc[:split_idx].copy()
val_data = df.iloc[split_idx:].copy()

print("\n" + "="*80)
print("DIVISI√ìN TEMPORAL DE DATOS")
print("="*80)
print(f"Datos de entrenamiento: {len(train_data):,} registros")
print(f"  Per√≠odo: {train_data['Fecha_Venta'].min()} a {train_data['Fecha_Venta'].max()}")
print(f"  Sucursales: {train_data['Codigo_Sucursal'].nunique()}")
print(f"  Productos: {train_data['Codigo_Producto'].nunique()}")

print(f"\nDatos de validaci√≥n: {len(val_data):,} registros")
print(f"  Per√≠odo: {val_data['Fecha_Venta'].min()} a {val_data['Fecha_Venta'].max()}")
print(f"  Sucursales: {val_data['Codigo_Sucursal'].nunique()}")
print(f"  Productos: {val_data['Codigo_Producto'].nunique()}")

# Verificar si hay categor√≠as nuevas en validaci√≥n
train_sucursales = set(train_data['Codigo_Sucursal'].unique())
val_sucursales = set(val_data['Codigo_Sucursal'].unique())
nuevas_sucursales = val_sucursales - train_sucursales

train_productos = set(train_data['Codigo_Producto'].unique())
val_productos = set(val_data['Codigo_Producto'].unique())
nuevos_productos = val_productos - train_productos

if nuevas_sucursales:
    print(f"\n‚ö†Ô∏è  Sucursales nuevas en validaci√≥n: {nuevas_sucursales}")
if nuevos_productos:
    print(f"‚ö†Ô∏è  Productos nuevos en validaci√≥n: {nuevos_productos}")

# Preparar X e y
X_train = train_data.drop(['Total'], axis=1)
y_train = train_data['Total']
X_val = val_data.drop(['Total'], axis=1)
y_val = val_data['Total']

print(f"\nShape X_train: {X_train.shape}")
print(f"Shape X_val: {X_val.shape}")

# ==================================================================================
# CREAR Y AJUSTAR PIPELINE CON DATOS DE ENTRENAMIENTO
# ==================================================================================

print("\n" + "="*80)
print("CREANDO PIPELINE DE PREPROCESAMIENTO")
print("="*80)

OUTLIER_VARS = ['Unidades_Vendidas']
LOG_TRANSFORM_VARS = ['Unidades_Vendidas_tratado']

# Crear pipeline de preprocesamiento
preprocessing_pipeline = Pipeline([
    ('temporal_features', TemporalFeatures()),
    ('lag_creator', LagCreator(lags=[1, 7, 14, 30])),
    ('categorical_encoder', CategoricalEncoder()),
    ('outlier_treatment', OutlierTreatment(columns=OUTLIER_VARS)),
    ('log_transformation', LogTransformation(columns=LOG_TRANSFORM_VARS)),
    ('to_numeric', ToNumericTransformer()),
    ('numerical_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

print("‚úì Pipeline creado")
print("‚úì Ajustando pipeline con datos de entrenamiento...")

# Ajustar el pipeline SOLO con datos de entrenamiento
preprocessing_pipeline.fit(X_train, y_train)

print("‚úì Pipeline ajustado exitosamente")

# ==================================================================================
# CONFIGURACI√ìN DE MODELOS CON 3 CONFIGURACIONES CADA UNO
# ==================================================================================

models_config = {
    # LinearRegression (3 configuraciones)
    'LinearRegression_1': LinearRegression(),
    'LinearRegression_2': LinearRegression(fit_intercept=False),
    'LinearRegression_3': LinearRegression(positive=True),
    
    # Ridge (3 configuraciones)
    'Ridge_1': Ridge(alpha=0.1, random_state=42),
    'Ridge_2': Ridge(alpha=1.0, random_state=42),
    'Ridge_3': Ridge(alpha=10.0, random_state=42),
    
    # Lasso (3 configuraciones)
    'Lasso_1': Lasso(alpha=0.1, random_state=42, max_iter=2000),
    'Lasso_2': Lasso(alpha=1.0, random_state=42, max_iter=2000),
    'Lasso_3': Lasso(alpha=10.0, random_state=42, max_iter=2000),
    
    # RandomForest (3 configuraciones)
    'RandomForest_1': RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1),
    'RandomForest_2': RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1),
    'RandomForest_3': RandomForestRegressor(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1),
    
    # GradientBoosting (3 configuraciones)
    'GradientBoosting_1': GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, random_state=42),
    'GradientBoosting_2': GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, random_state=42),
    'GradientBoosting_3': GradientBoostingRegressor(n_estimators=200, learning_rate=0.01, random_state=42),
}

print(f"\nTotal de modelos a entrenar: {len(models_config)}")
print("5 familias de modelos √ó 3 configuraciones = 15 modelos")

# ==================================================================================
# ENTRENAMIENTO Y EVALUACI√ìN DE MODELOS
# ==================================================================================

print("\n" + "="*80)
print("ENTRENAMIENTO DE MODELOS")
print("="*80)

results = {}
start_time = time.time()

for idx, (model_name, model) in enumerate(models_config.items(), 1):
    try:
        print(f"\n[{idx}/{len(models_config)}] Entrenando {model_name}...")
        
        # Crear pipeline completo con preprocesamiento y modelo
        full_pipeline = Pipeline([
            ('preprocessing', preprocessing_pipeline),
            ('model', model)
        ])
        
        # Entrenar
        model_start = time.time()
        full_pipeline.fit(X_train, y_train)
        model_time = time.time() - model_start
        
        # Predecir en validaci√≥n
        y_pred = full_pipeline.predict(X_val)
        
        # Calcular m√©tricas
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        
        results[model_name] = {
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'training_time': model_time,
            'pipeline': full_pipeline
        }
        
        print(f"  ‚úì RMSE: Q{rmse:,.2f}")
        print(f"  ‚úì MAE: Q{mae:,.2f}")
        print(f"  ‚úì R¬≤: {r2:.4f}")
        print(f"  ‚úì Tiempo: {model_time:.2f}s")
        
    except Exception as e:
        print(f"  ‚úó Error: {str(e)}")
        continue

total_time = time.time() - start_time

print("\n" + "="*80)
print(f"ENTRENAMIENTO COMPLETADO EN {total_time:.2f} SEGUNDOS")
print("="*80)

# ==================================================================================
# SELECCI√ìN DEL MEJOR MODELO
# ==================================================================================

print("\n" + "="*80)
print("RESULTADOS FINALES")
print("="*80)

if len(results) == 0:
    print("ERROR: No se entren√≥ ning√∫n modelo exitosamente")
    raise Exception("No hay modelos entrenados")

results_df = pd.DataFrame({
    'Modelo': list(results.keys()),
    'RMSE': [results[m]['rmse'] for m in results.keys()],
    'MAE': [results[m]['mae'] for m in results.keys()],
    'R¬≤': [results[m]['r2'] for m in results.keys()],
    'Tiempo_seg': [results[m]['training_time'] for m in results.keys()]
})

results_df = results_df.sort_values('RMSE')

print("\n" + "-"*80)
print("RANKING COMPLETO DE MODELOS (ordenado por RMSE)")
print("-"*80)
print(results_df.to_string(index=False))

best_model_name = results_df.iloc[0]['Modelo']
best_model_rmse = results_df.iloc[0]['RMSE']
best_model_r2 = results_df.iloc[0]['R¬≤']

print("\n" + "="*80)
print("üèÜ MODELO GANADOR")
print("="*80)
print(f"Modelo: {best_model_name}")
print(f"RMSE: Q{best_model_rmse:,.2f}")
print(f"R¬≤: {best_model_r2:.4f}")
print("="*80)

# ==================================================================================
# GUARDAR PIPELINE COMPLETO CON MEJOR MODELO
# ==================================================================================

print("\n" + "="*80)
print("GUARDANDO RESULTADOS")
print("="*80)

best_pipeline = results[best_model_name]['pipeline']
pipeline_path = r'C:\Users\hsuna\Desktop\Proyecto Final Product Development\proyecto final v2\repo-seriestemporales-g4-pd\models\ventas_complete_pipeline.pkl'
joblib.dump(best_pipeline, pipeline_path)

print(f"‚úì Pipeline completo guardado en:")
print(f"  {pipeline_path}")

# Guardar resultados de comparaci√≥n
results_path = r'C:\Users\hsuna\Desktop\Proyecto Final Product Development\proyecto final v2\repo-seriestemporales-g4-pd\data\processed\model_comparison_results.csv'
results_df.to_csv(results_path, index=False)
print(f"\n‚úì Resultados de comparaci√≥n guardados en:")
print(f"  {results_path}")

# ==================================================================================
# RESUMEN FINAL
# ==================================================================================

print("\n" + "="*80)
print("RESUMEN DEL PROCESO")
print("="*80)
print(f"""
‚úì Modelos entrenados: {len(results)}/15
‚úì Divisi√≥n temporal: 80% train ({len(train_data):,}) - 20% val ({len(val_data):,})
‚úì M√©trica de selecci√≥n: RMSE
‚úì Modelo ganador: {best_model_name}
‚úì RMSE del mejor modelo: Q{best_model_rmse:,.2f}
‚úì Pipeline completo guardado y listo para inferencia
""")

print("="*80)
print("PROCESO COMPLETADO EXITOSAMENTE")
print("="*80)

ENTRENAMIENTO Y SELECCI√ìN DE MODELOS

Cargando datos...
Total de registros: 73,153
Fecha m√≠nima: 2020-01-02 00:00:00
Fecha m√°xima: 2025-10-22 00:00:00

Sucursales √∫nicas: 7
Productos √∫nicos: 15

DIVISI√ìN TEMPORAL DE DATOS
Datos de entrenamiento: 58,522 registros
  Per√≠odo: 2020-01-02 00:00:00 a 2025-10-22 00:00:00
  Sucursales: 6
  Productos: 15

Datos de validaci√≥n: 14,631 registros
  Per√≠odo: 2020-01-02 00:00:00 a 2025-10-22 00:00:00
  Sucursales: 2
  Productos: 15

‚ö†Ô∏è  Sucursales nuevas en validaci√≥n: {np.int64(12)}

Shape X_train: (58522, 4)
Shape X_val: (14631, 4)

CREANDO PIPELINE DE PREPROCESAMIENTO
‚úì Pipeline creado
‚úì Ajustando pipeline con datos de entrenamiento...
‚úì Pipeline ajustado exitosamente

Total de modelos a entrenar: 15
5 familias de modelos √ó 3 configuraciones = 15 modelos

ENTRENAMIENTO DE MODELOS

[1/15] Entrenando LinearRegression_1...
Advertencia: 9821 valores no vistos en 'Codigo_Sucursal' - usando valor default
  ‚úì RMSE: Q1,351.96
  ‚úì 