# Challenger Model V2: Advanced Optimization

**Goal**: Push the limits of the CatBoost Challenger model (WAPE < 0.25).

**Advanced Strategies (V2)**:
1.  **Hyperparameter Optimization**: Using **Optuna** to find the perfect `learning_rate`, `depth`, and `l2_leaf_reg`.
2.  **Advanced Features**: Adding rolling-window statistics (a 28-day rolling mean and a 7-day rolling standard deviation, both computed on the 7-day sales lag) to capture finer seasonality.
3.  **Ensembling**: Blending CatBoost with a simple Ridge Regression (a fixed 80/20 weighted average, a lightweight stand-in for full stacking) to combine non-linear and linear signals.

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add root to path for imports
# NOTE(review): '..' is resolved against the current working directory, so this
# presumably assumes the notebook is run from a folder one level below the
# project root (where the `src` package lives) — confirm.
sys.path.append('..')
from src.features.features import RetailFeatureEngineer, create_lags

# Display defaults for the rest of the notebook: never truncate DataFrame
# columns when printing, and use seaborn's "whitegrid" theme for plots.
pd.set_option('display.max_columns', None)
sns.set_theme(style="whitegrid")

## 1. Load & Enrich Data (Advanced Features)

In [None]:
# Read the processed daily panel and order it by series (store, family) and
# then by date, so the lag / rolling operations below see rows in
# chronological order within each series.
df = (
    pd.read_parquet('../data/processed/daily_canon.parquet')
    .sort_values(by=['store_nbr', 'family', 'date'])
    .reset_index(drop=True)
)

# --- FEATURE ENGINEERING V2 ---
print("Generating V2 Features...")

# Step 1 - standard lag features from the shared project helper.
df = create_lags(df, lags=[7, 14, 28])

# Step 2 - rolling-window statistics per (store, family) series.
# Both windows run over the `sales_lag_7` column rather than over raw `sales`;
# the original author chose this to avoid leaking the current target into the
# feature. The windows are therefore 28 (resp. 7) consecutive lag-7 values,
# not the 28 (resp. 7) days immediately before the current row.
lag7_by_series = df.groupby(['store_nbr', 'family'])['sales_lag_7']
df['sales_roll_mean_28'] = lag7_by_series.transform(lambda s: s.rolling(28).mean())
df['sales_roll_std_7'] = lag7_by_series.transform(lambda s: s.rolling(7).std())

# Step 3 - project feature pipeline (payday, earthquake, clusters per the
# original notes; the implementation lives in RetailFeatureEngineer).
engineer = RetailFeatureEngineer()
df = engineer.transform(df)

# Drop rows that lack the target or the longest-history features; these are
# mostly the warm-up rows at the start of each series.
required_cols = ['sales_lag_28', 'sales_roll_mean_28', 'sales']
df = df.dropna(subset=required_cols)

print(f"V2 Dataset Ready: {df.shape}")

## 2. Prepare Splits

In [None]:
# Time-based hold-out: rows dated before the cut-off train the model; rows on
# or after it that are flagged `is_train_day == 1` form the validation set.
split_date = '2017-08-01'
mask_train = df['date'] < split_date
mask_val = (df['is_train_day'] == 1) & (df['date'] >= split_date)

# Separate the target from the features, removing the columns that are
# excluded from the feature matrix.
drop_cols = ['sales', 'date', 'id', 'set', 'transactions', 'transactions_missing']
y = df['sales']
X = df.drop(columns=drop_cols)

# Ensure categories are strings for CatBoost (cast every categorical column
# in a single pass).
cat_features = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster']
X = X.astype({col: str for col in cat_features})

X_train, y_train = X[mask_train], y[mask_train]
X_val, y_val = X[mask_val], y[mask_val]

print(f"Train: {len(X_train)} rows | Validation: {len(X_val)} rows")

## 3. Optuna Hyperparameter Tuning
Optimizing CatBoost parameters with **Tweedie** loss.

In [None]:
def objective(trial):
    """Optuna objective: train one CatBoost candidate and return its validation WAPE.

    Samples ``learning_rate``, ``depth`` and ``l2_leaf_reg`` from ``trial``,
    fits a Tweedie-loss CatBoost regressor on the training split with early
    stopping against the validation split, clips the validation predictions
    at zero and scores them with WAPE (sum of absolute errors divided by the
    sum of actuals).

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val``, ``y_val``
    and ``cat_features``.

    Args:
        trial: The Optuna trial used to draw hyperparameters.

    Returns:
        The validation WAPE of the fitted candidate (lower is better).
    """
    # Hyperparameters explored by the study (draw order matters to the sampler).
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
    }
    # Settings held constant across trials; iterations stay low for fast tuning.
    fixed = {
        'iterations': 500,
        'loss_function': 'Tweedie:variance_power=1.5',
        'eval_metric': 'RMSE',
        'random_seed': 42,
        'verbose': False,
        'allow_writing_files': False,
    }

    regressor = CatBoostRegressor(**fixed, **sampled)
    regressor.fit(
        Pool(X_train, y_train, cat_features=cat_features),
        eval_set=Pool(X_val, y_val, cat_features=cat_features),
        early_stopping_rounds=20,
    )

    # Sales cannot be negative, so floor the forecasts at zero before scoring.
    non_negative = np.clip(regressor.predict(X_val), 0, None)

    # Score with WAPE directly so the study optimises the reporting metric.
    abs_error_total = np.sum(np.abs(y_val - non_negative))
    return abs_error_total / np.sum(y_val)

# Run Optimization
# Seed shared by the Optuna sampler and the final CatBoost model. It matches
# the `random_seed` used inside `objective`, so tuning and the production fit
# run under the same randomness and the study is reproducible across runs.
TUNING_SEED = 42

print("Starting Optuna Study (5 Trials for Demo)...")
# TPE is Optuna's default single-objective sampler; it is created explicitly
# here only so that it can be seeded.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=TUNING_SEED),
)
study.optimize(objective, n_trials=5) # Increase n_trials for real impact

print(f"Best WAPE: {study.best_value:.4f}")
print(f"Best Params: {study.best_params}")

# Copy before extending so the study's own record of the tuned values is never
# mutated, then add back the fixed (non-tuned) parameters.
best_params = dict(study.best_params)
best_params.update({
    'iterations': 1000,  # Production iterations
    'loss_function': 'Tweedie:variance_power=1.5',
    'eval_metric': 'RMSE',
    'random_seed': TUNING_SEED,  # same seed the trials were evaluated with
})

## 4. Stacking (Ensemble)
Combine Best CatBoost with a robust linear model (Ridge).

In [None]:
def _wape(actual, predicted):
    """Return WAPE: sum of absolute errors divided by the sum of actuals."""
    return np.sum(np.abs(actual - predicted)) / np.sum(actual)


# 1. Train Best CatBoost
model_cb = CatBoostRegressor(**best_params, verbose=100)
model_cb.fit(X_train, y_train, cat_features=cat_features)
preds_cb = model_cb.predict(X_val)

# 2. Train Simple Ridge (Linear)
# Ridge needs numeric input only, so the categorical (string) columns are
# dropped and remaining gaps are filled with 0.
X_train_lin = X_train.select_dtypes(include=[np.number]).fillna(0)
X_val_lin = X_val.select_dtypes(include=[np.number]).fillna(0)

model_ridge = Ridge(alpha=1.0)
model_ridge.fit(X_train_lin, y_train)
preds_ridge = model_ridge.predict(X_val_lin)

# 3. Blending (Stacking)
# Weighted average: 80% CatBoost (Strong), 20% Ridge (Robust)
weight_cb, weight_ridge = 0.8, 0.2
final_preds = (weight_cb * preds_cb) + (weight_ridge * preds_ridge)
final_preds = np.maximum(final_preds, 0)

# Every score is computed on forecasts floored at zero (sales cannot be
# negative), so the standalone numbers are directly comparable with the blend.
# The raw `preds_cb` / `preds_ridge` arrays are left untouched.
wape_cb = _wape(y_val, np.maximum(preds_cb, 0))
wape_ridge = _wape(y_val, np.maximum(preds_ridge, 0))
wape_blend = _wape(y_val, final_preds)
print(f"---\nCatBoost Only WAPE: {wape_cb:.4f}")
print(f"Ridge Only WAPE: {wape_ridge:.4f}")
print(f"Stacking (80/20) WAPE: {wape_blend:.4f}\n---")