# SmartSus Chef: The Universal Predictive Engine
**Version:** 2.0 (Production Ready) | **Context:** Singapore & China | **Architecture:** Champion-Challenger

## 📖 How to read this Notebook
This engine is designed to predict food demand for F&B operators. It follows a strict pipeline:
1.  **Context Detection:** Where is the restaurant? (SG or CN?) -> Load correct Holidays/Weather.
2.  **Data Ingestion:** Fetch sales history from MySQL (or CSV fallback).
3.  **Sanitation:** Fix "Lazy Employee" data (missing days) using interpolation.
4.  **Evaluation (The Battle):** Hide the most recent 30-day windows (3 expanding-window folds by default), train on the data before each window, and see which model guesses the hidden days better.
5.  **Production Training:** Retrain ALL models (Prophet, CatBoost, XGBoost) on 100% of the data so they are ready for tomorrow.
6.  **Prediction:** Serve the forecast via API logic.

In [None]:
# --- IMPORTS & SETUP ---
# We import standard data libraries (pandas/numpy) and our ML models (Prophet/CatBoost).
import pandas as pd
import numpy as np
import holidays
import pickle
import os
import shap
from sqlalchemy import create_engine
from prophet import Prophet
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import warnings

warnings.filterwarnings('ignore') # Keep the output clean

print("‚úÖ Libraries loaded successfully.")

In [None]:
# --- CONFIGURATION ---
# Cross-validation layout used by the evaluation functions.
N_CV_FOLDS = 3            # Time-series CV folds (increase when data grows)
TEST_WINDOW_DAYS = 30     # Days per test fold
MIN_TRAIN_DAYS = 60       # Minimum calendar span (days) of training data for a fold to be scored
RANDOM_SEED = 42          # Seed for the tree models and the simulated weather generators
TREE_DEPTH_GRID = [3, 4, 6]  # Candidate tree depths tried for CatBoost and XGBoost
HOLIDAY_YEARS = [2024, 2025, 2026]  # Years passed to the holidays.SG / holidays.CN calendars

# Feature lists for tree-based models (CatBoost/XGBoost).
# SHAP grouping depends on this order: Seasonality[0:2], Holiday[2], Weather[3:5], Lags[5:10]
TREE_FEATURES = ['day_of_week', 'month', 'is_public_holiday',
                 'rain_lunch_vol', 'temperature',
                 'lag_1', 'lag_7', 'rolling_mean_7', 'rolling_mean_14', 'lag_same_weekday_avg']
# Columns handed to CatBoost as `cat_features`.
TREE_CAT_FEATURES = ['day_of_week', 'month', 'is_public_holiday']

def safe_filename(name):
    """Return ``name`` with spaces, hyphens and slashes swapped for underscores."""
    # One translation pass instead of three chained replace() calls.
    unsafe_to_underscore = str.maketrans({' ': '_', '-': '_', '/': '_'})
    return name.translate(unsafe_to_underscore)

## 📍 Step 1: Context Awareness (Location Logic)
The model needs to know if it is in **Singapore** (Tropical, Hari Raya) or **China** (4-Seasons, Lunar New Year).
We determine this automatically using the restaurant's GPS coordinates.

In [None]:
def get_country_code(lat, lon):
    """
    Map GPS coordinates to a country code with a bounding-box test.

    Returns 'SG' when the point lies inside the Singapore box
    (latitude 1.1..1.5, longitude 103.5..104.1, bounds inclusive);
    every other location falls back to 'CN'.
    """
    # Latitude is checked first; longitude is only looked at when it passes.
    if not (1.1 <= lat <= 1.5):
        return 'CN'
    if not (103.5 <= lon <= 104.1):
        return 'CN'
    return 'SG'

def estimate_temperature(date, country_code):
    """
    Produce a simulated temperature for ``date`` in ``country_code``.

    The generator is seeded with RANDOM_SEED plus the date's ordinal, so a
    given date always yields the same value. 'SG' draws from one fixed range
    all year; any other country code draws from a range chosen by month.

    TODO: Replace with real weather API (e.g., OpenWeatherMap) when available.
          This function is the single place to swap in real data.
    """
    rng = np.random.default_rng(RANDOM_SEED + date.toordinal())
    month = date.month

    # Pick the (low, high) bounds first, then draw exactly once.
    if country_code == 'SG':
        low, high = 25, 34
    elif month in (12, 1, 2):
        low, high = 2, 10
    elif month in (6, 7, 8):
        low, high = 26, 38
    else:
        # Months 3-5 and 9-11 share the same range.
        low, high = 12, 25

    return rng.uniform(low, high)

def add_local_context(df, lat, lon):
    """
    Attach calendar, holiday and simulated-weather columns to the sales data.

    Adds 'day_of_week', 'month', 'is_public_holiday', 'rain_lunch_vol' and
    'temperature' to ``df`` (the frame is modified in place) and returns
    ``(df, country_code)``. ``df`` must have a datetime 'date' column.
    """
    country_code = get_country_code(lat, lon)
    print(f"üåç Detected Location: {country_code} (Lat: {lat}, Lon: {lon})")

    sale_dates = df['date']
    df['day_of_week'] = sale_dates.dt.dayofweek
    df['month'] = sale_dates.dt.month

    in_singapore = country_code == 'SG'

    # Holiday calendar for the detected country.
    calendar_cls = holidays.SG if in_singapore else holidays.CN
    local_holidays = calendar_cls(years=HOLIDAY_YEARS)

    def holiday_flag(day):
        return 1 if day in local_holidays else 0

    df['is_public_holiday'] = df['date'].apply(holiday_flag)

    # Simulated rainfall: one shared generator, consumed row by row, so the
    # values depend on row order.
    rng = np.random.default_rng(RANDOM_SEED)
    if in_singapore:
        wet_months, wet_range, dry_range = (11, 12, 1), (15, 60), (5, 30)
    else:
        wet_months, wet_range, dry_range = (6, 7, 8), (20, 80), (5, 25)

    def estimate_rain(row):
        low, high = wet_range if row['month'] in wet_months else dry_range
        return rng.uniform(low, high)

    df['rain_lunch_vol'] = df.apply(estimate_rain, axis=1)

    def temperature_for(day):
        return estimate_temperature(day, country_code)

    df['temperature'] = df['date'].apply(temperature_for)

    return df, country_code

## 💾 Step 2: Data Ingestion & Sanitation
Here we connect to the database. Crucially, we apply the **"Anti-Lazy Employee"** fix.
If the database has holes (missing days), the model will think sales were 0. We must fix this.

In [None]:
def fetch_training_data():
    """
    Load the sales history as a DataFrame with 'date', 'dish' and 'sales'.

    First tries MySQL. The connection URL is read from the SMARTSUS_DB_URL
    environment variable and falls back to the local development URL when the
    variable is not set. If anything in the database path fails (driver
    missing, server unreachable, query error), the reason is printed and the
    data is read from 'food_sales_eng.csv' instead, sorted by date.

    Returns:
        pandas.DataFrame with a datetime 'date' column.

    Raises:
        Whatever ``pd.read_csv`` / ``pd.to_datetime`` raise when the CSV
        fallback is also unavailable or malformed.
    """
    # Allow deployments to supply credentials without editing the notebook.
    db_url = os.environ.get(
        "SMARTSUS_DB_URL",
        "mysql+pymysql://root:password123@localhost:3306/SmartSusChef",
    )
    query = """
        SELECT s.Date as date, r.Name as dish, s.QuantitySold as sales
        FROM Sales s JOIN Recipes r ON s.RecipeId = r.Id
        ORDER BY s.Date ASC
        """

    try:
        engine = create_engine(db_url)
        try:
            df = pd.read_sql(query, engine)
        finally:
            # Release pooled connections whether or not the query succeeded.
            engine.dispose()
        df['date'] = pd.to_datetime(df['date'])
        print(f"‚úÖ Loaded {len(df)} rows from MySQL.")
        return df
    except Exception as exc:
        # Deliberate best-effort: any database problem triggers the CSV path,
        # but the cause is reported so it is not mistaken for "not configured".
        print("‚ö†Ô∏è MySQL Connection failed (or not configured). Falling back to CSV.")
        print(f"   Reason: {type(exc).__name__}: {exc}")
        df = pd.read_csv('food_sales_eng.csv')
        df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
        return df.sort_values('date')

def sanitize_sparse_data(df, country_code):
    """
    The 'Anti-Lazy Employee' logic: rebuild a gap-free daily series for one dish.

    Steps:
      1. Reindex onto every calendar day between the first and last date.
      2. Blank out the sales of any weekday whose record count is below half
         of the average weekday count.
      3. Time-interpolate 'sales' (remaining gaps become 0).
      4. Fill 'dish', 'rain_lunch_vol' and 'temperature', then recompute the
         holiday flag and calendar columns for every day.

    Returns a new frame with 'date' restored as an ordinary column.
    """
    first_day, last_day = df['date'].min(), df['date'].max()
    calendar = pd.date_range(start=first_day, end=last_day, freq='D')
    df = df.set_index('date').reindex(calendar)

    # Weekdays with suspiciously few records are treated as unrecorded.
    weekday = df.index.dayofweek
    records_per_weekday = df.groupby(weekday)['sales'].count()
    threshold = records_per_weekday.mean() * 0.5
    weak_days = records_per_weekday[records_per_weekday < threshold].index
    if len(weak_days) > 0:
        df.loc[weekday.isin(weak_days), 'sales'] = np.nan

    df['sales'] = df['sales'].interpolate(method='time').fillna(0)

    if 'dish' in df.columns:
        known_names = df['dish'].dropna()
        df['dish'] = "Unknown" if known_names.empty else known_names.iloc[0]

    # Weather columns: interpolate when present, otherwise create them as 0.0.
    for weather_col in ('rain_lunch_vol', 'temperature'):
        if weather_col in df.columns:
            df[weather_col] = df[weather_col].interpolate(method='time').bfill().ffill()
        else:
            df[weather_col] = 0.0

    holiday_cls = holidays.SG if country_code == 'SG' else holidays.CN
    local_holidays = holiday_cls(years=HOLIDAY_YEARS)

    def holiday_flag(day):
        return 1 if day in local_holidays else 0

    df['is_public_holiday'] = df.index.to_series().apply(holiday_flag)
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month

    return df.reset_index().rename(columns={'index': 'date'})

def add_lag_features(df):
    """
    Return a date-sorted copy of ``df`` with lag/rolling sales features added.

    New columns: 'lag_1', 'lag_7', 'rolling_mean_7', 'rolling_mean_14'
    (both rolling means look only at days before the current one) and
    'lag_same_weekday_avg' (mean of sales 7, 14, 21 and 28 days back).
    Rows containing any missing value are dropped, which removes the
    warm-up rows at the start of the series.
    """
    ordered = df.sort_values('date')
    sales = ordered['sales']

    previous_day = sales.shift(1)
    weeks_back = {days: sales.shift(days) for days in (7, 14, 21, 28)}

    ordered['lag_1'] = previous_day
    ordered['lag_7'] = weeks_back[7]
    ordered['rolling_mean_7'] = previous_day.rolling(7).mean()
    ordered['rolling_mean_14'] = previous_day.rolling(14).mean()
    ordered['lag_same_weekday_avg'] = (
        weeks_back[7] + weeks_back[14] + weeks_back[21] + weeks_back[28]
    ) / 4

    return ordered.dropna()

## 🏆 Step 3: The Evaluator (Backtesting)
Before we trust a model, we must test it. 
We calculate **MAE (Mean Absolute Error)**: *"On average, how many plates are we wrong by?"*

**Logic:**
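1. Hide a 30-day window of sales (repeated for up to 3 expanding-window folds, most recent last).
2. Train the model on the data before that window.
3. Ask the model to predict those hidden 30 days.
4. Compare Prediction vs. Reality, then average the MAE across the folds.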

In [None]:
def _evaluate_tree(df, end_date, features, make_model, param_key):
    """
    Expanding-window CV with depth tuning, shared by the tree-based models.

    For every depth in TREE_DEPTH_GRID, up to N_CV_FOLDS test windows of
    TEST_WINDOW_DAYS days are scored, oldest first. A fold is skipped when
    its training data spans fewer than MIN_TRAIN_DAYS days or the test window
    is empty. The depth with the lowest mean MAE wins.

    Args:
        df: frame with 'date', 'sales' and every column named in ``features``.
        end_date: exclusive upper bound of the most recent test window.
        features: feature column names passed to fit/predict.
        make_model: callable taking a depth and returning an unfitted model.
        param_key: key under which the chosen depth is reported.

    Returns:
        (mae, last_fold_predictions, {param_key: depth}). When no fold could
        be scored the result is (999.0, None, {param_key: first grid depth}).
    """
    window = pd.Timedelta(days=TEST_WINDOW_DAYS)

    best_score = float('inf')
    best_depth = TREE_DEPTH_GRID[0]
    best_preds = None

    for depth in TREE_DEPTH_GRID:
        scores = []
        newest_fold = None

        # windows_back counts how many test windows lie after this fold's window.
        for windows_back in reversed(range(N_CV_FOLDS)):
            test_end = end_date - pd.Timedelta(days=TEST_WINDOW_DAYS * windows_back)
            test_start = test_end - window

            is_train = df['date'] < test_start
            is_test = (df['date'] >= test_start) & (df['date'] < test_end)
            train = df[is_train].copy()
            test = df[is_test].copy()

            if len(train) > 1:
                train_span = (train['date'].max() - train['date'].min()).days
            else:
                train_span = 0
            if train_span < MIN_TRAIN_DAYS or len(test) < 1:
                continue

            model = make_model(depth)
            model.fit(train[features], train['sales'])
            # Sales cannot be negative, so predictions are floored at zero.
            clipped = np.maximum(model.predict(test[features]), 0)

            scores.append(mean_absolute_error(test['sales'], clipped))
            newest_fold = {
                'dates': test['date'].values,
                'actual': test['sales'].values,
                'predicted': clipped
            }

        if scores:
            mean_score = np.mean(scores)
            if mean_score < best_score:
                best_score = mean_score
                best_depth = depth
                best_preds = newest_fold

    chosen = {param_key: best_depth}
    if best_score == float('inf'):
        # Sentinel score: no depth produced a single usable fold.
        return 999.0, None, chosen
    return round(best_score, 2), best_preds, chosen


def evaluate_model(df, model_type, country_code):
    """
    Expanding-window time-series cross-validation for one model family.

    Args:
        df: frame with 'date', 'sales', 'rain_lunch_vol' and 'temperature';
            the tree models additionally need every column in TREE_FEATURES.
        model_type: 'prophet', 'catboost' or 'xgboost'.
        country_code: country name handed to Prophet's holiday calendar.

    Returns:
        (average_mae, last_fold_predictions_dict, best_params).
        When no fold has enough data the MAE is the sentinel 999.0 and the
        predictions are None. ``best_params`` is empty for Prophet and holds
        the tuned depth for the tree models.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
            (Previously this fell through and returned None, which made the
            caller's tuple unpacking fail with an unrelated TypeError.)
    """
    known_types = ('prophet', 'catboost', 'xgboost')
    if model_type not in known_types:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {known_types}")

    end_date = df['date'].max()

    # --- PROPHET ---
    if model_type == 'prophet':
        fold_maes = []
        last_fold_preds = None

        # Folds run oldest -> newest so last_fold_preds ends on the latest window.
        for fold_i in range(N_CV_FOLDS, 0, -1):
            test_end = end_date - pd.Timedelta(days=TEST_WINDOW_DAYS * (fold_i - 1))
            test_start = test_end - pd.Timedelta(days=TEST_WINDOW_DAYS)

            train = df[df['date'] < test_start].copy()
            test = df[(df['date'] >= test_start) & (df['date'] < test_end)].copy()

            train_span = (train['date'].max() - train['date'].min()).days if len(train) > 1 else 0
            if train_span < MIN_TRAIN_DAYS or len(test) < 1:
                continue

            p_train = train[['date', 'sales', 'rain_lunch_vol', 'temperature']].rename(
                columns={'date': 'ds', 'sales': 'y'})
            m = Prophet(daily_seasonality=False)
            try:
                m.add_country_holidays(country_name=country_code)
            except Exception:
                # Best-effort: evaluate without a holiday component when the
                # calendar for this country cannot be attached.
                pass
            m.add_regressor('rain_lunch_vol')
            m.add_regressor('temperature')
            m.fit(p_train)

            p_test = test[['date', 'rain_lunch_vol', 'temperature']].rename(columns={'date': 'ds'})
            forecast = m.predict(p_test)
            # Sales cannot be negative, so predictions are floored at zero.
            predicted_values = np.maximum(forecast['yhat'].values, 0)

            mae = mean_absolute_error(test['sales'], predicted_values)
            fold_maes.append(mae)

            last_fold_preds = {
                'dates': test['date'].values,
                'actual': test['sales'].values,
                'predicted': predicted_values
            }

        if not fold_maes:
            return 999.0, None, {}

        avg_mae = round(np.mean(fold_maes), 2)
        return avg_mae, last_fold_preds, {}

    # --- CATBOOST (delegates to shared helper) ---
    if model_type == 'catboost':
        def make_model(depth):
            return CatBoostRegressor(iterations=300, depth=depth, cat_features=TREE_CAT_FEATURES,
                                     random_seed=RANDOM_SEED, verbose=False)
        return _evaluate_tree(df, end_date, TREE_FEATURES, make_model, 'depth')

    # --- XGBOOST (delegates to shared helper) ---
    def make_model(depth):
        return XGBRegressor(n_estimators=300, max_depth=depth, learning_rate=0.05,
                            random_state=RANDOM_SEED, n_jobs=-1)
    return _evaluate_tree(df, end_date, TREE_FEATURES, make_model, 'max_depth')

## 🚀 Step 4: The Main Loop (Train, Evaluate, Save)
This runs for every dish on the menu.
It prints the "Champion" (Lowest Error) but saves **ALL THREE** models (Prophet, CatBoost, XGBoost) so the API has a backup.

In [None]:
# --- TRAINING PIPELINE (SAVES CHAMPIONS) ---
def train_and_evaluate(df, country_code):
    """
    Evaluate, retrain and persist Prophet, CatBoost and XGBoost for every dish.

    For each distinct value in ``df['dish']`` this:
      1. builds a gap-free daily series (sanitize_sparse_data) and a lagged
         copy of it (add_lag_features) for the tree models;
      2. cross-validates the three models and records the one with the
         lowest MAE as the dish's champion;
      3. refits all three models on the full series and pickles them, along
         with the last 28 days of sales, under the 'models/' directory.

    Finally the champion registry is pickled to 'models/champion_registry.pkl'
    and the in-memory model cache is cleared.

    Args:
        df: enriched sales frame with at least 'date', 'dish' and 'sales'.
        country_code: country code used for holiday calendars.

    Returns:
        (results_df, all_predictions): a leaderboard DataFrame with one row
        per dish, and a dict mapping dish -> model name -> last-fold
        predictions (or None when that model could not be evaluated).
    """
    unique_dishes = df['dish'].unique()
    os.makedirs('models', exist_ok=True)
    
    results_list = []       # one leaderboard row per dish
    champion_map = {}       # dish -> winning model name and MAEs (pickled as the registry)
    all_predictions = {}    # dish -> {'prophet'|'catboost'|'xgboost': last-fold predictions}

    print(f"\n{'='*95}")
    print(f"STARTING 3-WAY TRAINING FOR {len(unique_dishes)} DISHES IN {country_code}")
    print(f"{'DISH NAME':<35} | {'PROPHET':<10} | {'CATBOOST':<15} | {'XGBOOST':<15} | {'WINNER':<10}")
    print(f"{'='*95}")

    for dish_name in unique_dishes:
        safe_name = safe_filename(dish_name)

        # 1. Isolate Dish Data
        dish_data = df[df['dish'] == dish_name].copy()
        dish_data = sanitize_sparse_data(dish_data, country_code)
        # Lag features drop the warm-up rows, so the tree models see a shorter series.
        dish_data_with_lags = add_lag_features(dish_data.copy())

        # 2. EVALUATION PHASE
        # Prophet is scored on the full series; the tree models on the lagged one.
        p_error, p_preds, p_params = evaluate_model(dish_data, 'prophet', country_code)
        c_error, c_preds, c_params = evaluate_model(dish_data_with_lags, 'catboost', country_code)
        x_error, x_preds, x_params = evaluate_model(dish_data_with_lags, 'xgboost', country_code)

        # Determine Winner
        # min() keeps the first minimum, so ties resolve in the order
        # PROPHET, CATBOOST, XGBOOST.
        scores = {'PROPHET': p_error, 'CATBOOST': c_error, 'XGBOOST': x_error}
        winner = min(scores, key=scores.get)
        winning_mae = scores[winner]
        champion_map[dish_name] = {
            'model': winner.lower(),
            'mae': winning_mae,
            'all_mae': {'prophet': p_error, 'catboost': c_error, 'xgboost': x_error}
        }

        preds_map = {'prophet': p_preds, 'catboost': c_preds, 'xgboost': x_preds}
        all_predictions[dish_name] = preds_map

        c_depth_str = f"{c_error} (d={c_params.get('depth', '?')})"
        x_depth_str = f"{x_error} (d={x_params.get('max_depth', '?')})"
        print(f"{dish_name:<35} | {p_error:<10} | {c_depth_str:<15} | {x_depth_str:<15} | {winner:<10}")

        results_list.append({
            'Dish': dish_name,
            'Prophet MAE': p_error,
            'CatBoost MAE': c_error,
            'CB Depth': c_params.get('depth', '?'),
            'XGBoost MAE': x_error,
            'XGB Depth': x_params.get('max_depth', '?'),
            'Winner': winner
        })

        # 3. PRODUCTION TRAINING PHASE
        # All three models are refit on the complete series and saved,
        # regardless of which one won the evaluation.

        # Prophet
        p_df = dish_data[['date', 'sales', 'rain_lunch_vol', 'temperature']].rename(
            columns={'date': 'ds', 'sales': 'y'})
        mp = Prophet(daily_seasonality=False)
        try:
            mp.add_country_holidays(country_name=country_code)
        except Exception:
            # Best-effort: the model is trained without holidays if this fails.
            pass
        mp.add_regressor('rain_lunch_vol')
        mp.add_regressor('temperature')
        mp.fit(p_df)
        with open(f'models/prophet_{safe_name}.pkl', 'wb') as f:
            pickle.dump(mp, f)

        # CatBoost (tuned depth)
        cb_depth = c_params.get('depth', 6)
        mc = CatBoostRegressor(iterations=300, depth=cb_depth, cat_features=TREE_CAT_FEATURES,
                               random_seed=RANDOM_SEED, verbose=False)
        mc.fit(dish_data_with_lags[TREE_FEATURES], dish_data_with_lags['sales'])
        with open(f'models/catboost_{safe_name}.pkl', 'wb') as f:
            pickle.dump(mc, f)

        # XGBoost (tuned depth)
        xgb_depth = x_params.get('max_depth', 6)
        mx = XGBRegressor(n_estimators=300, max_depth=xgb_depth, learning_rate=0.05,
                          random_state=RANDOM_SEED, n_jobs=-1)
        mx.fit(dish_data_with_lags[TREE_FEATURES], dish_data_with_lags['sales'])
        with open(f'models/xgboost_{safe_name}.pkl', 'wb') as f:
            pickle.dump(mx, f)

        # Save recent sales history for lag computation at prediction time
        # (28 rows covers the longest lag used, lag_same_weekday_avg).
        recent_sales = dish_data[['date', 'sales']].tail(28).copy()
        with open(f'models/recent_sales_{safe_name}.pkl', 'wb') as f:
            pickle.dump(recent_sales, f)

    # SAVE THE REGISTRY
    with open('models/champion_registry.pkl', 'wb') as f:
        pickle.dump(champion_map, f)

    # Drop cached objects so later predictions reload the files just written.
    clear_model_cache()

    print(f"{'='*95}\n‚úÖ All models saved. Champion Registry updated.")
    return pd.DataFrame(results_list), all_predictions

## Step 5: Prediction API
This simulates the API call. Loads saved model and predicts for a specific future date.

In [None]:
# --- MODEL CACHE ---
# Maps a pickle file path to the object unpickled from it, so repeated
# predictions do not re-read the same file from disk.
_model_cache = {}

def _load_cached(filepath):
    """
    Return the object pickled at ``filepath``, reading the file at most once.

    The first call for a path unpickles the file and remembers the result in
    the module-level cache; later calls return the remembered object.
    Only use this on pickle files written by this pipeline.
    """
    try:
        return _model_cache[filepath]
    except KeyError:
        pass  # not loaded yet, fall through to the disk read

    with open(filepath, 'rb') as handle:
        loaded = pickle.load(handle)
    _model_cache[filepath] = loaded
    return loaded

def clear_model_cache():
    """Empty the in-memory pickle cache so the next load re-reads from disk."""
    _model_cache.clear()

def _predict_tree(model_obj, future, cols, dish_mae):
    """
    Shared prediction + SHAP explanation logic for tree models.

    Args:
        model_obj: fitted model exposing ``predict``.
        future: single-row frame holding every column named in ``cols``.
        cols: feature columns, expected in TREE_FEATURES order because the
            SHAP values are grouped by position.
        dish_mae: MAE used as the half-width of the confidence interval.

    Returns:
        (qty, pred_lower, pred_upper, expl). All three quantities are
        integers floored at zero. ``expl`` maps "Trend", "Seasonality",
        "Holiday" and "Weather" to their contribution; when the SHAP step
        fails, the whole prediction is reported as "Trend".
    """
    pred = float(model_obj.predict(future[cols])[0])
    qty = int(max(0, pred))
    pred_lower = int(max(0, pred - dish_mae))
    # Floor the upper bound too: with a negative raw prediction it could
    # otherwise fall below the reported quantity (which is floored at 0).
    pred_upper = int(max(0, pred + dish_mae))

    try:
        ex = shap.TreeExplainer(model_obj)
        sv = ex.shap_values(future[cols])[0]
        base = float(ex.expected_value)
        # SHAP grouping matches TREE_FEATURES order:
        # Seasonality: day_of_week(0) + month(1)
        seasonality = float(sv[0] + sv[1])
        # Holiday: is_public_holiday(2)
        holiday = float(sv[2])
        # Weather: rain_lunch_vol(3) + temperature(4)
        weather = float(sv[3] + sv[4])
        # Lags: indices 5..9
        lags = float(sv[5] + sv[6] + sv[7] + sv[8] + sv[9])
        expl = {
            # The explainer's base value plus the lag contributions is
            # reported together as the "Trend" component.
            "Trend": round(base + lags, 1),
            "Seasonality": round(seasonality, 1),
            "Holiday": round(holiday, 1),
            "Weather": round(weather, 1)
        }
    except Exception:
        # Best-effort: an explanation failure must not block the forecast.
        expl = {"Trend": round(pred, 1), "Seasonality": 0.0,
                "Holiday": 0.0, "Weather": 0.0}

    return qty, pred_lower, pred_upper, expl

# --- PREDICTION API (AUTO-SELECT) ---
def get_prediction(dish, date_str, lat, lon, rain_forecast=0, model='auto'):
    """
    Predicts using the BEST model for the specific dish (unless overridden).
    Returns prediction with MAE-based confidence intervals and explanation.
    CI semantics: [prediction - MAE, prediction + MAE] for all model types,
    with both bounds floored at zero.

    Args:
        dish: dish name as used during training.
        date_str: target date, anything ``pd.to_datetime`` accepts.
        lat, lon: restaurant coordinates, used to pick the country context.
        rain_forecast: rain value fed to the 'rain_lunch_vol' feature.
        model: 'auto' (use the champion from the registry, falling back to
            'prophet' when the registry is unavailable), or one of
            'prophet', 'catboost', 'xgboost'.

    Returns:
        dict with "Dish", "Date", "Model Used", "Prediction",
        "Prediction_Lower", "Prediction_Upper" and "Explanation"; or
        ``{"Error": message}`` when the model cannot be loaded or run, or
        when ``model`` is not a supported name.
    """
    dt = pd.to_datetime(date_str)
    country = get_country_code(lat, lon)
    safe_name = safe_filename(dish)

    dish_mae = 0.0

    # 1. REGISTRY LOOKUP (always, for MAE - even when model is manually specified)
    try:
        registry = _load_cached('models/champion_registry.pkl')
        if model == 'auto':
            model = registry[dish]['model']
        dish_mae = registry[dish]['all_mae'].get(model, 0.0)
    except Exception:
        # Missing registry or unknown dish: keep MAE at 0 and default the model.
        if model == 'auto':
            model = 'prophet'

    # 2. Rebuild Context
    local_hols = holidays.SG() if country == 'SG' else holidays.CN()
    is_hol = 1 if dt in local_hols else 0
    temp = estimate_temperature(dt, country)

    future = pd.DataFrame({
        'ds': [dt],
        'rain_lunch_vol': [rain_forecast],
        'temperature': [temp],
        'is_public_holiday': [is_hol],
        'day_of_week': [dt.dayofweek],
        'month': [dt.month]
    })

    # 3. For tree models, compute lag features from recent sales history
    # NOTE: the lags come from the last recorded sales, regardless of how far
    # ahead of that history the requested date lies.
    if model in ('catboost', 'xgboost'):
        try:
            recent = _load_cached(f'models/recent_sales_{safe_name}.pkl')
            sales_vals = recent['sales'].values
            future['lag_1'] = sales_vals[-1] if len(sales_vals) >= 1 else 0
            future['lag_7'] = sales_vals[-7] if len(sales_vals) >= 7 else 0
            future['rolling_mean_7'] = np.mean(sales_vals[-7:]) if len(sales_vals) >= 7 else np.mean(sales_vals)
            future['rolling_mean_14'] = np.mean(sales_vals[-14:]) if len(sales_vals) >= 14 else np.mean(sales_vals)
            weekday_vals = []
            for w in [7, 14, 21, 28]:
                if len(sales_vals) >= w:
                    weekday_vals.append(sales_vals[-w])
            future['lag_same_weekday_avg'] = np.mean(weekday_vals) if weekday_vals else 0
        except Exception:
            # No usable history: fall back to all-zero lag features.
            future['lag_1'] = 0
            future['lag_7'] = 0
            future['rolling_mean_7'] = 0
            future['rolling_mean_14'] = 0
            future['lag_same_weekday_avg'] = 0

    try:
        # --- PROPHET ---
        if model == 'prophet':
            mp = _load_cached(f'models/prophet_{safe_name}.pkl')
            fcst = mp.predict(future[['ds', 'rain_lunch_vol', 'temperature']])
            yhat = fcst['yhat'].values[0]
            qty = int(max(0, yhat))

            pred_lower = int(max(0, yhat - dish_mae))
            # Floor the upper bound as well so it never drops below qty.
            pred_upper = int(max(0, yhat + dish_mae))

            def component(name):
                # Training ignores add_country_holidays failures, so a saved
                # model may lack a holiday component; a missing forecast
                # column is treated as a zero contribution instead of
                # failing the whole prediction.
                return fcst[name].values[0] if name in fcst.columns else 0.0

            trend = fcst['trend'].values[0]
            holiday = component('holidays')
            weather = component('extra_regressors_additive')
            # Whatever is not trend, holiday or weather is reported as seasonality.
            seasonality = yhat - trend - holiday - weather

            expl = {
                "Trend": round(trend, 1),
                "Seasonality": round(seasonality, 1),
                "Holiday": round(holiday, 1),
                "Weather": round(weather, 1)
            }

        # --- TREE MODELS (shared logic) ---
        elif model in ('catboost', 'xgboost'):
            tree_model = _load_cached(f'models/{model}_{safe_name}.pkl')
            qty, pred_lower, pred_upper, expl = _predict_tree(tree_model, future, TREE_FEATURES, dish_mae)

        else:
            # Report the real cause instead of an unbound-variable error.
            raise ValueError(f"unknown model {model!r}")

        return {
            "Dish": dish,
            "Date": date_str,
            "Model Used": model.upper(),
            "Prediction": qty,
            "Prediction_Lower": pred_lower,
            "Prediction_Upper": pred_upper,
            "Explanation": expl
        }

    except Exception as e:
        return {"Error": f"Model error for {dish}: {str(e)}"}

## 🔮 Step 6: Execution Block
This simulates the command you would run from your terminal.

In [None]:
# --- EXECUTION TEST ---
# End-to-end run: load data, enrich it, train/evaluate every dish, show results.

# 1. Load Data
raw_df = fetch_training_data()

# 2. Define Location
# These coordinates fall outside the Singapore bounding box, so
# get_country_code resolves them to 'CN'.
lat_in, lon_in = 31.23, 121.47 

# 3. Add Context
# add_local_context adds its columns to raw_df in place and returns the same frame.
enriched_df, country = add_local_context(raw_df, lat_in, lon_in)

# 4. Run Training Pipeline
# Writes the model pickles and the champion registry under 'models/'.
results_table, all_predictions = train_and_evaluate(enriched_df, country)

# 5. Show Leaderboard
print(f"\n{'='*50}")
print(f"MODEL LEADERBOARD (Lower MAE is Better)")
print(f"{'='*50}")
display(results_table)

In [None]:
# --- VISUALIZATION A: MAE Comparison Bar Chart ---
# Grouped bars: one group per dish, one bar per model.
fig, ax = plt.subplots(figsize=(16, 6))

dishes = results_table['Dish']
x = np.arange(len(dishes))   # one x position per dish
width = 0.25                 # bar width; three bars fit side by side per dish

bars_p = ax.bar(x - width, results_table['Prophet MAE'], width, label='Prophet', color='#4C72B0')
bars_c = ax.bar(x, results_table['CatBoost MAE'], width, label='CatBoost', color='#DD8452')
bars_x = ax.bar(x + width, results_table['XGBoost MAE'], width, label='XGBoost', color='#55A868')

# Highlight the winner for each dish with a star marker
# (x[i] relies on results_table having its default 0..n-1 index).
for i, row in results_table.iterrows():
    winner_mae = min(row['Prophet MAE'], row['CatBoost MAE'], row['XGBoost MAE'])
    # Horizontal offset selects the winning model's bar within the group.
    if row['Winner'] == 'PROPHET':
        offset = -width
    elif row['Winner'] == 'CATBOOST':
        offset = 0
    else:
        offset = width
    ax.plot(x[i] + offset, winner_mae, marker='*', color='gold', markersize=14, zorder=5)

ax.set_xlabel('Dish')
ax.set_ylabel('MAE (plates)')
ax.set_title('Model MAE Comparison by Dish (lower is better)')
ax.set_xticks(x)
ax.set_xticklabels(dishes, rotation=45, ha='right', fontsize=8)
ax.legend()
ax.yaxis.set_minor_locator(ticker.AutoMinorLocator())
plt.tight_layout()
plt.show()

In [None]:
# --- VISUALIZATION B: Actual vs Predicted (Last Fold, Winning Model) ---
# One small subplot per dish, laid out on a grid four columns wide.
n_dishes = len(results_table)
ncols = 4
nrows = int(np.ceil(n_dishes / ncols))

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(18, 4 * nrows), squeeze=False)

for idx, (_, row) in enumerate(results_table.iterrows()):
    ax = axes[idx // ncols][idx % ncols]
    dish = row['Dish']
    winner = row['Winner'].lower()
    
    # Predictions are None when the winning model had no usable CV fold;
    # the subplot is then left empty apart from its title.
    preds = all_predictions.get(dish, {}).get(winner)
    if preds is not None:
        dates = pd.to_datetime(preds['dates'])
        ax.plot(dates, preds['actual'], label='Actual', color='#333333', linewidth=1.5)
        ax.plot(dates, preds['predicted'], label='Predicted', color='#E24A33', linewidth=1.5, linestyle='--')
        # Shade the gap between actual and predicted.
        ax.fill_between(dates, preds['actual'], preds['predicted'], alpha=0.15, color='#E24A33')
    
    ax.set_title(f"{dish}\n({winner.upper()})", fontsize=8, fontweight='bold')
    ax.tick_params(axis='x', rotation=30, labelsize=6)
    ax.tick_params(axis='y', labelsize=7)
    # A single legend on the first subplot is enough; all share the same styling.
    if idx == 0:
        ax.legend(fontsize=7)

# Hide unused subplots
for idx in range(n_dishes, nrows * ncols):
    axes[idx // ncols][idx % ncols].set_visible(False)

fig.suptitle('Actual vs Predicted Sales (Last CV Fold, Winning Model)', fontsize=13, y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# --- PREDICTIONS: Forecast for Every Dish ---
# Requests a forecast for each dish, tabulates the successful ones, reports
# the failed ones, and plots the quantities with their MAE-based intervals.
from matplotlib.patches import Patch

forecast_date = '2026-05-20'
rain_input = 10.0

predictions = []
failed_predictions = []
for dish_name in enriched_df['dish'].unique():
    res = get_prediction(
        dish=dish_name,
        date_str=forecast_date,
        lat=lat_in, lon=lon_in,
        rain_forecast=rain_input
    )
    if 'Error' in res:
        # Keep the message so a missing dish is visible rather than silent.
        failed_predictions.append(res['Error'])
        continue
    explanation = res['Explanation']
    predictions.append({
        'Dish': res['Dish'],
        'Predicted Qty': res['Prediction'],
        'Lower': res['Prediction_Lower'],
        'Upper': res['Prediction_Upper'],
        'Model': res['Model Used'],
        'Trend': explanation['Trend'],
        'Seasonality': explanation['Seasonality'],
        'Holiday': explanation['Holiday'],
        'Weather': explanation['Weather']
    })

pred_df = pd.DataFrame(predictions)

print(f"Forecast for: {forecast_date} | Rain: {rain_input}mm")
print("Explanation: Trend + Seasonality + Holiday + Weather = Predicted Qty")
print(f"{'='*90}")
display(pred_df)

for message in failed_predictions:
    print(f"Skipped: {message}")

if pred_df.empty:
    # Without rows the frame has no columns, so the chart code cannot run.
    print("No successful predictions to plot.")
else:
    # Bar chart of predicted quantities with confidence interval error bars
    fig, ax = plt.subplots(figsize=(16, 6))
    colors_map = {'PROPHET': '#4C72B0', 'CATBOOST': '#DD8452', 'XGBOOST': '#55A868'}
    bar_colors = [colors_map.get(m, '#999999') for m in pred_df['Model']]

    # Compute asymmetric error bars from Lower/Upper.
    # Error-bar lengths must be non-negative, so they are clipped at zero in
    # case a bound ever lands on the wrong side of the predicted quantity.
    yerr_lower = (pred_df['Predicted Qty'] - pred_df['Lower']).clip(lower=0)
    yerr_upper = (pred_df['Upper'] - pred_df['Predicted Qty']).clip(lower=0)
    yerr = [yerr_lower.values, yerr_upper.values]

    bars = ax.bar(range(len(pred_df)), pred_df['Predicted Qty'], color=bar_colors,
                  yerr=yerr, capsize=3, error_kw={'elinewidth': 1, 'capthick': 1, 'color': '#555555'})

    ax.set_xticks(range(len(pred_df)))
    ax.set_xticklabels(pred_df['Dish'], rotation=45, ha='right', fontsize=8)
    ax.set_ylabel('Predicted Quantity (plates)')
    ax.set_title(f'Predicted Sales per Dish ‚Äî {forecast_date} (with Confidence Intervals)')

    # Place each label just above the top of its error bar.
    for bar, qty, lo, hi in zip(bars, pred_df['Predicted Qty'], pred_df['Lower'], pred_df['Upper']):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + max(hi - qty, 0) + 1,
                f"{qty} [{lo}-{hi}]", ha='center', va='bottom', fontsize=7, fontweight='bold')

    legend_patches = [Patch(color=c, label=m) for m, c in colors_map.items()]
    ax.legend(handles=legend_patches, title='Model Used')

    plt.tight_layout()
    plt.show()