# Export Best Models per Section

This notebook trains each canteen section's best-performing model on the
**training set** (first 80 % of the data), evaluates it on the **test set**
(last 20 %), and serialises the artefacts to disk (`deployment_models/`).
Predictive-quality metrics and (for the LSTM) efficiency metrics are printed
and saved alongside the model in the exported artefact.

---
## 1 -- Imports

In [1]:
!pip install pmdarima torchmetrics

Collecting pmdarima
  Downloading pmdarima-2.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.5 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.3-py3-none-any.whl.metadata (5.5 kB)
Downloading pmdarima-2.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (689 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m689.1/689.1 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.3-py3-none-any.whl (31 kB)
Installing collected packages: lightning-utilities, torchmetrics, pmdarima
Successfully installed lightning-utilities-0.15.3 pmdarima-2

In [2]:
import os
import time
import warnings
from datetime import datetime

import joblib
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils

from prophet import Prophet
import pmdarima as pm

# optional -- use torchmetrics for more robust metrics
from torchmetrics import MeanAbsoluteError, MeanSquaredError, R2Score

warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive

# Mount Google Drive so the project folder (data + export directory) is reachable.
drive.mount('/content/drive', force_remount=True)

# Every path used later is relative, so switch into the project root first.
try:
    os.chdir('/content/drive/MyDrive/UAB/FDS/campus-waste-intelligence')
except OSError:
    print("Error: Can't change the Current Working Directory")
else:
    print('Directory changed')

Mounted at /content/drive
Directory changed


---
## 2 -- Configuration & Constants

In [4]:
# Input CSV and export directory; both are relative to the current working directory.
DATA_PATH   = 'data/food_waste_cleaned.csv'
MODEL_DIR   = 'deployment_models'

# Split ratio: the first SPLIT_RATIO * 100 % of rows (chronologically) are the
# training set, the remaining rows are the test set.
SPLIT_RATIO = 0.8

# LSTM settings. LOOKBACK is additionally stored in every exported artefact.
LOOKBACK    = 7      # window length fed to create_sequences / the LSTM
LSTM_HIDDEN = 50     # default hidden_size of LSTMModel
LSTM_LAYERS = 2      # default num_layers of LSTMModel
EPOCHS      = 50     # training epochs in _train_lstm
BATCH_SIZE  = 16     # DataLoader batch size in _train_lstm
LR          = 0.001  # Adam learning rate in _train_lstm

# Create the export directory up front; the save calls below write into it.
os.makedirs(MODEL_DIR, exist_ok=True)

---
## 3 -- Load and Aggregate Data

In [5]:
# Raw cleaned log; 'Date' is parsed so it can serve as a daily DatetimeIndex below.
df = pd.read_csv(DATA_PATH, parse_dates=['Date'])

# Long format: one row per (date, section) holding that day's summed waste.
daily_section = (
    df.groupby(['Date', 'Canteen_Section'], as_index=False)['Waste_Weight_kg']
      .sum()
      .rename(columns={'Waste_Weight_kg': 'Total_Waste_kg'})
)

# Wide format: one column per section on a gap-free daily index.
# Section/date combinations without records and calendar days missing from the
# data both end up as 0 kg (a single fillna after asfreq covers both cases).
daily_wide = (
    daily_section
    .pivot(index='Date', columns='Canteen_Section', values='Total_Waste_kg')
    .sort_index()
    .asfreq('D')
    .fillna(0)
)

sections = list(daily_wide.columns)
print(f'Sections found: {sections}')
print(f'Date range: {daily_wide.index.min()} to {daily_wide.index.max()}')

Sections found: ['A', 'B', 'C', 'D']
Date range: 2025-06-11 00:00:00 to 2025-08-10 00:00:00


---
## 4 -- Feature Engineering

In [6]:
def create_features_for_series(series: pd.Series) -> pd.DataFrame:
    """Derive the supervised-learning table for one section's daily series.

    Parameters
    ----------
    series : pd.Series
        Target values on a ``DatetimeIndex``.

    Returns
    -------
    pd.DataFrame
        Column ``y`` (the target) followed by calendar, lag, rolling-window and
        exponentially-weighted features. Rows containing any NaN (the warm-up
        rows that lack a full lag/rolling history) are dropped.
    """
    idx = series.index
    frame = pd.DataFrame({'y': series.values}, index=idx)

    # Calendar features read straight off the DatetimeIndex.
    for attr in ('dayofweek', 'day', 'month', 'quarter'):
        frame[attr] = getattr(idx, attr)
    frame['weekend'] = (idx.dayofweek >= 5).astype(int)

    # Lagged copies of the target.
    target = frame['y']
    for k in (1, 2, 3, 7, 14):
        frame[f'lag_{k}'] = target.shift(k)

    # Window statistics are computed on yesterday's value and earlier, so the
    # current day's target never leaks into its own features.
    previous = target.shift(1)
    window = previous.rolling(7)
    for stat in ('mean', 'std', 'min', 'max'):
        frame[f'rolling_{stat}_7'] = getattr(window, stat)()
    frame['ewm_mean_7'] = previous.ewm(span=7).mean()

    return frame.dropna()

In [7]:
# One feature table per canteen section.
feature_dfs: dict[str, pd.DataFrame] = {
    sec: create_features_for_series(daily_wide[sec]) for sec in sections
}

# Restrict every section to the date window they all share: the latest first
# date and the earliest last date across sections.
common_start = max(frame.index.min() for frame in feature_dfs.values())
common_end   = min(frame.index.max() for frame in feature_dfs.values())

for sec in sections:
    feature_dfs[sec] = feature_dfs[sec].loc[common_start:common_end]

print(f'Common date range: {common_start.date()} to {common_end.date()}')
print(f'Rows per section:  {len(feature_dfs[sections[0]])}')

Common date range: 2025-06-25 to 2025-08-10
Rows per section:  47


---
## 5 -- Train / Test Split (percentage-based, sequential)

In [8]:
# All sections were aligned to the same window, so the first section's index
# serves as the reference for the chronological split.
ref_index = feature_dfs[sections[0]].index
n_total   = len(ref_index)
n_train   = int(n_total * SPLIT_RATIO)

# Earliest n_train dates form the training set; the remainder is the test set.
train_indices, test_indices = ref_index[:n_train], ref_index[n_train:]

# Boolean masks over ref_index, used to slice each section's feature table.
train_mask = ref_index.isin(train_indices)
test_mask  = ref_index.isin(test_indices)

print(
    f'Total rows: {n_total}',
    f'Train rows: {n_train} ({SPLIT_RATIO:.0%})',
    f'Test rows : {n_total - n_train} ({1 - SPLIT_RATIO:.0%})',
    sep='\n',
)

Total rows: 47
Train rows: 37 (80%)
Test rows : 10 (20%)


---
## 6 -- Best Model per Section (hard-coded)

In [9]:
# Section -> name of the model to train and export for that section.
# The names must match the branches handled by train_eval_export_section.
best_models: dict[str, str] = dict(
    A='XGBoost',
    B='XGBoost',
    C='Random Forest',
    D='XGBoost',
)

best_models

{'A': 'XGBoost', 'B': 'XGBoost', 'C': 'Random Forest', 'D': 'XGBoost'}

---
## 7 -- LSTM Architecture

In [10]:
class LSTMModel(nn.Module):
    """Simple stacked-LSTM regressor.

    The network runs a (possibly multi-layer) LSTM over the input sequence and
    maps the hidden state of the last time step through a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
    """

    def __init__(
        self,
        input_size: int = 1,
        hidden_size: int = LSTM_HIDDEN,
        num_layers: int = LSTM_LAYERS,
        output_size: int = 1,
    ):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc   = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict from a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)`` (the LSTM is
                built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch,)`` when ``output_size == 1``, otherwise
            ``(batch, output_size)``.
        """
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # use only the last time step's hidden state
        # Drop only the trailing feature axis. A bare squeeze() would also drop
        # the batch axis for a single-sample batch, producing a 0-d tensor that
        # no longer matches a target of shape (1,).
        return out.squeeze(-1)

---
## 8 -- Sequence Helper

In [11]:
def create_sequences(
    data: np.ndarray, lookback: int = LOOKBACK
) -> tuple[np.ndarray, np.ndarray]:
    """Turn a 1-D series into supervised (window, next-value) pairs.

    For every position ``t`` from ``lookback`` to ``len(data) - 1`` the window
    ``data[t - lookback:t]`` becomes one input row and ``data[t]`` its target.

    Returns:
        ``(X, y)`` as NumPy arrays; both are empty when ``data`` has no more
        than ``lookback`` elements.
    """
    positions = range(lookback, len(data))
    windows = [data[t - lookback : t] for t in positions]
    targets = [data[t] for t in positions]
    return np.array(windows), np.array(targets)

---
## 9 -- Metrics Helper

In [12]:
def compute_regression_metrics(
    y_true: np.ndarray, y_pred: np.ndarray
) -> dict[str, float]:
    """Return RMSE, MAE, MAPE, and R-squared.

    Args:
        y_true: Observed values (any array-like of numbers).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        Dict with keys ``'RMSE'``, ``'MAE'``, ``'MAPE'`` (in percent) and
        ``'R2'``. MAPE is computed only over entries where ``y_true`` is
        non-zero and is NaN when there are none.
    """
    # Coerce to float arrays so the boolean masking below also works for lists
    # and pandas objects, not just ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)

    # Avoid division by zero in MAPE
    nonzero = y_true != 0
    if nonzero.any():
        rel_err = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
        mape = np.mean(rel_err) * 100
    else:
        mape = np.nan

    r2 = r2_score(y_true, y_pred)

    return {'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2}

---
## 10 -- Per-Model Training Helpers

In [13]:
def _train_tree_model(
    model_name: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
):
    """Fit a tree-ensemble regressor and predict the test features.

    ``'XGBoost'`` selects an ``XGBRegressor``; every other name selects a
    ``RandomForestRegressor``. Both use fixed hyper-parameters and
    ``random_state=42``.

    Returns:
        ``(fitted_model, test_pred)``.
    """
    estimator = (
        xgb.XGBRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42,
        )
        if model_name == 'XGBoost'
        else RandomForestRegressor(
            n_estimators=100, max_depth=10, random_state=42,
        )
    )
    estimator.fit(X_train, y_train)
    return estimator, estimator.predict(X_test)

In [14]:
def _train_svm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
):
    """Fit an RBF-kernel SVR on standardised features.

    The scaler is fitted on the training features only and then applied to
    both splits, so no test-set statistics enter the scaling.

    Returns:
        ``(fitted_svr, test_pred, fitted_scaler)``.
    """
    scaler = StandardScaler().fit(X_train)

    svr = SVR(kernel='rbf', C=100, gamma='scale')
    svr.fit(scaler.transform(X_train), y_train)

    return svr, svr.predict(scaler.transform(X_test)), scaler

In [15]:
def _train_prophet(y_train: pd.Series, test_dates: pd.DatetimeIndex):
    """Fit Prophet (weekly seasonality only) and forecast *test_dates*.

    Args:
        y_train: Training target; its index supplies Prophet's ``ds`` column.
        test_dates: Dates to predict.

    Returns:
        ``(fitted_model, test_pred)`` where ``test_pred`` holds the ``yhat``
        values for ``test_dates``.
    """
    history = pd.DataFrame({'ds': y_train.index, 'y': y_train.values})

    model = Prophet(
        yearly_seasonality=False,
        weekly_seasonality=True,
        daily_seasonality=False,
    )
    model.fit(history)

    # Forecast exactly the requested dates rather than a generated future frame.
    forecast = model.predict(pd.DataFrame({'ds': test_dates}))
    return model, forecast['yhat'].to_numpy()

In [16]:
def _train_sarima(y_train: pd.Series, test_dates: pd.DatetimeIndex):
    """Fit auto-ARIMA with weekly seasonality and forecast for *test_dates*.

    Args:
        y_train: Training target series.
        test_dates: Dates to predict; only their count is used, i.e. the
            forecast covers the ``len(test_dates)`` steps following the
            training data.

    Returns:
        ``(fitted_model, test_pred)`` with ``test_pred`` as a float ndarray.
    """
    model = pm.auto_arima(
        y_train,
        seasonal=True,
        m=7,
        trace=False,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )

    # Only point forecasts are needed, so confidence intervals are not requested.
    forecast = model.predict(n_periods=len(test_dates))
    # The forecast may come back as a Series or a plain ndarray; np.asarray
    # handles both, whereas `.values` exists only on the former.
    test_pred = np.asarray(forecast, dtype=float)
    return model, test_pred

In [17]:
def _train_lstm(
    sec: str,
    y_train: pd.Series,
    y_test: pd.Series,
) -> tuple:
    """Train an LSTM model and return (model, test_pred, efficiency_dict).

    Uses iterative (auto-regressive) prediction on the test set, seeded
    with the last *LOOKBACK* values of the training data: each prediction is
    appended to the input window and the oldest value is dropped.

    Side effect: the model's state dict is written to
    ``{MODEL_DIR}/section_{sec}_lstm.pth``.

    Args:
        sec: Section identifier, used in the state-dict file name.
        y_train: Training target series.
        y_test: Test target series; only its length is used.

    Returns:
        ``(model, test_pred, efficiency)`` where ``efficiency`` contains
        ``param_count``, ``model_size_kb``, ``inference_latency_ms`` and
        ``state_dict_path`` (file name relative to *MODEL_DIR*).

    Raises:
        ValueError: If ``y_train`` has no more than *LOOKBACK* observations,
            so that no training sequence can be formed.
    """
    train_values = y_train.values
    if len(train_values) <= LOOKBACK:
        raise ValueError(
            f'LSTM training needs more than {LOOKBACK} observations, '
            f'got {len(train_values)}'
        )

    # -- Prepare training sequences -------------------------------------------
    X_seq, y_seq = create_sequences(train_values, lookback=LOOKBACK)
    X_tensor = torch.tensor(X_seq, dtype=torch.float32).unsqueeze(-1)
    y_tensor = torch.tensor(y_seq, dtype=torch.float32)

    dataset = data_utils.TensorDataset(X_tensor, y_tensor)
    loader  = data_utils.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # -- Build & train --------------------------------------------------------
    model     = LSTMModel()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)

    for _epoch in range(EPOCHS):
        model.train()
        for xb, yb in loader:
            optimizer.zero_grad()
            # reshape(-1) keeps the prediction 1-D even when the final batch
            # holds a single sample, so it always matches yb's shape.
            loss = criterion(model(xb).reshape(-1), yb)
            loss.backward()
            optimizer.step()

    # -- Iterative test-set prediction ----------------------------------------
    last_train  = train_values[-LOOKBACK:].reshape(1, LOOKBACK, 1)
    current_seq = torch.tensor(last_train, dtype=torch.float32)

    test_pred: list[float] = []
    model.eval()
    with torch.no_grad():
        for _ in range(len(y_test)):
            pred = model(current_seq).item()
            test_pred.append(pred)
            # Slide the window: drop the oldest step, append the new prediction.
            new_val     = torch.tensor([[[pred]]], dtype=torch.float32)
            current_seq = torch.cat([current_seq[:, 1:, :], new_val], dim=1)

    test_pred = np.array(test_pred)

    # -- Efficiency metrics ---------------------------------------------------
    param_count = sum(p.numel() for p in model.parameters())

    state_dict_path = f'{MODEL_DIR}/section_{sec}_lstm.pth'
    torch.save(model.state_dict(), state_dict_path)
    file_size_kb = os.path.getsize(state_dict_path) / 1024

    # Average latency over 100 forward passes. Gradients are disabled so the
    # timing reflects inference rather than autograd bookkeeping, and one
    # untimed warm-up pass absorbs first-call overhead.
    sample_input = torch.randn(1, LOOKBACK, 1)
    model.eval()
    with torch.no_grad():
        model(sample_input)
        t0 = time.perf_counter()
        for _ in range(100):
            model(sample_input)
        latency_ms = (time.perf_counter() - t0) / 100 * 1000

    efficiency = {
        'param_count': param_count,
        'model_size_kb': file_size_kb,
        'inference_latency_ms': latency_ms,
        'state_dict_path': f'section_{sec}_lstm.pth',
    }
    print(
        f'  LSTM efficiency: params={param_count}, '
        f'size={file_size_kb:.1f} KB, latency={latency_ms:.3f} ms'
    )

    return model, test_pred, efficiency

In [18]:
def _train_baseline(
    model_name: str,
    y_train: pd.Series,
    test_dates: pd.DatetimeIndex,
):
    """Return a constant-prediction baseline and a lightweight 'model' dict."""
    n_test = len(test_dates)

    if model_name == 'Naive':
        value = y_train.iloc[-1]
    elif model_name == 'Seasonal Naive':
        value = y_train.iloc[-7] if len(y_train) >= 7 else y_train.iloc[0]
    elif model_name == 'MA(7)':
        value = y_train.iloc[-7:].mean() if len(y_train) >= 7 else y_train.mean()
    else:
        value = 0.0

    test_pred = np.full(n_test, value)

    # Lightweight model artefact for deployment
    model = {
        'last_values': y_train[-14:].tolist(),
        'last_dates': y_train.index[-14:].strftime('%Y-%m-%d').tolist(),
    }
    return model, test_pred

---
## 11 -- Train, Evaluate & Export Dispatcher

In [19]:
def train_eval_export_section(sec: str, model_name: str) -> dict:
    """Fit, score and persist the chosen model for one section.

    The section's feature table is split with the global train/test masks, the
    model named *model_name* is trained on the training rows, evaluated on the
    test rows, and the resulting artefact dict is dumped to
    ``{MODEL_DIR}/section_{sec}.joblib``.

    Returns:
        The test-metric dict (RMSE, MAE, MAPE, R2), or an empty dict when
        *model_name* is not supported (nothing is saved in that case).
    """
    print(f"\n{'-' * 60}")
    print(f'Training {model_name} for section {sec}')

    section_frame = feature_dfs[sec]
    target   = section_frame['y']
    features = section_frame.drop('y', axis=1)

    X_train, X_test = features[train_mask], features[test_mask]
    y_train, y_test = target[train_mask], target[test_mask]

    # Metadata stored next to the model so the artefact is self-describing.
    artifacts: dict = {
        'section': sec,
        'model_name': model_name,
        'feature_columns': features.columns.tolist(),
        'lookback': LOOKBACK,
        'train_date_range': [
            bound.isoformat()
            for bound in (train_indices.min(), train_indices.max())
        ],
        'test_date_range': [
            bound.isoformat()
            for bound in (test_indices.min(), test_indices.max())
        ],
    }

    # ---- Dispatch -----------------------------------------------------------
    # Feature-based models receive X; the time-series models only see y.
    if model_name in ('Naive', 'Seasonal Naive', 'MA(7)'):
        model, test_pred = _train_baseline(model_name, y_train, test_indices)
    elif model_name == 'LSTM':
        model, test_pred, efficiency = _train_lstm(sec, y_train, y_test)
        artifacts.update(efficiency)
    elif model_name == 'SARIMA':
        model, test_pred = _train_sarima(y_train, test_indices)
    elif model_name == 'Prophet':
        model, test_pred = _train_prophet(y_train, test_indices)
    elif model_name == 'SVM':
        model, test_pred, scaler = _train_svm(X_train, y_train, X_test)
        # The fitted scaler is needed again at inference time.
        artifacts['scaler'] = scaler
    elif model_name in ('XGBoost', 'Random Forest'):
        model, test_pred = _train_tree_model(model_name, X_train, y_train, X_test)
    else:
        print(f"  Unsupported model '{model_name}' for section {sec}")
        return {}

    # ---- Metrics & save -----------------------------------------------------
    metrics = compute_regression_metrics(y_test.values, test_pred)
    print(
        f"  Test metrics: RMSE={metrics['RMSE']:.3f}, "
        f"MAE={metrics['MAE']:.3f}, "
        f"MAPE={metrics['MAPE']:.2f}%, "
        f"R2={metrics['R2']:.3f}"
    )

    artifacts.update(test_metrics=metrics, model=model)

    save_path = f'{MODEL_DIR}/section_{sec}.joblib'
    joblib.dump(artifacts, save_path)
    print(f'  Saved -> {save_path}')

    return metrics

---
## 12 -- Run Export Loop and Collect Metrics

In [20]:
# One summary row per successfully exported section.
all_metrics: list[dict] = []

for sec, model_name in best_models.items():
    metrics = train_eval_export_section(sec, model_name)
    if not metrics:
        # Unsupported model name: nothing was trained or saved for this section.
        continue
    all_metrics.append({'Section': sec, 'Model': model_name} | metrics)


------------------------------------------------------------
Training XGBoost for section A
  Test metrics: RMSE=6.981, MAE=5.829, MAPE=23.49%, R2=-0.793
  Saved -> deployment_models/section_A.joblib

------------------------------------------------------------
Training XGBoost for section B
  Test metrics: RMSE=6.195, MAE=5.422, MAPE=22.06%, R2=0.084
  Saved -> deployment_models/section_B.joblib

------------------------------------------------------------
Training Random Forest for section C
  Test metrics: RMSE=4.259, MAE=3.724, MAPE=13.10%, R2=-0.591
  Saved -> deployment_models/section_C.joblib

------------------------------------------------------------
Training XGBoost for section D
  Test metrics: RMSE=6.898, MAE=6.188, MAPE=26.33%, R2=-3.698
  Saved -> deployment_models/section_D.joblib


In [21]:
# Collect the per-section metrics into one table, show it, and persist it as CSV.
summary_df = pd.DataFrame(all_metrics)

print('\n' + '=' * 60, 'SUMMARY OF TEST METRICS', '=' * 60, sep='\n')
print(summary_df.to_string(index=False))

summary_path = f'{MODEL_DIR}/test_metrics_summary.csv'
summary_df.to_csv(summary_path, index=False)
print(f'\nSummary saved to {summary_path}')


SUMMARY OF TEST METRICS
Section         Model     RMSE      MAE      MAPE        R2
      A       XGBoost 6.981413 5.828658 23.493827 -0.793069
      B       XGBoost 6.195208 5.422199 22.059672  0.083807
      C Random Forest 4.258691 3.723980 13.096007 -0.590600
      D       XGBoost 6.897860 6.188219 26.326522 -3.698431

Summary saved to deployment_models/test_metrics_summary.csv


---
## 13 -- Verify Saved Artefacts

In [22]:
# List everything in the export directory, alphabetically, with its size in KB.
saved_files = os.listdir(MODEL_DIR)
saved_files.sort()

print(f"Files in '{MODEL_DIR}/' ({len(saved_files)}):\n")
for f in saved_files:
    size_kb = os.stat(os.path.join(MODEL_DIR, f)).st_size / 1024
    print(f'  {f:40s}  {size_kb:>8.1f} KB')

Files in 'deployment_models/' (5):

  section_A.joblib                             168.1 KB
  section_B.joblib                             154.0 KB
  section_C.joblib                             359.8 KB
  section_D.joblib                             167.3 KB
  test_metrics_summary.csv                       0.4 KB
