# Model Training & Evaluation

**Reference:** Paper 1 — OpenFPL (Groos, 2025)

Every model run is automatically logged to `outputs/results/experiment_runs.jsonl`.
At the end, `tracker.summary()` shows the full comparison table across all runs.

## 1. Setup

In [None]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

from src.preprocessing import TIER1_FEATURES, TIER2_FEATURES
from src.evaluation import ExperimentTracker

# Plot defaults applied to every figure drawn further down
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# A single ExperimentTracker instance is shared by all cells in this session
tracker = ExperimentTracker()

print('Setup complete')

## 2. Load Data

In [None]:
# Directory holding the pre-split CSVs. The path is relative, so it resolves
# against the notebook's current working directory.
data_dir = Path('../data/processed/tier2_2022-23_to_2023-24')

# Feature matrices and the `total_points` target for each split
X_train = pd.read_csv(data_dir / 'X_train.csv')
X_test  = pd.read_csv(data_dir / 'X_test.csv')
y_train = pd.read_csv(data_dir / 'y_train.csv')['total_points']
y_test  = pd.read_csv(data_dir / 'y_test.csv')['total_points']

# Full DataFrames for position / category grouping
train_full = pd.read_csv(data_dir / 'train_full.csv')
test_full  = pd.read_csv(data_dir / 'test_full.csv')

# Every model below is scored row-by-row against y_test with positions taken
# from test_full, so a row-count mismatch would silently mislabel the
# per-position metrics. Fail here instead, with a message naming the culprit.
for split_name, n_features, n_target in (
    ('train', len(X_train), len(y_train)),
    ('test', len(X_test), len(y_test)),
):
    if n_features != n_target:
        raise ValueError(
            f'{split_name} split is misaligned: X has {n_features:,} rows '
            f'but y has {n_target:,}'
        )
if len(test_full) != len(X_test):
    raise ValueError(
        f'test_full has {len(test_full):,} rows but X_test has {len(X_test):,}; '
        'position labels would not line up with the predictions'
    )

# Positions array — relies on test_full sharing X_test / y_test row order
# (only the row counts are verified above)
test_positions = test_full['position_label'].values

# Available feature columns by tier
tier1_cols = [f for f in TIER1_FEATURES if f in X_train.columns]
tier2_cols = list(X_train.columns)   # already Tier 2

print(f'Train  : {X_train.shape[0]:,} samples')
print(f'Test   : {X_test.shape[0]:,} samples')
print(f'Tier 1 : {len(tier1_cols)} features')
print(f'Tier 2 : {len(tier2_cols)} features')

## 3. Baseline Models

### 3.1 Naive Baseline — Last 5 Average

In [None]:
# Nothing is fitted here: the `form_last_5` column of X_test is used directly
# as the prediction and logged like any other run.
run_config = {
    'model': 'Naive',
    'features': 'form_last_5 only',
    'n_features': 1,
    'train_seasons': '2022-23',
    'test_season': '2023-24',
}
naive_preds = X_test['form_last_5'].values

tracker.log(
    name='Naive: Last 5 Avg',
    y_true=y_test,
    y_pred=naive_preds,
    positions=test_positions,
    config=run_config,
)

### 3.2 Linear Regression — Tier 1

In [None]:
# Scaling statistics come from the training split only and are then applied
# unchanged to both splits (Tier 1 columns only).
scaler = StandardScaler().fit(X_train[tier1_cols])
X_tr_t1 = scaler.transform(X_train[tier1_cols])
X_te_t1 = scaler.transform(X_test[tier1_cols])

# Ordinary least squares on the standardised Tier 1 features
lr = LinearRegression().fit(X_tr_t1, y_train)
lr_preds = lr.predict(X_te_t1)

run_config = {
    'model': 'LinearRegression',
    'features': 'Tier 1',
    'n_features': len(tier1_cols),
    'train_seasons': '2022-23',
    'test_season': '2023-24',
    'params': {'fit_intercept': True, 'scaling': 'StandardScaler'}
}
tracker.log(
    name='LinearRegression Tier1',
    y_true=y_test,
    y_pred=lr_preds,
    positions=test_positions,
    config=run_config,
)

## 4. XGBoost — Experiment with Params

In [None]:
# --- XGBoost run 1: 100 trees, depth 5, lr 0.1, no row/column subsampling ---
xgb_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'random_state': 42,
    'n_jobs': -1,
}

# Fit on every column of X_train; fit() hands back the fitted estimator
xgb1 = XGBRegressor(**xgb_params).fit(X_train, y_train)
preds = xgb1.predict(X_test)

# The exact hyper-parameters are stored with the run so it can be reproduced
run_config = {
    'model': 'XGBoost',
    'features': 'Tier 2',
    'n_features': len(tier2_cols),
    'train_seasons': '2022-23',
    'test_season': '2023-24',
    'params': xgb_params,
}
tracker.log(
    name='XGBoost n100 d5 lr0.1',
    y_true=y_test,
    y_pred=preds,
    positions=test_positions,
    config=run_config,
)

In [None]:
# --- XGBoost run 2: 300 trees at half the learning rate, 85% row/column sampling ---
xgb_params2 = {
    'n_estimators': 300,
    'max_depth': 5,
    'learning_rate': 0.05,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'random_state': 42,
    'n_jobs': -1,
}

# Fit on every column of X_train; fit() hands back the fitted estimator
xgb2 = XGBRegressor(**xgb_params2).fit(X_train, y_train)
preds2 = xgb2.predict(X_test)

# The exact hyper-parameters are stored with the run so it can be reproduced
run_config = {
    'model': 'XGBoost',
    'features': 'Tier 2',
    'n_features': len(tier2_cols),
    'train_seasons': '2022-23',
    'test_season': '2023-24',
    'params': xgb_params2,
}
tracker.log(
    name='XGBoost n300 d5 lr0.05',
    y_true=y_test,
    y_pred=preds2,
    positions=test_positions,
    config=run_config,
)

In [None]:
# --- XGBoost run 3: same budget as run 2 but depth 7 and 70% column sampling ---
xgb_params3 = {
    'n_estimators': 300,
    'max_depth': 7,
    'learning_rate': 0.05,
    'subsample': 0.85,
    'colsample_bytree': 0.7,
    'random_state': 42,
    'n_jobs': -1,
}

# Fit on every column of X_train; fit() hands back the fitted estimator
xgb3 = XGBRegressor(**xgb_params3).fit(X_train, y_train)
preds3 = xgb3.predict(X_test)

# The exact hyper-parameters are stored with the run so it can be reproduced
run_config = {
    'model': 'XGBoost',
    'features': 'Tier 2',
    'n_features': len(tier2_cols),
    'train_seasons': '2022-23',
    'test_season': '2023-24',
    'params': xgb_params3,
}
tracker.log(
    name='XGBoost n300 d7 lr0.05',
    y_true=y_test,
    y_pred=preds3,
    positions=test_positions,
    config=run_config,
)

## 5. Random Forest — Experiment with Params

In [None]:
# --- Random Forest run 1: 100 unrestricted-depth trees, sqrt feature sampling ---
rf_params = {
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'random_state': 42,
    'n_jobs': -1,
}

# Fit on every column of X_train; fit() hands back the fitted estimator
rf1 = RandomForestRegressor(**rf_params).fit(X_train, y_train)
rf_preds1 = rf1.predict(X_test)

# The exact hyper-parameters are stored with the run so it can be reproduced
run_config = {
    'model': 'RandomForest',
    'features': 'Tier 2',
    'n_features': len(tier2_cols),
    'train_seasons': '2022-23',
    'test_season': '2023-24',
    'params': rf_params,
}
tracker.log(
    name='RF n100 depth=None',
    y_true=y_test,
    y_pred=rf_preds1,
    positions=test_positions,
    config=run_config,
)

In [None]:
# --- Random Forest run 2: 300 trees, capped at depth 10 with at least 3 samples per leaf ---
rf_params2 = {
    'n_estimators': 300,
    'max_depth': 10,
    'min_samples_leaf': 3,
    'max_features': 'sqrt',
    'random_state': 42,
    'n_jobs': -1,
}

# Fit on every column of X_train; fit() hands back the fitted estimator
rf2 = RandomForestRegressor(**rf_params2).fit(X_train, y_train)
rf_preds2 = rf2.predict(X_test)

# The exact hyper-parameters are stored with the run so it can be reproduced
run_config = {
    'model': 'RandomForest',
    'features': 'Tier 2',
    'n_features': len(tier2_cols),
    'train_seasons': '2022-23',
    'test_season': '2023-24',
    'params': rf_params2,
}
tracker.log(
    name='RF n300 depth=10 leaf=3',
    y_true=y_test,
    y_pred=rf_preds2,
    positions=test_positions,
    config=run_config,
)

## 6. Results — Full Run Table

In [None]:
# RMSE table: the tracker's comparison of the logged runs, RMSE view.
# Left as the cell's last expression so any returned table is rendered as output.
tracker.summary('rmse')

In [None]:
# MAE table: the tracker's comparison of the logged runs, MAE view.
# Left as the cell's last expression so any returned table is rendered as output.
tracker.summary('mae')

In [None]:
# Look up the run the tracker ranks best on 'overall_mae' and report its id,
# name and both overall metrics on two lines.
best = tracker.best_run('overall_mae')
print(
    f"Best run by MAE: #{int(best['run_id'])} '{best['name']}'",
    f"  overall_rmse={best['overall_rmse']:.4f}  overall_mae={best['overall_mae']:.4f}",
    sep='\n',
)

In [None]:
# Paper 1 benchmarks for reference.
# Both rows come from Paper 1: `paper1` is its last-5 baseline and `paper2` is
# its OpenFPL model. Only per-category figures are entered; the overall
# columns are left as None.
paper1 = {
    'run_id': '—',
    'name': 'Paper1: Last5 Baseline',
    'overall_rmse': None,
    'overall_mae': None,
    'Zeros_rmse': 0.791,
    'Blanks_rmse': 1.400,
    'Tickers_rmse': 2.136,
    'Haulers_rmse': 5.613,
    'Zeros_mae': 0.270,
    'Blanks_mae': 0.652,
    'Tickers_mae': 1.645,
    'Haulers_mae': 4.709,
}
paper2 = {
    'run_id': '—',
    'name': 'Paper1: OpenFPL',
    'overall_rmse': None,
    'overall_mae': None,
    'Zeros_rmse': 0.818,
    'Blanks_rmse': 1.291,
    'Tickers_rmse': 1.517,
    'Haulers_rmse': 5.142,
    'Zeros_mae': 0.427,
    'Blanks_mae': 0.749,
    'Tickers_mae': 1.127,
    'Haulers_mae': 4.317,
}

paper_df = pd.DataFrame([paper1, paper2])
all_runs = tracker.load_runs()

cat_rmse_cols = ['name', 'overall_rmse', 'Zeros_rmse', 'Blanks_rmse', 'Tickers_rmse', 'Haulers_rmse']
cat_mae_cols  = ['name', 'overall_mae',  'Zeros_mae',  'Blanks_mae',  'Tickers_mae',  'Haulers_mae']


def _stack_with_paper(columns):
    """Return the paper rows followed by every logged run, restricted to `columns`."""
    return pd.concat([paper_df[columns], all_runs[columns]], ignore_index=True)


def _fmt4(x):
    """Format a float with four decimal places for the printed tables."""
    return f'{x:.4f}'


combined_rmse = _stack_with_paper(cat_rmse_cols)
combined_mae  = _stack_with_paper(cat_mae_cols)

print('RMSE BY RETURN CATEGORY (our runs + Paper 1)')
print(combined_rmse.to_string(index=False, float_format=_fmt4))

print('\nMAE BY RETURN CATEGORY (our runs + Paper 1)')
print(combined_mae.to_string(index=False, float_format=_fmt4))

## 7. Visualisations

In [None]:
# Per-category RMSE (left panel) and MAE (right panel): every logged run
# drawn as a line, with the two Paper 1 curves in black for reference.
categories = ['Zeros', 'Blanks', 'Tickers', 'Haulers']
all_runs = tracker.load_runs()

# (source dict, matplotlib format string, legend label) for each reference curve
paper_curves = (
    (paper1, 'k--o', 'Paper1: Last5'),
    (paper2, 'k-^', 'Paper1: OpenFPL'),
)

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

for ax, metric in zip(axes, ('rmse', 'mae')):
    cols = [f'{c}_{metric}' for c in categories]

    # Reference curves go on first, with a raised zorder
    for bench, line_fmt, legend_label in paper_curves:
        ax.plot(categories, [bench[c] for c in cols], line_fmt, lw=2, label=legend_label, zorder=5)

    # One line per logged run; row.get yields None for a column the run lacks
    for _, row in all_runs.iterrows():
        ax.plot(categories, [row.get(c) for c in cols], '-o', lw=1.5, ms=6, label=row['name'])

    ax.set_title(f'{metric.upper()} by Return Category', fontweight='bold', fontsize=13)
    ax.set_ylabel(metric.upper())
    ax.legend(fontsize=8, loc='upper left')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Overall RMSE / MAE comparison bar chart: one horizontal bar per logged run,
# annotated with its value, RMSE on the left panel and MAE on the right.
all_runs = tracker.load_runs()

# Bars are placed at explicit integer positions instead of by name. The run log
# accumulates across sessions, so names repeat; a string-valued axis would put
# repeated names on the same row while the value labels (positioned by index)
# would drift onto other rows.
run_names = all_runs['name'].astype(str).tolist()
y_pos = np.arange(len(run_names))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, metric in zip(axes, ['overall_rmse', 'overall_mae']):
    values = all_runs[metric].to_numpy(dtype=float)

    ax.barh(y_pos, values, color='steelblue', alpha=0.8, edgecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(run_names)
    ax.set_title(metric.replace('_', ' ').upper(), fontweight='bold')
    ax.set_xlabel(metric.split('_')[1].upper())
    ax.invert_yaxis()   # first logged run at the top
    ax.grid(axis='x', alpha=0.3)

    for pos, v in zip(y_pos, values):
        # A run with no value for this metric has no bar to annotate
        if np.isfinite(v):
            ax.text(v + 0.005, pos, f'{v:.4f}', va='center', fontsize=8)

plt.suptitle('All Runs — Overall Metrics', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Feature importance for best XGBoost run (by MAE).
# Reload the log so this cell does not depend on an earlier cell's `all_runs`.
all_runs = tracker.load_runs()
xgb_runs = all_runs[all_runs['model'] == 'XGBoost']
if xgb_runs.empty:
    raise ValueError('No XGBoost runs have been logged yet — run section 4 first.')

best_xgb_idx = xgb_runs['overall_mae'].idxmin()
best_xgb_name = all_runs.loc[best_xgb_idx, 'name']
print(f"Best XGBoost run: #{int(all_runs.loc[best_xgb_idx, 'run_id'])} '{best_xgb_name}'")

# Only models fitted in this kernel session can be plotted, so map each logged
# run name to the estimator fitted under that name in section 4 (re-run that
# section after a kernel restart).
session_xgb_models = {
    'XGBoost n100 d5 lr0.1': xgb1,
    'XGBoost n300 d5 lr0.05': xgb2,
    'XGBoost n300 d7 lr0.05': xgb3,
}
if best_xgb_name in session_xgb_models:
    plotted_name = best_xgb_name
else:
    # The best logged run has a name not fitted in this notebook (e.g. logged
    # by an earlier, edited session); fall back to xgb2 and say so explicitly
    # rather than presenting it as the best run.
    plotted_name = 'XGBoost n300 d5 lr0.05'
    print(f"Note: '{best_xgb_name}' was not trained in this session; "
          f"plotting '{plotted_name}' instead.")
best_xgb = session_xgb_models[plotted_name]

# Ascending sort puts the most important feature at the top of the barh chart
imp = pd.Series(best_xgb.feature_importances_, index=tier2_cols).sort_values()
fig, ax = plt.subplots(figsize=(9, 8))
imp.plot(kind='barh', ax=ax, color='orange', alpha=0.8, edgecolor='black')
ax.set_title(f'XGBoost Feature Importance ({plotted_name})', fontweight='bold')
ax.set_xlabel('Importance')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 8. Load All Historical Runs

Runs accumulate across notebook sessions. Run this cell any time to see everything.

In [None]:
# Module-level loader from src.evaluation, called without going through `tracker`
from src.evaluation import load_runs

# Everything logged so far (runs accumulate across notebook sessions)
all_runs_df = load_runs()
print(f'Total runs logged so far: {len(all_runs_df)}')
# Bare expression on the last line so the notebook renders the full table
all_runs_df

In [None]:
# Clean pivot: runs × metrics (useful for sharing / exporting), RMSE view.
# Kept in `pivot` so it can be reused, then shown as the cell output.
pivot = tracker.comparison_table('rmse')
pivot

In [None]:
# Uncomment to reset and start fresh (deletes all logged runs!)
# tracker.clear()