# Baseline Models for F1 Race Prediction

This notebook trains and evaluates baseline models:
- Dummy baselines (mean, median, grid position)
- Historical mean lookup
- Linear Regression
- Ridge Regression
- Random Forest

Goal: Establish baseline performance before trying advanced models.

In [None]:
import sys
import warnings
from pathlib import Path

# Make the parent directory importable so the `src` package below resolves
sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

from src.models import (
    set_random_seeds,
    load_modeling_data,
    verify_data_integrity,
    train_dummy_baseline,
    train_mean_baseline,
    train_linear_regression,
    train_ridge_regression,
    train_random_forest,
    create_results_summary,
    save_models
)

# Notebook-wide display settings: silence every Python warning, switch seaborn
# to its white-grid style, and widen the default matplotlib figure.
warnings.filterwarnings(action='ignore')
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Reproducibility: seed through the project helper before any model is trained
set_random_seeds(42)

## Load and Verify Data

In [None]:
# Fetch the six train/validation/test objects from the project loader in one go
splits = tuple(load_modeling_data())
X_train, X_val, X_test, y_train, y_val, y_test = splits

# Pass the same six objects, in the same order, to the integrity check
verify_data_integrity(*splits)

In [None]:
# Quick peek at features: list at most the first `preview_count` column names
preview_count = 20
feature_names = X_train.columns.tolist()

print(f"\nFeature columns ({len(feature_names)} total):")
print(feature_names[:preview_count])  # First 20
if len(feature_names) > preview_count:
    # Only signal truncation when some columns were actually left out
    print("...")

## 1. Dummy Baselines

In [None]:
# Pass the training and validation splits to the dummy-baseline helper
# (presumably the mean / median / grid-position predictors listed in the
# introduction — confirm in src.models).
# NOTE(review): `dummy_results` is never added to `all_results` in the Results
# Summary cell, so these baselines are absent from the comparison table and the
# saved CSV — confirm whether that is intentional.
dummy_results = train_dummy_baseline(X_train, y_train, X_val, y_val)

## 2. Historical Mean Baseline

In [None]:
# Pass the training and validation splits to the historical-mean baseline
# helper (the "Historical mean lookup" named in the introduction).
# NOTE(review): `mean_results` is not read by any later cell in this notebook,
# so this baseline does not appear in the comparison table — confirm whether
# that is intentional.
mean_results = train_mean_baseline(X_train, y_train, X_val, y_val)

## 3. Linear Regression

In [None]:
# Train the linear-regression baseline from the training and validation splits.
# Later cells read two entries of the returned mapping:
#   lr_results['feature_importance'] - table with `feature` and `coefficient` columns
#   lr_results['model']              - object whose `.predict` is called on X_val
lr_results = train_linear_regression(X_train, y_train, X_val, y_val)

In [None]:
# Horizontal bar chart of the first 20 rows of the linear-regression
# coefficient table, one bar per feature.
lr_top = lr_results['feature_importance'].head(20)
bar_positions = range(len(lr_top))

# Negative coefficients are drawn in red, all others in green
bar_colors = lr_top['coefficient'].map(
    lambda coef: 'red' if coef < 0 else 'green'
).tolist()

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, lr_top['coefficient'], color=bar_colors, alpha=0.7)
ax.set_yticks(bar_positions)
ax.set_yticklabels(lr_top['feature'])
ax.set_xlabel('Coefficient Value')
ax.set_title('Linear Regression: Top 20 Feature Coefficients')
# Dashed vertical line marks a coefficient of zero
ax.axvline(0, color='black', linestyle='--', linewidth=0.8)
plt.tight_layout()
plt.show()

In [None]:
# Diagnostic plots for the linear-regression model, all on the validation split.
# Only validation predictions are computed; nothing in this notebook uses
# training-set predictions.
y_pred_val = lr_results['model'].predict(X_val)
residuals_val = y_val - y_pred_val  # residual = actual minus predicted

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Predicted vs Actual (dashed red diagonal = perfect prediction)
axes[0, 0].scatter(y_pred_val, y_val, alpha=0.5)
axes[0, 0].plot([0, 20], [0, 20], 'r--', lw=2)
axes[0, 0].set_xlabel('Predicted Position')
axes[0, 0].set_ylabel('Actual Position')
axes[0, 0].set_title('Predicted vs Actual')

# Residuals vs Predicted (dashed red line = zero residual)
axes[0, 1].scatter(y_pred_val, residuals_val, alpha=0.5)
axes[0, 1].axhline(0, color='r', linestyle='--', lw=2)
axes[0, 1].set_xlabel('Predicted Position')
axes[0, 1].set_ylabel('Residuals')
axes[0, 1].set_title('Residuals vs Predicted')

# Residuals histogram
axes[1, 0].hist(residuals_val, bins=30, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Residual')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Residuals Distribution')

# Q-Q plot of the residuals against a normal distribution
# (`stats` is scipy.stats, imported with the other dependencies in the first cell)
stats.probplot(residuals_val, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot')

plt.tight_layout()
plt.show()

## 4. Ridge Regression

In [None]:
# Train the Ridge-regression baseline from the training and validation splits.
# Later cells read these entries of the returned mapping:
#   ridge_results['cv_results']         - needs `param_alpha` and `mean_test_score`
#   ridge_results['best_params']['alpha']
#   ridge_results['feature_importance'] - table with `feature` and `coefficient` columns
ridge_results = train_ridge_regression(X_train, y_train, X_val, y_val)

In [None]:
# Plot the cross-validation score for every alpha that was tried
ridge_cv = ridge_results['cv_results']
alpha_grid = ridge_cv['param_alpha'].data
cv_mae = -ridge_cv['mean_test_score']  # Convert back to positive MAE
best_alpha = ridge_results['best_params']['alpha']

fig, ax = plt.subplots(figsize=(10, 6))
ax.semilogx(alpha_grid, cv_mae, marker='o', linewidth=2, markersize=8)
ax.set_xlabel('Alpha (Regularization Strength)')
ax.set_ylabel('Cross-Validation MAE')
ax.set_title('Ridge Regression: Hyperparameter Tuning')
ax.grid(True, alpha=0.3)
# Dashed red line marks the alpha reported under `best_params`
ax.axvline(best_alpha, color='r', linestyle='--', label=f'Best: α={best_alpha}')
ax.legend()
plt.show()

In [None]:
# Compare Linear vs Ridge coefficients on one shared set of features
lr_coef = lr_results['feature_importance'].set_index('feature')['coefficient']
ridge_coef = ridge_results['feature_importance'].set_index('feature')['coefficient']

# Take the first 20 rows of the linear table (the same rows the linear
# coefficient plot shows) and look up the Ridge value for each of them.
# Building the frame straight from the two full Series would align them on the
# union of their indexes, which pandas sorts whenever the two orders differ, so
# a following .head(20) would pick the alphabetically first features instead.
lr_top = lr_coef.head(20)
comparison = pd.DataFrame({
    'Linear': lr_top,
    'Ridge': ridge_coef.reindex(lr_top.index)  # NaN if Ridge lacks a feature
})

comparison.plot(kind='barh', figsize=(10, 8), alpha=0.7)
plt.xlabel('Coefficient Value')
plt.title('Linear vs Ridge: Top 20 Coefficients')
plt.axvline(0, color='black', linestyle='--', linewidth=0.8)
plt.tight_layout()
plt.show()

## 5. Random Forest

In [None]:
# Train the random-forest baseline from the training and validation splits.
# Later cells read two entries of the returned mapping:
#   rf_results['feature_importance'] - table with `feature` and `importance` columns
#   rf_results['model']              - object whose `.predict` is called on X_val
rf_results = train_random_forest(X_train, y_train, X_val, y_val)

In [None]:
# Feature importance visualization: top-20 bar chart (left) and cumulative
# importance curve (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Top features bar plot
top_features = rf_results['feature_importance'].head(20)
axes[0].barh(range(len(top_features)), top_features['importance'], alpha=0.7, color='forestgreen')
axes[0].set_yticks(range(len(top_features)))
axes[0].set_yticklabels(top_features['feature'])
axes[0].set_xlabel('Importance')
axes[0].set_title('Random Forest: Top 20 Features')

# Cumulative importance, in the row order of the importance table
# (assumes the table is sorted by descending importance — TODO confirm in src.models)
cumsum = rf_results['feature_importance']['importance'].cumsum()
# Count features from 1 so the k-th point sits at x = k, matching the axis label
feature_counts = range(1, len(cumsum) + 1)
axes[1].plot(feature_counts, cumsum, linewidth=2)
axes[1].axhline(0.90, color='r', linestyle='--', label='90% threshold')
axes[1].axhline(0.95, color='orange', linestyle='--', label='95% threshold')
axes[1].set_xlabel('Number of Features')
axes[1].set_ylabel('Cumulative Importance')
axes[1].set_title('Cumulative Feature Importance')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Results Summary

In [None]:
# Collect the three fitted-model result mappings under their display names.
# NOTE(review): `dummy_results` and `mean_results` are not included here, so
# the table below compares only these three models — confirm that is intended.
model_names = ('Linear Regression', 'Ridge Regression', 'Random Forest')
all_results = dict(zip(model_names, (lr_results, ridge_results, rf_results)))

summary = create_results_summary(all_results)

# Print the summary table framed by 80-character rules
rule = "=" * 80
print("\n" + rule)
print("BASELINE MODELS COMPARISON")
print(rule)
print(summary.to_string(index=False))
print(rule)

In [None]:
# Three side-by-side panels comparing the models listed in `summary`
fig, (ax_mae, ax_r2, ax_cost) = plt.subplots(1, 3, figsize=(16, 5))

# Left panel: MAE per model, first row of the table at the top
ax_mae.barh(summary['Model'], summary['MAE'], alpha=0.7, color='steelblue')
ax_mae.set_xlabel('MAE (positions)')
ax_mae.set_title('Model Comparison: MAE')
ax_mae.invert_yaxis()

# Middle panel: R² per model, same ordering
ax_r2.barh(summary['Model'], summary['R²'], alpha=0.7, color='forestgreen')
ax_r2.set_xlabel('R² Score')
ax_r2.set_title('Model Comparison: R²')
ax_r2.invert_yaxis()

# Right panel: training time against MAE, each point labelled with its model
ax_cost.scatter(summary['Train Time (s)'], summary['MAE'], s=150, alpha=0.6)
labelled_points = zip(summary['Model'], summary['Train Time (s)'], summary['MAE'])
for model_name, train_seconds, mae in labelled_points:
    ax_cost.annotate(model_name, (train_seconds, mae),
                     xytext=(5, 5), textcoords='offset points', fontsize=9)
ax_cost.set_xlabel('Training Time (seconds)')
ax_cost.set_ylabel('MAE (positions)')
ax_cost.set_title('Efficiency: Training Time vs Performance')
ax_cost.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Error Analysis

In [None]:
# Detailed error analysis on the validation split, using the random-forest
# model (this cell treats it as the best model without checking `summary`).
best_model = rf_results['model']
y_pred = best_model.predict(X_val)

# One row per validation sample; signed error is actual minus predicted
signed_error = y_val.values - y_pred
error_df = pd.DataFrame({
    'actual': y_val.values,
    'predicted': y_pred,
    'error': signed_error,
    'abs_error': np.abs(signed_error),
    'grid_position': X_val['GridPosition'].values
})

print("\nError Statistics:")
print(error_df['abs_error'].describe())

# Percentage of predictions whose absolute error is within 1, 3 and 5 positions
within_1, within_3, within_5 = [
    (error_df['abs_error'] <= tolerance).mean() * 100
    for tolerance in (1, 3, 5)
]

print(f"\nPrediction Accuracy:")
print(f"  Within 1 position: {within_1:.1f}%")
print(f"  Within 3 positions: {within_3:.1f}%")
print(f"  Within 5 positions: {within_5:.1f}%")

In [None]:
# The ten validation rows with the largest absolute error
report_columns = ['actual', 'predicted', 'error', 'grid_position']
worst_rows = error_df.nlargest(n=10, columns='abs_error')
print("\nWorst 10 Predictions:")
print(worst_rows[report_columns])

In [None]:
# The ten validation rows with the smallest absolute error
report_columns = ['actual', 'predicted', 'error', 'grid_position']
best_rows = error_df.nsmallest(n=10, columns='abs_error')
print("\nBest 10 Predictions:")
print(best_rows[report_columns])

In [None]:
# Mean absolute error for each starting grid position, ordered by position
error_by_grid = (
    error_df
    .groupby('grid_position')['abs_error']
    .mean()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(error_by_grid.index, error_by_grid.values, alpha=0.7, color='coral')
ax.set_xlabel('Grid Position')
ax.set_ylabel('Average Absolute Error')
ax.set_title('Prediction Error by Grid Position')
ax.set_xticks(range(1, 21))  # one tick for each of positions 1 through 20
ax.grid(True, alpha=0.3, axis='y')
plt.show()

In [None]:
# Actual vs predicted finish position, each point coloured by its grid position
fig, ax = plt.subplots(figsize=(10, 10))
points = ax.scatter(y_pred, y_val, c=X_val['GridPosition'],
                    cmap='RdYlGn_r', alpha=0.6, s=50)

# Black diagonal = perfect prediction; dashed bands sit 1 and 3 positions
# above and below it. Only the upper line of each pair carries a legend label.
ax.plot([0, 20], [0, 20], 'k--', lw=2, label='Perfect prediction')
tolerance_bands = (
    (1, 'r', '±1 position'),
    (3, 'orange', '±3 positions'),
)
for offset, band_color, band_label in tolerance_bands:
    ax.plot([0, 20], [offset, 20 + offset], color=band_color, linestyle='--',
            lw=1, alpha=0.5, label=band_label)
    ax.plot([0, 20], [-offset, 20 - offset], color=band_color, linestyle='--',
            lw=1, alpha=0.5)

fig.colorbar(points, ax=ax, label='Grid Position')
ax.set_xlabel('Predicted Finish Position')
ax.set_ylabel('Actual Finish Position')
ax.set_title('Random Forest: Predicted vs Actual (colored by grid position)')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_xlim(0, 20)
ax.set_ylim(0, 20)
plt.show()

## Save Results

In [None]:
# Make sure the output directory exists before anything is written to it;
# DataFrame.to_csv does not create missing parent directories.
# (Where save_models writes is not visible from this notebook; creating the
# directory first is harmless either way.)
results_dir = Path('../results/models')
results_dir.mkdir(parents=True, exist_ok=True)

# Save models
save_models(all_results)

# Save summary table
summary.to_csv(results_dir / 'baseline_results.csv', index=False)
print("\nResults saved to results/models/")

## Key Findings

Document your observations here, replacing each `X` placeholder and bracketed option below with the values from the run above:

1. **Baseline Performance**: Grid position baseline achieves MAE of ~X positions
2. **Linear Models**: Linear/Ridge regression achieves MAE of ~X positions
3. **Tree Models**: Random Forest achieves MAE of ~X positions
4. **Feature Importance**: GridPosition, circuit stats, and team performance dominate
5. **Error Patterns**: Errors are higher for [front/back] of grid, especially at [circuit type]
6. **Next Steps**: XGBoost and hyperparameter tuning should push below X.X MAE