# Baseline Model
## Goal
Train a logistic regression model to predict podium probability.
Evaluate using ROC-AUC with cross-validation on 2023-2025 data. 

## Input
- aus_gp_features.csv (output from 02_feature_engineering.ipynb)

## Output
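- Trained baseline model (logistic regression), plus a Random Forest comparison model saved to rf_model.pkl with its feature list in feature_columns.pkl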
- ROC-AUC score
- Probability outputs per driver

# Core libraries

In [28]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler 

%matplotlib inline

# Load engineered features

In [19]:
# Read the engineered feature table (the output of 02_feature_engineering.ipynb).
# The path is relative, so the CSV must be in the kernel's working directory.
df = pd.read_csv('aus_gp_features.csv')

# Define features and target

In [20]:
# The model inputs are three columns of the engineered frame; the label is
# the Podium column.
feature_cols = ['GridPosition', 'driver_aus_podium_rate', 'constructor_aus_podium_rate']
target_col = 'Podium'

X = df[feature_cols]
y = df[target_col]

# Quick sanity check of what was selected.
for caption, shape in (("Features shape:", X.shape), ("Target shape:", y.shape)):
    print(caption, shape)
print("\nFeature preview:")
print(X.head())

Features shape: (59, 3)
Target shape: (59,)

Feature preview:
   GridPosition  driver_aus_podium_rate  constructor_aus_podium_rate
0           1.0                0.666667                     0.333333
1           3.0                0.333333                     0.333333
2           4.0                0.333333                     0.166667
3           6.0                0.000000                     0.166667
4          20.0                0.000000                     0.333333


# Scale features - logistic regression performs better when features are on the same scale. GridPosition is 1-20, podium rates are fractions between 0 and 1
# Scaling brings them all to a similar range

In [15]:
# Fit the scaler on X, then standardize X with the fitted scaler.
# Written as two explicit steps; the result matches a single fit_transform call.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Train logistic regression

In [16]:
# Baseline classifier, created unfitted here. Only random_state is set;
# every other hyperparameter is left at the library default.
model = LogisticRegression(random_state=42)

# Evaluate with cross-validation using ROC-AUC
# cv=5 means we split data into 5 folds and train/test 5 times

In [17]:
# Keep the scaler inside a pipeline so that each CV fold fits its scaling
# statistics on that fold's training rows only. Cross-validating on the
# pre-scaled X_scaled would let the held-out rows influence the mean/std used
# to transform the training rows (a mild form of data leakage).
cv_pipeline = make_pipeline(StandardScaler(), model)
scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring='roc_auc')

# Per-fold scores plus their mean and spread. With a dataset this small the
# per-fold values are noisy, so read the mean together with the std.
print("ROC-AUC scores per fold:", scores.round(3))
print("Mean ROC-AUC:", scores.mean().round(3))
print("Std ROC-AUC:", scores.std().round(3))

ROC-AUC scores per fold: [1.    0.875 1.    0.925 1.   ]
Mean ROC-AUC: 0.96
Std ROC-AUC: 0.051


# Train on full dataset now that we've validated performance with cross-validation

In [21]:
# Fit the baseline classifier on every row of the scaled feature matrix.
model.fit(X_scaled, y)

# Get probability outputs for each driver
# predict_proba returns [prob_not_podium, prob_podium] for each row
# we take [:, 1] to get just the podium probability

In [22]:
# These are in-sample scores: the model was fitted on these same rows, so the
# probabilities are descriptive rather than held-out estimates.
class_probabilities = model.predict_proba(X_scaled)
df['podium_probability'] = class_probabilities[:, 1]  # second column = podium class

# View predictions sorted by probability

In [23]:
# Rank rows from highest to lowest podium probability, rounded for readability.
report_columns = ['Abbreviation', 'Year', 'GridPosition', 'podium_probability', 'Podium']
ranked_predictions = df[report_columns].sort_values('podium_probability', ascending=False)
print(ranked_predictions.round(3))

   Abbreviation  Year  GridPosition  podium_probability  Podium
0           VER  2023           1.0               0.838       1
38          VER  2024           1.0               0.838       0
39          NOR  2025           1.0               0.838       1
22          NOR  2024           3.0               0.756       1
40          VER  2025           3.0               0.756       1
20          SAI  2024           2.0               0.504       1
17          RUS  2023           2.0               0.504       0
1           HAM  2023           3.0               0.440       1
41          RUS  2025           4.0               0.378       1
21          LEC  2024           4.0               0.378       1
11          SAI  2023           5.0               0.319       0
2           ALO  2023           4.0               0.302       1
19          LEC  2023           7.0               0.219       0
46          LEC  2025           7.0               0.219       0
36          RUS  2024           7.0     

# Train Random Forest and compare

In [26]:
# The forest is cross-validated on the raw (unscaled) feature frame X.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Same 5-fold ROC-AUC protocol that was used for the logistic regression.
rf_scores = cross_val_score(estimator=rf_model, X=X, y=y, scoring='roc_auc', cv=5)

for caption, value in (
    ("Random Forest ROC-AUC scores per fold:", rf_scores.round(3)),
    ("Mean ROC-AUC:", rf_scores.mean().round(3)),
    ("Std ROC-AUC:", rf_scores.std().round(3)),
):
    print(caption, value)

# Side-by-side summary of both models (mean +/- std across folds).
print("\n--- Comparison ---")
print(f"Logistic Regression: {scores.mean().round(3)} (+/- {scores.std().round(3)})")
print(f"Random Forest:       {rf_scores.mean().round(3)} (+/- {rf_scores.std().round(3)})")

Random Forest ROC-AUC scores per fold: [1.    0.925 1.    0.975 1.   ]
Mean ROC-AUC: 0.98
Std ROC-AUC: 0.029

--- Comparison ---
Logistic Regression: 0.96 (+/- 0.051)
Random Forest:       0.98 (+/- 0.029)


# Save Random Forest model

In [31]:
# Refit the random forest on every row before persisting it.
rf_model.fit(X, y)

# The forest is trained on unscaled X, so only the model and the ordered
# feature names are written out (no scaler is pickled).
feature_names = list(X.columns)
artifacts = (('rf_model.pkl', rf_model), ('feature_columns.pkl', feature_names))
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Model saved successfully!")
print("Features:", feature_names)

Model saved successfully!
Features: ['GridPosition', 'driver_aus_podium_rate', 'constructor_aus_podium_rate']
