In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib 
import warnings

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

print("--- Starting Model Training ---")

# --- 1. Load Data ---
DATA_FILE = "../data/processed/feature_engineered_data.csv"
try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    print(f"Error: Data file not found at {DATA_FILE}")
    print("Please run src/features/build_features.py first.")
    exit()

print(f"Loaded feature-engineered data. Shape: {df.shape}")

# --- 2. Define Features (X) and Target (y) ---
target = 'IsPodium' 

non_feature_cols = [
    'Year', 'RacePos', 'Points', 'Laps', 
    'FinishedRace', 'IsRaceWinner', 'IsPodium'
]
features = [col for col in df.columns if col not in non_feature_cols]

X = df[features]
y = df[target]

print(f"Target (y): {target}")
print(f"Number of Features (X): {len(features)}")

# --- 3. Split Data (Time-Based Split) ---
TRAIN_UNTIL_ROUND = 18  
TEST_FROM_ROUND = 18    

# Create the training dataset
X_train = X[df['RoundNumber'] < TEST_FROM_ROUND]
y_train = y[df['RoundNumber'] < TEST_FROM_ROUND]

# Create the testing dataset
X_test = X[df['RoundNumber'] >= TEST_FROM_ROUND]
y_test = y[df['RoundNumber'] >= TEST_FROM_ROUND]

# --- Get FullName and RacePos for the report ---
# We need this for our final detailed report
report_df = df[df['RoundNumber'] >= TEST_FROM_ROUND][[
    'RoundNumber', 'FullName', 'RacePos'
]].copy()

print(f"Training on {X_train.shape[0]} driver results (Rounds 1-{TRAIN_UNTIL_ROUND-1})")
print(f"Testing on {X_test.shape[0]} driver results (Rounds {TEST_FROM_ROUND}-24)")

# --- 4. Train the Model ---
print("\nTraining RandomForestClassifier...")
model = RandomForestClassifier(
    n_estimators=100,       
    random_state=42,        
    class_weight='balanced' 
)
model.fit(X_train, y_train)
print("Model training complete.")

# --- 5a. Evaluate the Model (High Level) ---
print("\n--- Model Evaluation on Test Data ---")
y_pred = model.predict(X_test) # The "Yes/No" guess

print(f"Overall Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(" (1 = Podium, 0 = No Podium)\n")
print(classification_report(y_test, y_pred, target_names=['No Podium (0)', 'Podium (1)']))


# --- 5b. Show Detailed Guesses vs. Actuals ---
print("\n--- Detailed Guesses vs. Actual Results ---")

# Get the probabilities (how "confident" the model is)
# We want the probability for class '1' (Podium)
podium_probabilities = model.predict_proba(X_test)[:, 1]

# Add all our info to the report DataFrame
report_df['Actual_Podium'] = y_test.values
report_df['Predicted_Podium'] = y_pred
report_df['Podium_Probability'] = podium_probabilities

# Format for easier reading
report_df['Podium_Probability'] = report_df['Podium_Probability'].apply(lambda x: f"{x*100:.1f}%")
report_df.sort_values(by=['RoundNumber', 'Podium_Probability'], ascending=[True, False], inplace=True)

# Print the final detailed report
print(report_df.to_string(index=False))


# --- 6. Save the Model ---
MODEL_PATH = "models/f1_podium_predictor.joblib" # Use relative path
joblib.dump(model, MODEL_PATH)

print(f"\nModel saved to {MODEL_PATH}")
print("--- Script Finished. ---")

--- Starting Model Training ---
Loaded feature-engineered data. Shape: (479, 28)
Target (y): IsPodium
Number of Features (X): 21


KeyError: "['FullName'] not in index"