In [6]:
import warnings
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Suppress warnings for a cleaner output.
# NOTE(review): 'ignore' with no category silences *every* warning, including
# ones raised by the libraries used below; narrow the filter if that matters.
warnings.filterwarnings('ignore')

print("--- Starting Model Training ---")

# --- 1. Load Data ---
# Relative path: resolved against the current working directory.
DATA_FILE = "../data/processed/feature_engineered_data.csv"
try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    print(f"Error: Data file not found at {DATA_FILE}")
    print("Please run src/features/build_features.py first.")
    # Stop with a non-zero status so callers can tell the run failed.
    # The bare exit() helper is injected by the `site` module for interactive
    # use, is not guaranteed to exist, and would report success (status 0).
    raise SystemExit(1)

print(f"Loaded feature-engineered data. Shape: {df.shape}")

# --- 2. Define Features (X) and Target (y) ---

# Name of the label column the classifier learns to predict.
target = 'IsPodium'

# Columns that must never reach the model: each one is either the label
# itself or information the comments below describe as an outcome of the race.
non_feature_cols = [
    'Year',          # Not a useful feature since it's all 2024
    'RacePos',       # This is the answer!
    'Points',        # This is also the answer
    'Laps',          # A result of the race
    'FinishedRace',  # A result of the race
    'IsRaceWinner',  # This is another target
    'IsPodium'       # This is our current target
]

# Keep every remaining column, in the order it appears in the DataFrame.
is_excluded = df.columns.isin(non_feature_cols)
features = df.columns[~is_excluded].tolist()

X, y = df[features], df[target]

print(f"Target (y): {target}")
print(f"Number of Features (X): {len(features)}")


# --- 3. Split Data (Time-Based Split) ---
# For F1, we MUST split by time. We train on the past to predict the future.
# A random split would let the model "see" the future, which is unrealistic.
#
# TEST_FROM_ROUND is the single source of truth for the boundary:
#   training rows: RoundNumber <  TEST_FROM_ROUND   (rounds 1-16)
#   testing rows:  RoundNumber >= TEST_FROM_ROUND   (rounds 17-24)
# NOTE(review): earlier comments described the split as "train 1-17 /
# test 18-24 (Singapore to Abu Dhabi)", which is not what the code does.
# The executed split is kept here; bump TEST_FROM_ROUND to 18 if the
# other boundary was the intended one.
TEST_FROM_ROUND = 17                 # First round held out for testing (inclusive)
TRAIN_UNTIL_ROUND = TEST_FROM_ROUND  # Exclusive upper bound of the training rounds

# Build each boolean mask once and reuse it for both X and y so the two
# always stay aligned. Rows whose RoundNumber is missing match neither mask.
round_number = df['RoundNumber']
train_mask = round_number < TEST_FROM_ROUND
test_mask = round_number >= TEST_FROM_ROUND

# Create the training dataset
X_train, y_train = X[train_mask], y[train_mask]

# Create the testing dataset
X_test, y_test = X[test_mask], y[test_mask]

# Fail early with a readable message instead of a cryptic error later on.
if X_train.empty or X_test.empty:
    raise ValueError(
        f"Time-based split at round {TEST_FROM_ROUND} produced an empty set "
        f"(train rows: {X_train.shape[0]}, test rows: {X_test.shape[0]})."
    )

print(f"Training on {X_train.shape[0]} driver results (Rounds 1-{TRAIN_UNTIL_ROUND-1})")
print(f"Testing on {X_test.shape[0]} driver results (Rounds {TEST_FROM_ROUND}-24)")


# --- 4. Train the Model ---
print("\nTraining RandomForestClassifier...")

# Hyper-parameters gathered in one place so they are easy to read and tweak.
forest_params = {
    "n_estimators": 100,         # number of trees in the forest
    "random_state": 42,          # fixed seed for reproducible results
    "class_weight": "balanced",  # re-weight classes; podiums (1) are the rarer class
}
model = RandomForestClassifier(**forest_params)

# Fit on the training rows only; the test rows stay unseen until evaluation.
model.fit(X_train, y_train)

print("Model training complete.")


# --- 5. Evaluate the Model ---
print("\n--- Model Evaluation on Test Data ---")

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Headline metric: fraction of test rows classified correctly.
overall_accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {overall_accuracy:.4f}")

# Per-class precision, recall and F1-score.
print("\nClassification Report:")
print(" (1 = Podium, 0 = No Podium)\n")
class_names = ['No Podium (0)', 'Podium (1)']
report_text = classification_report(y_test, y_pred, target_names=class_names)
print(report_text)


# --- 6. Save the Model ---
# Persist the fitted model so it can be reloaded later without retraining.
# The path is relative to the working directory, like DATA_FILE, instead of
# an absolute path into one user's home folder, so it works on any checkout.
# NOTE(review): assumes the working directory sits one level below the
# project root (as DATA_FILE's "../data/..." already implies) - confirm.
MODEL_PATH = "../src/models/podium_model.pkl"

# Create the target directory if it is missing so the dump cannot fail on it.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model, MODEL_PATH)

print(f"\nModel saved to {MODEL_PATH}")
print("--- Script Finished. ---")

--- Starting Model Training ---
Loaded feature-engineered data. Shape: (479, 28)
Target (y): IsPodium
Number of Features (X): 21
Training on 319 driver results (Rounds 1-16)
Testing on 160 driver results (Rounds 17-24)

Training RandomForestClassifier...
Model training complete.

--- Model Evaluation on Test Data ---
Overall Accuracy: 0.8750

Classification Report:
 (1 = Podium, 0 = No Podium)

               precision    recall  f1-score   support

No Podium (0)       0.93      0.93      0.93       136
   Podium (1)       0.58      0.58      0.58        24

     accuracy                           0.88       160
    macro avg       0.75      0.75      0.75       160
 weighted avg       0.88      0.88      0.88       160


Model saved to /Users/axelreich/Library/CloudStorage/OneDrive-FloridaStateUniversity/Semester8/DataMining/f1-ml-project/src/models/podium_model.pkl
--- Script Finished. ---
