Data Analysis

In [74]:
import pandas as pd

# Store data in a Dataframe
train = pd.read_csv("./raw_data/train.csv")
test = pd.read_csv("./raw_data/test.csv")
submission = pd.read_csv("./raw_data/sample_submission.csv")

In [75]:
# Check for Nulls
pd.concat([
  train.isna().sum().rename("train_nulls"),
  test.isna().sum().rename("test_nulls")
], axis=1)

Unnamed: 0,train_nulls,test_nulls
Unique ID,0,0.0
Rider_ID,0,0.0
category_x,0,0.0
Circuit_Length_km,0,0.0
Laps,0,0.0
Grid_Position,0,0.0
Avg_Speed_kmh,0,0.0
Track_Condition,0,0.0
Humidity_%,0,0.0
Tire_Compound_Front,0,0.0


In [76]:
# Feature Operations
def add_features(df, is_train=False):
    # Features using retained columns
    df["Speed_vs_Grid"] = df["Avg_Speed_kmh"] / (df["Grid_Position"] + 1)
    df["Ambient_vs_Track_Temp"] = df["Ambient_Temperature_Celsius"] / (df["Track_Temperature_Celsius"] + 1)
    df["Humidity_Normalized"] = df["Humidity_%"] / 100
    df["Finish_Rate"] = df["finishes"] / (df["starts"] + 1)
    df["Podium_Rate"] = df["podiums"] / (df["starts"] + 1)
    df["Win_Rate"] = df["wins"] / (df["starts"] + 1)

    if is_train:
        # Only apply to training data (Lap_Time_Seconds available)
        df["Lap_Duration_per_km"] = df["Lap_Time_Seconds"] / (df["Circuit_Length_km"] + 1e-6)
        df["Speed_vs_Avg"] = df["Avg_Speed_kmh"] / (df["Lap_Time_Seconds"] + 1e-6)

        # Removed Grid_vs_Position and Position_Rank due to dropped columns
        # df["Grid_vs_Position"] = df["Grid_Position"] - df["position"]
        # df["Position_Rank"] = df["Championship_Position"] / (df["Championship_Points"] + 1)

    return df


train = add_features(train, is_train=True)
test = add_features(test, is_train=False)


In [77]:
print(train.columns)
print(len(train.columns))

Index(['Unique ID', 'Rider_ID', 'category_x', 'Circuit_Length_km', 'Laps',
       'Grid_Position', 'Avg_Speed_kmh', 'Track_Condition', 'Humidity_%',
       'Tire_Compound_Front', 'Tire_Compound_Rear', 'Penalty',
       'Championship_Points', 'Championship_Position', 'Session', 'year_x',
       'sequence', 'rider', 'team', 'bike', 'position', 'points', 'shortname',
       'circuit_name', 'rider_name', 'team_name', 'bike_name',
       'Lap_Time_Seconds', 'Corners_per_Lap',
       'Tire_Degradation_Factor_per_Lap', 'Pit_Stop_Duration_Seconds',
       'Ambient_Temperature_Celsius', 'Track_Temperature_Celsius', 'weather',
       'track', 'air', 'ground', 'starts', 'finishes', 'with_points',
       'podiums', 'wins', 'min_year', 'max_year', 'years_active',
       'Speed_vs_Grid', 'Ambient_vs_Track_Temp', 'Humidity_Normalized',
       'Finish_Rate', 'Podium_Rate', 'Win_Rate', 'Lap_Duration_per_km',
       'Speed_vs_Avg'],
      dtype='object')
53


In [78]:
# Label Encode categorical columns for ML
from sklearn.preprocessing import LabelEncoder

# Step 1: Explicit label columns
label_cols = ["rider", "team", "bike", "track", "weather", "Session"]

# Step 2: Automatically detect any other object columns not already in label_cols
object_cols_train = train.select_dtypes(include='object').columns.tolist()
object_cols_test = test.select_dtypes(include='object').columns.tolist()

# Step 3: Union of explicit + detected object columns (removes duplicates)
all_label_cols = list(set(label_cols + object_cols_train + object_cols_test))

# Step 4: Label encode safely
for col in all_label_cols:
    if col in train.columns and col in test.columns:
        le = LabelEncoder()
        combined_vals = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined_vals)
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))
    else:
        print(f"Skipping '{col}' — not found in both train and test.")


In [79]:
# Train Data
X = train.drop(columns=["Lap_Time_Seconds"])
y = train["Lap_Time_Seconds"]

# Test data
X_test = test.copy()

# Align test data with train data based on columns
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

In [80]:
# Label encode remaining columns of train
from sklearn.preprocessing import LabelEncoder

for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        all_vals = pd.concat([X[col], X_test[col]], axis=0).astype(str)
        le.fit(all_vals)
        X[col] = le.transform(X[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))


Model Training

In [90]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor

# Load best model saved by FLAML
saved_model = joblib.load("./raw_data/flaml_best_model.pkl")
full_params = saved_model.get_params()

# Filter only valid parameters for ExtraTreesRegressor
valid_keys = ExtraTreesRegressor().get_params().keys()
best_params = {k: v for k, v in full_params.items() if k in valid_keys}

# Prepare KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []
best_model = None
best_rmse = float("inf")

# Loop through folds
for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = ExtraTreesRegressor(**best_params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    rmse_scores.append(rmse)

    print(f"Fold {fold} RMSE: {rmse:.4f}")

    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model

# Summary
print("\nAll Fold RMSE Scores:", rmse_scores)
print(f"Best RMSE: {best_rmse:.4f}")
print(f"Mean RMSE: {np.mean(rmse_scores):.4f}")

# Predict with best model
final_preds = best_model.predict(X_test)
submission["Lap_Time_Seconds"] = final_preds
submission.to_csv("submission.csv", index=False)
print("Submission file saved.")


Fold 1 RMSE: 0.0416
Fold 2 RMSE: 0.0410
Fold 3 RMSE: 0.0410
Fold 4 RMSE: 0.0411
Fold 5 RMSE: 0.0413

All Fold RMSE Scores: [0.04158155551104178, 0.040973371755103724, 0.0410467805335971, 0.041089870163295704, 0.04132007196624864]
Best RMSE: 0.0410
Mean RMSE: 0.0412
Submission file saved.


In [94]:
# Feature Importance
importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": best_model.feature_importances_
}).sort_values(by="importance", ascending=False)

# Display column names in order of importance
important_features = importance_df[importance_df["importance"] > 0]["feature"].tolist()
print(important_features)

['Lap_Duration_per_km', 'Circuit_Length_km', 'Speed_vs_Avg', 'Avg_Speed_kmh', 'Humidity_Normalized', 'Session', 'rider', 'team', 'bike_name', 'Championship_Position', 'year_x', 'Championship_Points', 'Penalty', 'Rider_ID', 'Tire_Compound_Front', 'Humidity_%', 'circuit_name', 'Tire_Degradation_Factor_per_Lap', 'Track_Condition', 'finishes', 'weather', 'category_x', 'bike', 'Grid_Position', 'Tire_Compound_Rear', 'Laps', 'rider_name', 'Corners_per_Lap', 'Speed_vs_Grid', 'wins', 'Ambient_vs_Track_Temp', 'points', 'Win_Rate', 'Unique ID', 'position', 'with_points', 'years_active', 'Pit_Stop_Duration_Seconds', 'shortname', 'air', 'Ambient_Temperature_Celsius', 'team_name', 'track', 'sequence', 'Podium_Rate', 'starts', 'ground', 'Track_Temperature_Celsius', 'max_year', 'Finish_Rate', 'podiums', 'min_year']


In [None]:
# Submission
preds = best_model.predict(X_test)
submission["Lap_Time_Seconds"] = preds
submissxion.to_csv("submission.csv", index=False)

In [None]:
# Backup Models for debugging
# import joblib
# joblib.dump(automl.model, "flaml_best_model.pkl")