Data Analysis

In [65]:
import pandas as pd

# Store data in a Dataframe
train = pd.read_csv("./raw_data/train.csv")
test = pd.read_csv("./raw_data/test.csv")
submission = pd.read_csv("./raw_data/sample_submission.csv")

In [66]:
# Check for Nulls
pd.concat([
  train.isna().sum().rename("train_nulls"),
  test.isna().sum().rename("test_nulls")
], axis=1)

Unnamed: 0,train_nulls,test_nulls
Unique ID,0,0.0
Rider_ID,0,0.0
category_x,0,0.0
Circuit_Length_km,0,0.0
Laps,0,0.0
Grid_Position,0,0.0
Avg_Speed_kmh,0,0.0
Track_Condition,0,0.0
Humidity_%,0,0.0
Tire_Compound_Front,0,0.0


In [67]:
# Feature Operations
def add_features(df, is_train=False):
    # Features using retained columns
    df["Speed_vs_Grid"] = df["Avg_Speed_kmh"] / (df["Grid_Position"] + 1)
    df["Ambient_vs_Track_Temp"] = df["Ambient_Temperature_Celsius"] / (df["Track_Temperature_Celsius"] + 1)
    df["Humidity_Normalized"] = df["Humidity_%"] / 100
    df["Finish_Rate"] = df["finishes"] / (df["starts"] + 1)
    df["Podium_Rate"] = df["podiums"] / (df["starts"] + 1)
    df["Win_Rate"] = df["wins"] / (df["starts"] + 1)

    if is_train:
        # Only apply to training data (Lap_Time_Seconds available)
        df["Lap_Duration_per_km"] = df["Lap_Time_Seconds"] / (df["Circuit_Length_km"] + 1e-6)
        df["Speed_vs_Avg"] = df["Avg_Speed_kmh"] / (df["Lap_Time_Seconds"] + 1e-6)

        # Removed Grid_vs_Position and Position_Rank due to dropped columns
        # df["Grid_vs_Position"] = df["Grid_Position"] - df["position"]
        # df["Position_Rank"] = df["Championship_Position"] / (df["Championship_Points"] + 1)

    return df


train = add_features(train, is_train=True)
test = add_features(test, is_train=False)


In [69]:
# Label Encode categorical columns for ML
from sklearn.preprocessing import LabelEncoder

# Step 1: Explicit label columns
label_cols = ["rider", "team", "bike", "track", "weather", "Session"]

# Step 2: Automatically detect any other object columns not already in label_cols
object_cols_train = train.select_dtypes(include='object').columns.tolist()
object_cols_test = test.select_dtypes(include='object').columns.tolist()

# Step 3: Union of explicit + detected object columns (removes duplicates)
all_label_cols = list(set(label_cols + object_cols_train + object_cols_test))

# Step 4: Label encode safely
for col in all_label_cols:
    if col in train.columns and col in test.columns:
        le = LabelEncoder()
        combined_vals = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined_vals)
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))
    else:
        print(f"Skipping '{col}' — not found in both train and test.")


In [70]:
# Train Data
X = train.drop(columns=["Lap_Time_Seconds"])
y = train["Lap_Time_Seconds"]

# Test data
X_test = test.copy()

# Align test data with train data based on columns
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

Model Training

In [72]:
from sklearn.model_selection import train_test_split
from flaml import AutoML
from sklearn.metrics import mean_squared_error
import numpy as np

# Training data split into evaluation and train datasets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# FLAML model
automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="regression",
    metric="rmse",
    time_budget=500, 
    log_file_name="flaml_automl.log"
)

# RMSE on evaluation set
val_preds = automl.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"\n Best validation RMSE (on holdout set): {rmse_val:.4f}")

# RMSE on internal Validation
print(f"\n Best internal FLAML RMSE: {automl.best_loss:.4f}")


[flaml.automl.logger: 06-14 13:31:52] {1752} INFO - task = regression
[flaml.automl.logger: 06-14 13:31:52] {1763} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-14 13:31:52] {1862} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 06-14 13:31:52] {1979} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 06-14 13:31:52] {2282} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-14 13:31:52] {2417} INFO - Estimated sufficient time budget=52506s. Estimated necessary time budget=454s.
[flaml.automl.logger: 06-14 13:31:52] {2466} INFO -  at 8.4s,	estimator lgbm's best error=9.5965,	best estimator lgbm's best error=9.5965
[flaml.automl.logger: 06-14 13:31:52] {2282} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-14 13:31:52] {2466} INFO -  at 8.5s,	estimator lgbm's best error=9.5965,	best estimator lgbm's best error=9.5965
[flaml.automl.logger: 06-1

In [None]:
# Feature Importance
importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": automl.feature_importances_
}).sort_values(by="importance", ascending=False)

# Display column names in order of importance
important_features = importance_df[importance_df["importance"] > 0]["feature"].tolist()
print(important_features)

['Lap_Duration_per_km', 'Circuit_Length_km', 'Speed_vs_Avg', 'Avg_Speed_kmh', 'Ambient_Temperature_Celsius', 'rider_name', 'Ambient_vs_Track_Temp', 'shortname', 'Finish_Rate', 'Pit_Stop_Duration_Seconds', 'Unique ID', 'bike', 'Track_Temperature_Celsius', 'position', 'rider', 'Tire_Degradation_Factor_per_Lap', 'with_points', 'Speed_vs_Grid', 'Rider_ID', 'Corners_per_Lap', 'starts', 'Championship_Points', 'points', 'circuit_name', 'team_name', 'finishes', 'air', 'years_active', 'Championship_Position', 'Podium_Rate', 'sequence', 'Humidity_%', 'bike_name', 'year_x', 'Session', 'podiums', 'team', 'max_year', 'Laps', 'Grid_Position', 'Penalty', 'ground', 'min_year', 'Tire_Compound_Rear', 'Win_Rate', 'weather', 'Tire_Compound_Front', 'track', 'wins', 'Track_Condition', 'category_x']


In [None]:
# Submission
preds = automl.predict(X_test)
submission["Lap_Time_Seconds"] = preds
submission.to_csv("submission.csv", index=False)