Data Analysis

In [74]:
import pandas as pd

# Store data in a Dataframe
train = pd.read_csv("./raw_data/train.csv")
test = pd.read_csv("./raw_data/test.csv")
submission = pd.read_csv("./raw_data/sample_submission.csv")

In [75]:
# Check for Nulls
pd.concat([
  train.isna().sum().rename("train_nulls"),
  test.isna().sum().rename("test_nulls")
], axis=1)

Unnamed: 0,train_nulls,test_nulls
Unique ID,0,0.0
Rider_ID,0,0.0
category_x,0,0.0
Circuit_Length_km,0,0.0
Laps,0,0.0
Grid_Position,0,0.0
Avg_Speed_kmh,0,0.0
Track_Condition,0,0.0
Humidity_%,0,0.0
Tire_Compound_Front,0,0.0


In [76]:
# Feature Operations
def add_features(df, is_train=False):
    # Features using retained columns
    df["Speed_vs_Grid"] = df["Avg_Speed_kmh"] / (df["Grid_Position"] + 1)
    df["Ambient_vs_Track_Temp"] = df["Ambient_Temperature_Celsius"] / (df["Track_Temperature_Celsius"] + 1)
    df["Humidity_Normalized"] = df["Humidity_%"] / 100
    df["Finish_Rate"] = df["finishes"] / (df["starts"] + 1)
    df["Podium_Rate"] = df["podiums"] / (df["starts"] + 1)
    df["Win_Rate"] = df["wins"] / (df["starts"] + 1)

    if is_train:
        # Only apply to training data (Lap_Time_Seconds available)
        df["Lap_Duration_per_km"] = df["Lap_Time_Seconds"] / (df["Circuit_Length_km"] + 1e-6)
        df["Speed_vs_Avg"] = df["Avg_Speed_kmh"] / (df["Lap_Time_Seconds"] + 1e-6)

        # Removed Grid_vs_Position and Position_Rank due to dropped columns
        # df["Grid_vs_Position"] = df["Grid_Position"] - df["position"]
        # df["Position_Rank"] = df["Championship_Position"] / (df["Championship_Points"] + 1)

    return df


train = add_features(train, is_train=True)
test = add_features(test, is_train=False)


In [77]:
print(train.columns)
print(len(train.columns))

Index(['Unique ID', 'Rider_ID', 'category_x', 'Circuit_Length_km', 'Laps',
       'Grid_Position', 'Avg_Speed_kmh', 'Track_Condition', 'Humidity_%',
       'Tire_Compound_Front', 'Tire_Compound_Rear', 'Penalty',
       'Championship_Points', 'Championship_Position', 'Session', 'year_x',
       'sequence', 'rider', 'team', 'bike', 'position', 'points', 'shortname',
       'circuit_name', 'rider_name', 'team_name', 'bike_name',
       'Lap_Time_Seconds', 'Corners_per_Lap',
       'Tire_Degradation_Factor_per_Lap', 'Pit_Stop_Duration_Seconds',
       'Ambient_Temperature_Celsius', 'Track_Temperature_Celsius', 'weather',
       'track', 'air', 'ground', 'starts', 'finishes', 'with_points',
       'podiums', 'wins', 'min_year', 'max_year', 'years_active',
       'Speed_vs_Grid', 'Ambient_vs_Track_Temp', 'Humidity_Normalized',
       'Finish_Rate', 'Podium_Rate', 'Win_Rate', 'Lap_Duration_per_km',
       'Speed_vs_Avg'],
      dtype='object')
53


In [78]:
# Label Encode categorical columns for ML
from sklearn.preprocessing import LabelEncoder

# Step 1: Explicit label columns
label_cols = ["rider", "team", "bike", "track", "weather", "Session"]

# Step 2: Automatically detect any other object columns not already in label_cols
object_cols_train = train.select_dtypes(include='object').columns.tolist()
object_cols_test = test.select_dtypes(include='object').columns.tolist()

# Step 3: Union of explicit + detected object columns (removes duplicates)
all_label_cols = list(set(label_cols + object_cols_train + object_cols_test))

# Step 4: Label encode safely
for col in all_label_cols:
    if col in train.columns and col in test.columns:
        le = LabelEncoder()
        combined_vals = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined_vals)
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))
    else:
        print(f"Skipping '{col}' — not found in both train and test.")


In [79]:
# Train Data
X = train.drop(columns=["Lap_Time_Seconds"])
y = train["Lap_Time_Seconds"]

# Test data
X_test = test.copy()

# Align test data with train data based on columns
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

In [80]:
# Label encode remaining columns of train
from sklearn.preprocessing import LabelEncoder

for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        all_vals = pd.concat([X[col], X_test[col]], axis=0).astype(str)
        le.fit(all_vals)
        X[col] = le.transform(X[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))


Model Training

In [83]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

# Define number of folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Optuna objective function
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    rmse_scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMRegressor(**params, n_estimators=1000)

        model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(period=100)
        ]
    )

        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        rmse_scores.append(rmse)

    return np.mean(rmse_scores)

# Run Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50, timeout=600)

# Best RMSE and params
print(f"\n Best CV RMSE from Optuna: {study.best_value:.4f}")
print(f"Best hyperparameters: {study.best_params}")

best_model = lgb.LGBMRegressor(**study.best_params, n_estimators=1000)
best_model.fit(X, y)

[I 2025-06-14 14:09:40,229] A new study created in memory with name: no-name-3b2c9f14-c7cc-47c3-bd24-da01b2090c9b


Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.38464
[200]	valid_0's rmse: 0.324979
[300]	valid_0's rmse: 0.299426
[400]	valid_0's rmse: 0.281417
[500]	valid_0's rmse: 0.268661
[600]	valid_0's rmse: 0.258565
[700]	valid_0's rmse: 0.250079
[800]	valid_0's rmse: 0.242733
[900]	valid_0's rmse: 0.236617
[1000]	valid_0's rmse: 0.230739
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.230739
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.384867
[200]	valid_0's rmse: 0.325883
[300]	valid_0's rmse: 0.302587
[400]	valid_0's rmse: 0.285694
[500]	valid_0's rmse: 0.273053
[600]	valid_0's rmse: 0.261722
[700]	valid_0's rmse: 0.252905
[800]	valid_0's rmse: 0.245477
[900]	valid_0's rmse: 0.238262
[1000]	valid_0's rmse: 0.231267
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.231267
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.381627
[200]

[I 2025-06-14 14:13:18,840] Trial 0 finished with value: 0.23114055971669298 and parameters: {'learning_rate': 0.07584125519384413, 'num_leaves': 54, 'feature_fraction': 0.7558076957604236, 'bagging_fraction': 0.7526226335274139, 'bagging_freq': 10, 'min_child_samples': 8}. Best is trial 0 with value: 0.23114055971669298.


Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.276085
[200]	valid_0's rmse: 0.245909
[300]	valid_0's rmse: 0.229301
[400]	valid_0's rmse: 0.218681
[500]	valid_0's rmse: 0.210125
[600]	valid_0's rmse: 0.202399
[700]	valid_0's rmse: 0.19725
[800]	valid_0's rmse: 0.192807
[900]	valid_0's rmse: 0.187864
[1000]	valid_0's rmse: 0.184262
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.184262
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.26972
[200]	valid_0's rmse: 0.238827
[300]	valid_0's rmse: 0.22396
[400]	valid_0's rmse: 0.213693
[500]	valid_0's rmse: 0.205247
[600]	valid_0's rmse: 0.198233
[700]	valid_0's rmse: 0.193089
[800]	valid_0's rmse: 0.188204
[900]	valid_0's rmse: 0.18446
[1000]	valid_0's rmse: 0.180702
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.180702
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.274959
[200]	va

[I 2025-06-14 14:19:22,607] Trial 1 finished with value: 0.18357265459737324 and parameters: {'learning_rate': 0.09480499051160103, 'num_leaves': 121, 'feature_fraction': 0.6947262252462879, 'bagging_fraction': 0.7737416850357834, 'bagging_freq': 4, 'min_child_samples': 91}. Best is trial 1 with value: 0.18357265459737324.


Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.223257
[200]	valid_0's rmse: 0.199076
[300]	valid_0's rmse: 0.187845
[400]	valid_0's rmse: 0.181882
[500]	valid_0's rmse: 0.176266
[600]	valid_0's rmse: 0.171152
[700]	valid_0's rmse: 0.168111
[800]	valid_0's rmse: 0.164611
[900]	valid_0's rmse: 0.162265
[1000]	valid_0's rmse: 0.159875
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 0.159874
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.220309
[200]	valid_0's rmse: 0.196205
[300]	valid_0's rmse: 0.186118
[400]	valid_0's rmse: 0.179153
[500]	valid_0's rmse: 0.174557
[600]	valid_0's rmse: 0.170578
[700]	valid_0's rmse: 0.166976
[800]	valid_0's rmse: 0.164038
[900]	valid_0's rmse: 0.161206
[1000]	valid_0's rmse: 0.159016
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.159016
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.22155
[200]	

[I 2025-06-14 14:24:01,950] Trial 2 finished with value: 0.1589437393738194 and parameters: {'learning_rate': 0.06890344634946895, 'num_leaves': 107, 'feature_fraction': 0.9528407598310963, 'bagging_fraction': 0.5864513187203058, 'bagging_freq': 5, 'min_child_samples': 96}. Best is trial 2 with value: 0.1589437393738194.



 Best CV RMSE from Optuna: 0.1589
Best hyperparameters: {'learning_rate': 0.06890344634946895, 'num_leaves': 107, 'feature_fraction': 0.9528407598310963, 'bagging_fraction': 0.5864513187203058, 'bagging_freq': 5, 'min_child_samples': 96}


In [84]:
# Feature Importance
importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": best_model.feature_importances_
}).sort_values(by="importance", ascending=False)

# Display column names in order of importance
important_features = importance_df[importance_df["importance"] > 0]["feature"].tolist()
print(important_features)

['Lap_Duration_per_km', 'Circuit_Length_km', 'Speed_vs_Avg', 'Avg_Speed_kmh', 'Pit_Stop_Duration_Seconds', 'Rider_ID', 'Ambient_vs_Track_Temp', 'Championship_Points', 'rider_name', 'Track_Temperature_Celsius', 'Speed_vs_Grid', 'Ambient_Temperature_Celsius', 'Unique ID', 'Tire_Degradation_Factor_per_Lap', 'Humidity_%', 'Finish_Rate', 'rider', 'circuit_name', 'shortname', 'Championship_Position', 'position', 'bike', 'Corners_per_Lap', 'bike_name', 'with_points', 'starts', 'sequence', 'Podium_Rate', 'points', 'finishes', 'Grid_Position', 'max_year', 'year_x', 'team_name', 'Laps', 'Penalty', 'Session', 'team', 'Win_Rate', 'air', 'years_active', 'podiums', 'min_year', 'ground', 'weather', 'category_x', 'Tire_Compound_Front', 'Tire_Compound_Rear', 'Track_Condition', 'wins', 'Humidity_Normalized', 'track']


In [85]:
# Submission
preds = best_model.predict(X_test)
submission["Lap_Time_Seconds"] = preds
submission.to_csv("submission.csv", index=False)

In [None]:
# Backup Models for debugging
# import joblib
# joblib.dump(automl.model, "flaml_best_model.pkl")