Data Analysis

In [23]:
import pandas as pd

# Store data in a Dataframe
train = pd.read_csv("./raw_data/train.csv")
test = pd.read_csv("./raw_data/test.csv")
submission = pd.read_csv("./raw_data/sample_submission.csv")

In [24]:
# Check for Nulls
pd.concat([
  train.isna().sum().rename("train_nulls"),
  test.isna().sum().rename("test_nulls")
], axis=1)

Unnamed: 0,train_nulls,test_nulls
Unique ID,0,0.0
Rider_ID,0,0.0
category_x,0,0.0
Circuit_Length_km,0,0.0
Laps,0,0.0
Grid_Position,0,0.0
Avg_Speed_kmh,0,0.0
Track_Condition,0,0.0
Humidity_%,0,0.0
Tire_Compound_Front,0,0.0


In [25]:
# Feature Operations
def add_features(df, is_train=False):
    # Features using retained columns
    df["Speed_vs_Grid"] = df["Avg_Speed_kmh"] / (df["Grid_Position"] + 1)
    df["Ambient_vs_Track_Temp"] = df["Ambient_Temperature_Celsius"] / (df["Track_Temperature_Celsius"] + 1)
    df["Humidity_Normalized"] = df["Humidity_%"] / 100
    df["Finish_Rate"] = df["finishes"] / (df["starts"] + 1)
    df["Podium_Rate"] = df["podiums"] / (df["starts"] + 1)
    df["Win_Rate"] = df["wins"] / (df["starts"] + 1)

    if is_train:
        # Only apply to training data (Lap_Time_Seconds available)
        df["Lap_Duration_per_km"] = df["Lap_Time_Seconds"] / (df["Circuit_Length_km"] + 1e-6)
        df["Speed_vs_Avg"] = df["Avg_Speed_kmh"] / (df["Lap_Time_Seconds"] + 1e-6)

        # Removed Grid_vs_Position and Position_Rank due to dropped columns
        # df["Grid_vs_Position"] = df["Grid_Position"] - df["position"]
        # df["Position_Rank"] = df["Championship_Position"] / (df["Championship_Points"] + 1)

    return df


train = add_features(train, is_train=True)
test = add_features(test, is_train=False)


In [26]:
print(train.columns)
print(len(train.columns))

Index(['Unique ID', 'Rider_ID', 'category_x', 'Circuit_Length_km', 'Laps',
       'Grid_Position', 'Avg_Speed_kmh', 'Track_Condition', 'Humidity_%',
       'Tire_Compound_Front', 'Tire_Compound_Rear', 'Penalty',
       'Championship_Points', 'Championship_Position', 'Session', 'year_x',
       'sequence', 'rider', 'team', 'bike', 'position', 'points', 'shortname',
       'circuit_name', 'rider_name', 'team_name', 'bike_name',
       'Lap_Time_Seconds', 'Corners_per_Lap',
       'Tire_Degradation_Factor_per_Lap', 'Pit_Stop_Duration_Seconds',
       'Ambient_Temperature_Celsius', 'Track_Temperature_Celsius', 'weather',
       'track', 'air', 'ground', 'starts', 'finishes', 'with_points',
       'podiums', 'wins', 'min_year', 'max_year', 'years_active',
       'Speed_vs_Grid', 'Ambient_vs_Track_Temp', 'Humidity_Normalized',
       'Finish_Rate', 'Podium_Rate', 'Win_Rate', 'Lap_Duration_per_km',
       'Speed_vs_Avg'],
      dtype='object')
53


In [39]:
from sklearn.preprocessing import LabelEncoder

label_cols = ["rider", "team", "bike", "track", "weather", "Session"]

for col in label_cols:
    if col in train.columns and col in test.columns:
        le = LabelEncoder()
        
        # Combine values BEFORE any filtering/splitting
        combined_vals = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined_vals)

        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))
    else:
        print(f"Skipping {col} — column not found.")

In [40]:
# Train Data
X = train.drop(columns=["Lap_Time_Seconds"])
y = train["Lap_Time_Seconds"]

# Test data
X_test = test.copy()

# Align test data with train data based on columns
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

In [None]:
# Label encode remaining columns
from sklearn.preprocessing import LabelEncoder

for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        all_vals = pd.concat([X[col], X_test[col]], axis=0).astype(str)
        le.fit(all_vals)
        X[col] = le.transform(X[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))


Model Training

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split

#Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#Train model
model = LGBMRegressor(
    objective='regression',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6415
[LightGBM] [Info] Number of data points in the train set: 1531244, number of used features: 52
[LightGBM] [Info] Start training from score 90.001982
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.452573	valid_0's l2: 0.204822
[200]	valid_0's rmse: 0.305747	valid_0's l2: 0.0934815
[300]	valid_0's rmse: 0.264085	valid_0's l2: 0.069741
[400]	valid_0's rmse: 0.244047	valid_0's l2: 0.0595587
[500]	valid_0's rmse: 0.226774	valid_0's l2: 0.0514263
[600]	valid_0's rmse: 0.212854	valid_0's l2: 0.0453067
[700]	valid_0's rmse: 0.203343	valid_0's l2: 0.0413484
[800]	valid_0's rmse: 0.195456	valid_0's l2: 0.0382029
[900]	valid_0's rmse: 0.189693	valid_0's l2: 0.0359835
[1000]	valid_0's rmse: 0.185545	valid

In [43]:
# Feature Importance
importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": model.feature_importances_
}).sort_values(by="importance", ascending=False)

# Display just the column names in order of importance
important_features = importance_df[importance_df["importance"] > 0]["feature"].tolist()
print(important_features)

['Lap_Duration_per_km', 'Circuit_Length_km', 'Speed_vs_Avg', 'Avg_Speed_kmh', 'Finish_Rate', 'Ambient_Temperature_Celsius', 'shortname', 'Ambient_vs_Track_Temp', 'Unique ID', 'rider_name', 'Rider_ID', 'position', 'Track_Temperature_Celsius', 'Pit_Stop_Duration_Seconds', 'points', 'with_points', 'Speed_vs_Grid', 'circuit_name', 'rider', 'Tire_Degradation_Factor_per_Lap', 'starts', 'team_name', 'Championship_Points', 'bike', 'team', 'Corners_per_Lap', 'Humidity_%', 'Session', 'Penalty', 'sequence', 'Championship_Position', 'years_active', 'air', 'bike_name', 'Laps', 'Podium_Rate', 'finishes', 'year_x', 'Tire_Compound_Rear', 'max_year', 'podiums', 'min_year', 'Win_Rate', 'Grid_Position', 'ground', 'weather', 'track', 'Tire_Compound_Front', 'category_x', 'Track_Condition', 'wins']


In [44]:
# Submission
preds = model.predict(X_test)
submission["Lap_Time_Seconds"] = preds
submission.to_csv("submission.csv", index=False)