Data Analysis

In [None]:
import pandas as pd

# Read the three competition CSVs (training set, test set, submission
# template) into DataFrames, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./raw_data/train.csv",
        "./raw_data/test.csv",
        "./raw_data/sample_submission.csv",
    )
)

In [None]:
# Target: the lap time column of the training frame.
y = train["Lap_Time_Seconds"]

# Predictors: every remaining training column, with categorical columns
# expanded into one-hot indicator columns.
X = pd.get_dummies(train.drop(columns=["Lap_Time_Seconds"]))

# Encode a copy of the test frame the same way so `test` itself is untouched.
X_test = pd.get_dummies(test.copy())

# Left-join on the column axis so X_test ends up with exactly X's columns:
# indicator columns seen only in the training data are added to X_test filled
# with 0, and columns seen only in the test data are dropped.
X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

In [17]:
from sklearn.model_selection import train_test_split

# Keep the real feature names so the generic ones can be mapped back later.
original_cols = X.columns.tolist()

# Replace every column name with a positional placeholder (f0, f1, ...).
# NOTE(review): presumably done because some boosting libraries reject feature
# names containing special characters -- confirm.
X.columns = [f"f{position}" for position, _ in enumerate(original_cols)]

# X_test was aligned to X's columns beforehand, so it takes the same labels.
X_test.columns = X.columns

Model Training

In [None]:
import time
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor

# Stacked ensemble: three gradient-boosting regressors (100 boosting rounds
# each, using all available cores) act as base estimators, and a smaller
# LightGBM regressor (50 rounds) combines their predictions.
stacked_model = StackingRegressor(
    estimators=[
        ('xgb', xgb.XGBRegressor(n_estimators=100, tree_method='hist', n_jobs=-1, verbosity=1)),
        ('lgb', lgb.LGBMRegressor(n_estimators=100, n_jobs=-1)),
        ('cat', CatBoostRegressor(iterations=100, thread_count=-1, verbose=0))
    ],
    final_estimator=lgb.LGBMRegressor(n_estimators=50)
)


# ⏱ Training with timer
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the (long-running) fit is in progress.
start = time.perf_counter()
stacked_model.fit(X, y)
elapsed = time.perf_counter() - start
print(f"⏱ Done training in {round(elapsed, 2)} seconds.")

In [None]:
# from sklearn.model_selection import KFold
# from sklearn.metrics import root_mean_squared_error
# import numpy as np

# # Split the training data into train and validation folds to estimate the RMSE.
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# rmse_list = []

# for train_index, val_index in kf.split(X):
#     X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#     y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#     stacked_model.fit(X_train, y_train)
#     preds = stacked_model.predict(X_val)
    
#     rmse = root_mean_squared_error(y_val, preds)
#     rmse_list.append(rmse)

# print("Fold-wise RMSEs:", rmse_list)
# print("Average RMSE:", np.mean(rmse_list))


In [None]:
# Predict lap times for the aligned test features with the fitted ensemble.
lap_time_predictions = stacked_model.predict(X_test)

# Store the predictions in the submission template's target column and write
# the result to disk without the index column.
submission["Lap_Time_Seconds"] = lap_time_predictions
submission.to_csv("solution.csv", index=False)