In [22]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Read the three data splits from the working directory. `train` and `val` are
# fitted/evaluated on later; `test` is the split predicted for the submission.
train, val, test = map(pd.read_csv, ("train.csv", "val.csv", "test.csv"))


In [3]:
# Discard rows whose target (Lap_Time_Seconds) is missing in the two labelled
# splits; `test` is not touched here.
train, val = (
    split.dropna(subset=["Lap_Time_Seconds"]) for split in (train, val)
)

In [4]:
# Stack train, val and test row-wise (in that order) under a fresh 0..n-1 index
# so the cleaning and feature code below runs once over every split. The split
# cell later recovers the three parts purely by position.
full = pd.concat([train, val, test], axis=0, ignore_index=True)

In [7]:
# Columns whose missing values are replaced with the label "Unknown" in the
# next cell. Order is kept exactly as originally declared.
categorical_cols = [
    "category_x",
    "Track_Condition",
    "Tire_Compound_Front",
    "Tire_Compound_Rear",
    "Session",
    "weather",
    "track",
    "rider",
    "team",
    "bike",
    "shortname",
    "circuit_name",
    "team_name",
    "bike_name",
    "Penalty",
]

In [8]:
# Missing values: every column listed in `categorical_cols` gets the literal
# label "Unknown"; afterwards every remaining NaN in the frame becomes -1.
# NOTE(review): the second fill applies to all columns of `full`, so a NaN in
# Lap_Time_Seconds is also turned into -1 — confirm that is intended.
full = full.assign(
    **{col: full[col].fillna("Unknown") for col in categorical_cols}
).fillna(-1)

In [9]:
# Derived features, appended to `full` in the order listed. Every expression
# reads only pre-existing columns, so none depends on another derived feature.
# The "+ 1" in the ratio denominators avoids dividing by a zero count.
# NOTE(review): missing numerics were replaced by -1 before this cell, so a
# missing `starts` / `finishes` / `with_points` / ambient temperature makes the
# "+ 1" denominator exactly 0 — confirm such rows do not occur.
full = full.assign(
    **{
        "experience": full["max_year"] - full["min_year"] + 1,
        "racing_age": full["year_x"] - full["min_year"],
        "win_rate": full["wins"] / (full["starts"] + 1),
        "points_per_finish": full["points"] / (full["finishes"] + 1),
        "podium_ratio": full["podiums"] / (full["with_points"] + 1),
        "temp_diff": (
            full["Track_Temperature_Celsius"] - full["Ambient_Temperature_Celsius"]
        ),
        "humidity_temp_ratio": (
            full["Humidity_%"] / (full["Ambient_Temperature_Celsius"] + 1)
        ),
    }
)

In [10]:
# Separate the target from the feature matrix.
# `y` falls back to None when the target column is absent from `full`.
if "Lap_Time_Seconds" in full:
    y = full["Lap_Time_Seconds"]
else:
    y = None

# Only the target is removed; every other column of `full` stays a feature.
# NOTE(review): that includes the "Unique ID" column if it is present in all
# splits — confirm it is meant to be used as a model input.
X = full.drop(columns=["Lap_Time_Seconds"], errors="ignore")

In [11]:
# Undo the earlier concatenation by position: the first len(train) rows are the
# training split, the next len(val) rows the validation split, the rest test.
train_end = len(train)
val_end = train_end + len(val)

X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
X_val, y_val = X.iloc[train_end:val_end], y.iloc[train_end:val_end]
X_test = X.iloc[val_end:]

In [13]:
# Rebind `categorical_cols` to every object-dtype column of the training
# features, replacing the hand-written list declared earlier.
categorical_cols = list(X_train.select_dtypes(include=["object"]).columns)

In [14]:
# Translate each categorical column name into its position within X_train.
cat_feature_indices = list(map(X_train.columns.get_loc, categorical_cols))

In [15]:
# Wrap the training and validation splits as CatBoost Pools, declaring the
# categorical columns by position so both pools share the same specification.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_feature_indices,
)
val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=cat_feature_indices,
)

In [16]:
# CatBoost regressor trained and monitored on RMSE.
#
# The categorical columns are deliberately NOT passed to the constructor:
# `train_pool` and `val_pool` already declare them through their own
# `cat_features` argument, so the Pools are the single source of truth and the
# model cannot disagree with the data it is fitted on.
model = CatBoostRegressor(
    iterations=2000,            # upper bound on the number of boosting rounds
    learning_rate=0.05,
    depth=8,
    loss_function='RMSE',
    eval_metric='RMSE',
    early_stopping_rounds=100,  # early-stopping patience, measured on the eval set
    verbose=100                 # log a progress line every 100 iterations
)

# Fit on the training pool; the validation pool supplies the metric shown as
# "test" in the training log.
model.fit(train_pool, eval_set=val_pool)

0:	learn: 11.2936426	test: 11.2693010	best: 11.2693010 (0)	total: 3.54s	remaining: 1h 57m 59s
100:	learn: 7.2321567	test: 7.0274787	best: 7.0274787 (100)	total: 3m 58s	remaining: 1h 14m 41s
200:	learn: 6.9796490	test: 6.7693541	best: 6.7693541 (200)	total: 9m 18s	remaining: 1h 23m 20s
300:	learn: 6.7808121	test: 6.5663891	best: 6.5663891 (300)	total: 14m 36s	remaining: 1h 22m 29s
400:	learn: 6.5622207	test: 6.3390866	best: 6.3390866 (400)	total: 19m 42s	remaining: 1h 18m 33s
500:	learn: 6.4131191	test: 6.1857935	best: 6.1857935 (500)	total: 24m 44s	remaining: 1h 14m 1s
600:	learn: 6.2604344	test: 6.0282450	best: 6.0282450 (600)	total: 30m 38s	remaining: 1h 11m 20s
700:	learn: 6.1407891	test: 5.9043278	best: 5.9043278 (700)	total: 36m 52s	remaining: 1h 8m 19s
800:	learn: 6.0295871	test: 5.7889051	best: 5.7889051 (800)	total: 42m 14s	remaining: 1h 3m 13s
900:	learn: 5.9222440	test: 5.6783944	best: 5.6783944 (900)	total: 47m 15s	remaining: 57m 38s
1000:	learn: 5.8287999	test: 5.5820836	be

<catboost.core.CatBoostRegressor at 0x2549d1c6420>

In [17]:
# Score the validation and test feature frames with the fitted model,
# validation first and test second.
pred_val, pred_test = (model.predict(features) for features in (X_val, X_test))

In [18]:
# Raw-model submission: pair each test row's "Unique ID" with its predicted
# lap time and write the table to CSV without the index column.
submission_test = pd.DataFrame(
    data={
        "Unique ID": test["Unique ID"].values,
        "Lap_Time_Seconds": pred_test,
    }
)
submission_test.to_csv("submission_test.csv", index=False)


In [24]:
# Shrink the raw test predictions toward the mean training target:
# 10% mean of y_train + 90% model output.
final_pred = 0.1 * y_train.mean() + 0.9 * pred_test

# Submission table keyed by the test set's "Unique ID".
submission = pd.DataFrame(
    {"Unique ID": test["Unique ID"], "Lap_Time_Seconds": final_pred}
)

# NOTE(review): this writes to the same path as the raw-model submission cell,
# so the earlier file is replaced by the blended one — confirm that is intended.
submission.to_csv("submission_test.csv", index=False)