In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from feature_eng import preprocess

### Load the dataset

In [2]:
# Read the raw training data from the project-relative CSV
dataset = pd.read_csv(filepath_or_buffer='data/train.csv')

### Preprocess the dataset

In [3]:
# Run the project's preprocessing (feature_eng.preprocess) and rebind `dataset` to its result.
# NOTE(review): this cell is self-referential — re-running it on its own feeds the
# already-preprocessed frame back into preprocess; re-run the load cell first.
dataset = preprocess(dataset)

### Split the dataset

In [4]:
# Target is the 'lapTime_ms' column; every other column is a feature
y = dataset['lapTime_ms']
X = dataset.drop(columns='lapTime_ms')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Names of the columns to be treated as categorical features
cat_features = [
    'date',
    'time',
    'circuit',
    'driver',
    'constructor',
    'pitStop',
    'last_lap',
    'first_lap',
    'type',
    'direction',
    'location',
    'country',
]

### Define the model

In [6]:
# Base CatBoost regressor: RMSE objective, fixed seed, and the categorical
# column names handed over via `cat_features`
model = CatBoostRegressor(
    loss_function='RMSE',
    random_seed=42,
    cat_features=cat_features,
)

### Find optimal hyperparameters

In [7]:
# Candidate hyperparameter values for the search
param_grid = dict(
    iterations=[500, 1000, 1500],       # number of trees
    learning_rate=[0.01, 0.05, 0.1],    # step size of each iteration
    depth=[6, 8, 10],                   # depth of each tree
)

In [9]:
# Randomized hyperparameter search: 5 sampled candidates, 2-fold CV each,
# ranked by negative RMSE, with a fixed seed for the sampling
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    scoring='neg_root_mean_squared_error',
    n_iter=5,
    cv=2,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# The extra `verbose=200` is passed along with the fit call (training progress
# is logged every 200 iterations in the recorded output)
random_search.fit(X_train, y_train, verbose=200)

print("Best parameters found: ", random_search.best_params_)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
0:	learn: 61408.2707073	total: 171ms	remaining: 4m 16s
200:	learn: 10981.7324550	total: 7.67s	remaining: 49.6s
400:	learn: 7068.2757277	total: 15.1s	remaining: 41.3s
600:	learn: 5801.9597306	total: 22.4s	remaining: 33.5s
800:	learn: 5226.8610609	total: 29.9s	remaining: 26.1s
1000:	learn: 4871.0829771	total: 37.4s	remaining: 18.6s
1200:	learn: 4630.0447979	total: 44.7s	remaining: 11.1s
1400:	learn: 4447.5481905	total: 52.1s	remaining: 3.68s
1499:	learn: 4374.3132618	total: 55.8s	remaining: 0us
Best parameters found:  {'learning_rate': 0.1, 'iterations': 1500, 'depth': 6}


### Save the model

In [10]:
# Rebind `model` to the best estimator found by the search
model = random_search.best_estimator_

# Make sure the output directory exists so saving also works on a fresh
# checkout where 'models/' has not been created yet
os.makedirs('models', exist_ok=True)
model.save_model('models/cb_model.cbm')