In [12]:
import os
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

In [4]:
# Read the `*_prepared.csv` train and test tables from the attached Kaggle input dataset.
# NOTE(review): `test` is not referenced by any cell visible here — presumably it is
# used later for inference; confirm.
train = pd.read_csv('/kaggle/input/car-prepared-data/train_prepared.csv')
test = pd.read_csv('/kaggle/input/car-prepared-data/test_prepared.csv')

In [5]:
# Display five randomly chosen rows of the training table.
# No random_state is passed, so a different set of rows is shown on every run.
train.sample(5)

Unnamed: 0,id,brand,model,model_year,milage,ext_col,int_col,price,hp,l,number_of_cylinders,speeds,fuel_type_Diesel,fuel_type_E85 Flex Fuel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid,fuel_type_Plug-In Hybrid,fuel_type_not supported,fuel_type_–,accident_At least 1 accident or damage reported,accident_None reported,accident_unknown,clean_title_Yes,clean_title_missing,transmission_A/T,transmission_M/T,transmission_unknown
143536,144479,41836.755127,24233.111111,2001,70000,25293,24495,12900,300.0,5.3,8.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
122244,123043,51290.728171,34176.083969,2018,120000,16995,107674,12500,280.0,3.6,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
45713,46036,45659.09478,18174.74359,2010,93500,16995,107674,24900,202.0,2.5,4.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2422,2446,27010.73315,16495.9,2013,111000,48658,4527,9900,283.0,3.6,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
112202,112933,40511.969508,59552.409091,2022,7600,14555,107674,51000,480.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [6]:
# Label column, kept as a one-element list.
target = ['price']
# Columns excluded from the feature matrix: the row identifier followed by the label.
cols2drop = ['id'] + target

In [7]:
# Feature matrix: every column of `train` except the ones listed in cols2drop.
X = train.drop(columns=cols2drop)
# Labels: selected with a list of column names, so y is a one-column DataFrame.
y = train.loc[:, target]

In [17]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Optuna

In [8]:
def fit_catboost(trial, train, val, task_type='GPU', early_stopping_rounds=50):
    """Draw CatBoost hyperparameters from an Optuna trial, fit on one split and predict.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled. Only the
        non-deprecated ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical`` methods are used.
    train : tuple
        ``(X_train, y_train)`` used to fit the model.
    val : tuple
        ``(X_val, y_val)``. Passed to ``fit`` as ``eval_set`` and also the
        data for which predictions are returned.
    task_type : str, default 'GPU'
        Forwarded to ``CatBoostRegressor``; pass ``'CPU'`` to run on a machine
        without a GPU.
    early_stopping_rounds : int, default 50
        Forwarded to ``CatBoostRegressor.fit``.

    Returns
    -------
    tuple
        ``(model, preds)``: the fitted ``CatBoostRegressor`` and its
        predictions on ``X_val``.

    Notes
    -----
    The validation split is given to CatBoost during training
    (``eval_set`` together with ``use_best_model=True`` and early stopping)
    and is then used for the returned predictions, so a score computed from
    ``preds`` is an optimistic estimate rather than a clean hold-out score.
    """
    X_train, y_train = train
    X_val, y_val = val

    # Hyperparameter search space. suggest_float(..., log=True) replaces the
    # deprecated suggest_loguniform, and plain suggest_float replaces
    # suggest_uniform; the sampled ranges are unchanged.
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 0, 10),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Plain', 'Ordered']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'use_best_model': True,
        'eval_metric': 'RMSE'
    }

    # bagging_temperature is only sampled (and only passed on) for the
    # Bayesian bootstrap, so it is a conditional parameter of the search space.
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 1)

    model = CatBoostRegressor(**params,
                              verbose=0,
                              task_type=task_type)

    model.fit(X_train, y_train,
              eval_set=(X_val, y_val),
              early_stopping_rounds=early_stopping_rounds)

    preds = model.predict(X_val)

    return model, preds

In [9]:
# Define objective function for Optuna
def objective(trial, n_splits=3, max_folds=1):
    """Optuna objective: validation RMSE of a CatBoost model on shuffled K-fold splits.

    Reads the module-level ``X`` and ``y`` and delegates fitting to
    ``fit_catboost``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial forwarded to ``fit_catboost`` for hyperparameter sampling.
    n_splits : int, default 3
        Number of folds produced by ``KFold(shuffle=True, random_state=42)``.
    max_folds : int or None, default 1
        How many of those folds are actually fitted and scored. The default
        of 1 evaluates only the first fold, i.e. a single hold-out split of
        roughly ``1 / n_splits`` of the data, which keeps each trial cheap.
        Pass ``None`` to score every fold (full cross-validation).

    Returns
    -------
    float
        Mean RMSE over the evaluated folds (to be minimised).

    Raises
    ------
    ValueError
        If ``max_folds`` is smaller than 1.
    """
    if max_folds is not None and max_folds < 1:
        raise ValueError("max_folds must be >= 1 or None")

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    scores = []

    for fold_idx, (train_index, valid_index) in enumerate(kf.split(X, y)):
        if max_folds is not None and fold_idx >= max_folds:
            break

        train_data = X.iloc[train_index, :], y.iloc[train_index]
        valid_data = X.iloc[valid_index, :], y.iloc[valid_index]

        # The fitted model is not needed here; only the fold predictions are scored.
        _, y_pred = fit_catboost(trial, train_data, valid_data)

        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
        # deprecated/removed in recent scikit-learn, while this form works everywhere.
        # Arguments are passed in the documented (y_true, y_pred) order.
        scores.append(np.sqrt(mean_squared_error(valid_data[1], y_pred)))

    return np.mean(scores)

In [13]:
# Create an Optuna study and optimize.
# TPESampler is the sampler create_study uses by default for a single-objective
# study; it is passed explicitly only to fix its seed, so that the sequence of
# suggested hyperparameters is the same after Restart & Run All.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# n_jobs is left at its default (trials run one after another): with parallel
# trials the order in which results reach the sampler varies, so the seed would
# no longer make the search repeatable.
study.optimize(objective,
               n_trials=1000,
               show_progress_bar=True,
              )

[I 2024-09-23 17:06:35,466] A new study created in memory with name: no-name-18c97a24-1d2e-414e-90a2-00550c941e47


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-09-23 17:06:42,157] Trial 0 finished with value: 32390.011214238824 and parameters: {'iterations': 389, 'depth': 6, 'learning_rate': 0.014104289374333352, 'l2_leaf_reg': 0.00790643211933661, 'border_count': 105, 'random_strength': 6.208216963940034, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.18640563045168257}. Best is trial 0 with value: 32390.011214238824.
[I 2024-09-23 17:06:45,612] Trial 1 finished with value: 32555.06986970048 and parameters: {'iterations': 262, 'depth': 4, 'learning_rate': 0.151592287226039, 'l2_leaf_reg': 3.4025582906841224e-05, 'border_count': 102, 'random_strength': 2.5374990826712382, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 32390.011214238824.
[I 2024-09-23 17:07:14,441] Trial 2 finished with value: 33743.93075835545 and parameters: {'iterations': 920, 'depth': 8, 'learning_rate': 0.0014874990161525387, 'l2_leaf_reg': 2.806627057817159, 'border_count': 149, 'random_stren

In [14]:
# Print the hyperparameter dictionary of the best trial recorded by the study.
print("Best parameters: ", study.best_params)

Best parameters:  {'iterations': 660, 'depth': 10, 'learning_rate': 0.06416865001789653, 'l2_leaf_reg': 2.251271067624928, 'border_count': 231, 'random_strength': 8.164170594970063, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.14102956272973186}


In [15]:
# Render Optuna's slice plot for the finished study (displayed as the cell output).
optuna.visualization.plot_slice(study)

In [None]:
# Best parameters:  
# {
#     'iterations': 660, 
#     'depth': 10, 
#     'learning_rate': 0.06416865001789653, 
#     'l2_leaf_reg': 2.251271067624928, 
#     'border_count': 231, 
#     'random_strength': 8.164170594970063, 
#     'boosting_type': 'Ordered', 
#     'bootstrap_type': 'Bayesian', 
#     'bagging_temperature': 0.14102956272973186}