In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
import optuna
from lightgbm.callback import early_stopping

# Read the cleaned dataset from disk; the cells below select features and the target from `df`
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [3]:
# Columns that get one-hot encoded before training
categorical_features = [
    'body_type',
    'engine_type',
    'damage_history',
    'fuel_type',
    'isCab',
    'make_name',
    'transmission',
    'wheel_system',
]

# Columns handed to the model unchanged.
# NOTE(review): 'exterior_color' and 'model_name' are passed through as-is here;
# presumably they are already numerically encoded in cleaned_data.csv -- TODO confirm.
numerical_features = [
    'city_fuel_economy',
    'highway_fuel_economy',
    'exterior_color',
    'fuel_tank_volume',
    'horsepower',
    'mileage',
    'model_name',
    'major_options_count',
    'seller_rating',
    'torque',
    'year',
]

# Predictor matrix (every column named in the two lists above) and regression target
X = df[[
    'body_type',
    'city_fuel_economy',
    'engine_type',
    'exterior_color',
    'fuel_tank_volume',
    'fuel_type',
    'highway_fuel_economy',
    'horsepower',
    'isCab',
    'make_name',
    'maximum_seating',
    'mileage',
    'model_name',
    'seller_rating',
    'torque',
    'transmission',
    'wheel_system',
    'year',
    'damage_history',
    'major_options_count',
]]
y = df['price']

# Single-step pipeline: one-hot encode, ignoring categories not seen during fit
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its treatment. Columns named in neither list
# (e.g. 'maximum_seating') are not listed in any transformer.
preprocessor = ColumnTransformer([
    ('cat', categorical_pipeline, categorical_features),
    ('num', 'passthrough', numerical_features),
])

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated RMSE of a LightGBM regressor.

    Reads the module-level ``X``, ``y`` and ``preprocessor``. For each fold the
    preprocessor is fit on the training split only and then applied to the
    validation split, so the encoder never sees validation rows during fit.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``n_estimators``, ``learning_rate``,
        ``num_leaves``, ``max_depth`` and ``min_child_samples``.

    Returns
    -------
    float
        Average RMSE over the 3 validation folds (lower is better).
    """
    # Hyperparameters: fixed settings plus the ranges sampled by this trial
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        # With the early-stopping callback below, this is an upper bound on the
        # number of boosting rounds rather than the number actually trained.
        'n_estimators': trial.suggest_int('n_estimators', 18000, 28000),
        'learning_rate': trial.suggest_float('learning_rate', 0.007, 0.05),
        'num_leaves': trial.suggest_int('num_leaves', 140, 280),
        'max_depth': trial.suggest_int('max_depth', 35, 85),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 12),
    }

    # K-fold cross-validation (no shuffling: folds are contiguous row ranges)
    kf = KFold(n_splits=3)
    rmse_scores = []

    for train_index, val_index in kf.split(X):
        X_train = X.iloc[train_index]
        X_val = X.iloc[val_index]
        y_train = y.iloc[train_index]
        y_val = y.iloc[val_index]

        # Fit the encoder on the training fold only, then reuse it on the validation fold
        X_train_preprocessed = preprocessor.fit_transform(X_train)
        X_val_preprocessed = preprocessor.transform(X_val)

        # NOTE: the validation fold drives early stopping and is also the fold
        # scored below, so the reported RMSE is somewhat optimistic.
        model = lgb.LGBMRegressor(**param)
        model.fit(
            X_train_preprocessed,
            y_train,
            eval_set=[(X_val_preprocessed, y_val)],
            callbacks=[early_stopping(stopping_rounds=100, verbose=False)],
        )
        preds = model.predict(X_val_preprocessed)

        # Take the square root explicitly instead of passing `squared=False`:
        # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, preds)))

    avg_rmse = np.mean(rmse_scores)
    return avg_rmse

# TPESampler is the sampler create_study uses by default for a single objective;
# seeding it makes the search repeatable as long as the objective itself is deterministic.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)  # Number of trials to run

# Print best parameters
print("Best parameters: ", study.best_params)

[I 2023-12-06 07:08:01,988] A new study created in memory with name: no-name-4b0a56a2-1ff0-452c-9218-9ed847b54b2d
[I 2023-12-06 07:12:42,906] Trial 0 finished with value: 3395.618884129109 and parameters: {'n_estimators': 21713, 'learning_rate': 0.043173505635851976, 'num_leaves': 184, 'max_depth': 72, 'min_child_samples': 4}. Best is trial 0 with value: 3395.618884129109.
[I 2023-12-06 07:22:09,177] Trial 1 finished with value: 3400.2750662940484 and parameters: {'n_estimators': 26375, 'learning_rate': 0.016951231218519137, 'num_leaves': 256, 'max_depth': 48, 'min_child_samples': 7}. Best is trial 0 with value: 3395.618884129109.
[I 2023-12-06 07:35:56,323] Trial 2 finished with value: 3399.146282603684 and parameters: {'n_estimators': 22759, 'learning_rate': 0.013415512184863842, 'num_leaves': 239, 'max_depth': 52, 'min_child_samples': 7}. Best is trial 0 with value: 3395.618884129109.
[I 2023-12-06 07:49:05,093] Trial 3 finished with value: 3397.9059903194707 and parameters: {'n_est

Best parameters:  {'n_estimators': 25274, 'learning_rate': 0.021983118499833997, 'num_leaves': 173, 'max_depth': 61, 'min_child_samples': 2}
