In [1]:
# data manipulation
import pickle
import numpy as np
import pandas as pd

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# machine learning models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# evaluation metrics
from tabulate import tabulate
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# hyperparameter tuning
import optuna

# damp warnings
import logging

In [2]:
def load_object(file_name):
    with open(file_name, 'rb') as f:
        data= pickle.load(f)
    return data

In [3]:
train_features= load_object('notebook_artifacts/train_features.pkl')
test_features= load_object('notebook_artifacts/test_features.pkl')
train_target= load_object('notebook_artifacts/train_target.pkl')
test_target= load_object('notebook_artifacts/test_target.pkl')

In [4]:
train_features.shape, train_target.shape, test_features.shape, test_target.shape

((1388331, 7), (1388331,), (347083, 7), (347083,))

# Model Selection

## Making evaluation metrics function

In [5]:
def evaluation_metrics(true, predicted):
    MAE= f'{mean_absolute_error(true, predicted):.3f}'
    RMSE= f'{root_mean_squared_error(true, predicted):.3f}'
    R2= f'{r2_score(true, predicted):.3f}'

    metrics_data= {'Metric': ['MAE', 'RMSE', 'R2'],
                   'Value': [MAE, RMSE, R2]}
    
    metrics_dataframe= pd.DataFrame(metrics_data)
    table= tabulate(metrics_dataframe, headers= 'keys', tablefmt= 'pretty', showindex= False)

    return table

## Making model evaluation function

In [6]:
def evaluate_model(model, model_name, train_features, test_features, train_target, test_target):
    # fit model on train data
    model.fit(train_features, train_target)
    
    # find predictions for train and test data
    train_prediction= model.predict(train_features)
    test_prediction= model.predict(test_features)

    # evalute model performance
    train_metrics= evaluation_metrics(train_target, train_prediction)
    test_metrics= evaluation_metrics(test_target, test_prediction)

    print('===================================================================================')
    print(f'MODEL : {model_name}')
    print('-----------------------------------------------------------------------------------')
    print('TRAIN DATA')
    print(train_metrics)
    print('-----------------------------------------------------------------------------------')
    print('TEST DATA')
    print(test_metrics)

## Initiating model selection

### Listing usable models

In [7]:
models= {'Liner Regression': LinearRegression(),
         'Lasso': Lasso(),
         'Ridge': Ridge(),
         'Random Forest Regressor': RandomForestRegressor(),
         'XGBoost': XGBRegressor(verbosity= 0),
         'LGBM Regressor': LGBMRegressor(verbosity= -1),
         'CatBoost Regressor': CatBoostRegressor(verbose= False),
         'AdaBoost Regressor': AdaBoostRegressor()}

### Using helper functions on listed models

In [16]:
for model_name, model_object in models.items():
    evaluate_model(model_object, model_name, train_features, test_features, train_target, test_target)

## Final Selection

Selecting CatBoost Regressor it has performed the best.

In [21]:
train_data= pd.concat([train_features, train_target], axis=1)

In [22]:
sample_data= train_data.sample(100_000, random_state= 42)

In [30]:
sample_train_data, sample_test_data= sample_data.iloc[:80_000], sample_data.iloc[80_000:]

In [35]:
sample_train_features, sample_train_target= sample_train_data.drop('price', axis= 1), sample_train_data.price
sample_test_features, sample_test_target= sample_test_data.drop('price', axis= 1), sample_test_data.price

In [36]:
def objective(trial):
    # Define hyperparameters to be tuned
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'random_strength': trial.suggest_float('random_strength', 0.1, 1.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 5, 255),
    }
    model = CatBoostRegressor(**params, verbose=0)
    model.fit(sample_train_features, sample_train_target)
    sample_test_predictions = model.predict(sample_test_features)
    # Calculate R² score
    r2 = r2_score(sample_test_target, sample_test_predictions)
    return r2

In [37]:
# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Get the best parameters and best trial
best_parameters = study.best_params
best_trial = study.best_trial

[I 2024-01-29 14:26:13,602] A new study created in memory with name: no-name-4a9489d0-bf87-453d-b562-34f1592ac5ed
[I 2024-01-29 14:26:35,621] Trial 0 finished with value: 0.2404815779801529 and parameters: {'iterations': 1261, 'depth': 9, 'learning_rate': 0.15798954853496702, 'random_strength': 0.8656391741011038, 'bagging_temperature': 0.45129965703781083, 'border_count': 235}. Best is trial 0 with value: 0.2404815779801529.
[I 2024-01-29 14:26:50,465] Trial 1 finished with value: 0.16185730367854134 and parameters: {'iterations': 1931, 'depth': 4, 'learning_rate': 0.036690410686796085, 'random_strength': 0.13565626810616047, 'bagging_temperature': 0.3334997318374676, 'border_count': 34}. Best is trial 0 with value: 0.2404815779801529.
[I 2024-01-29 14:27:14,323] Trial 2 finished with value: 0.2263538791252634 and parameters: {'iterations': 1449, 'depth': 9, 'learning_rate': 0.06454850975095652, 'random_strength': 0.3532553598889775, 'bagging_temperature': 0.5009439326884276, 'border_

Best Parameters: {'iterations': 1682, 'depth': 4, 'learning_rate': 0.17464278943281045, 'random_strength': 0.5187807297310542, 'bagging_temperature': 0.9983467903906863, 'border_count': 229}


In [47]:
# Train the final model on the entire training dataset using the best parameters
print(f"Best Parameters: {best_parameters}")

Best Parameters: {'iterations': 1682, 'depth': 4, 'learning_rate': 0.17464278943281045, 'random_strength': 0.5187807297310542, 'bagging_temperature': 0.9983467903906863, 'border_count': 229}


In [45]:
selected_parameters= {'iterations': 1682, 
                      'depth': 4, 
                      'learning_rate': 0.17464278943281045, 
                      'random_strength': 0.5187807297310542, 
                      'bagging_temperature': 0.9983467903906863, 
                      'border_count': 229}

In [46]:
final_model = CatBoostRegressor(**selected_parameters, verbose=0)
final_model.fit(train_features, train_target)

test_predictions = final_model.predict(test_features)
# Calculate R² on the test set
final_r2 = r2_score(test_target, test_predictions)
print(f"Final R² on Test Set: {final_r2}")

Final R² on Test Set: 0.2857355265550754
