## Gradient Boost Regression Implementation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")


In [2]:
df = pd.read_csv('cardekho_imputated.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [4]:
df.shape

(15411, 14)

### Data Cleaning

In [5]:
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [6]:
# Dropping car_name and brand as model is the only required one
df.drop(columns = ['car_name', 'brand'], inplace = True, axis = 1)

In [7]:
# Getting All Different Types OF Features
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features: ', len(num_features))

cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features: ', len(cat_features))

Num of Numerical Features:  8
Num of Categorical Features:  4


In [8]:
# Independent and Dependent features
from sklearn.model_selection import train_test_split

X = df.drop(['selling_price'], axis = 1)
Y = df['selling_price']

## Feature Encoding and Scaling

In [9]:
len(df['model'].unique())

120

In [10]:
df['model'].value_counts()

model
i20            906
Swift Dzire    890
Swift          781
Alto           778
City           757
              ... 
Ghibli           1
Altroz           1
GTC4Lusso        1
Aura             1
Gurkha           1
Name: count, Length: 120, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [12]:
X['model'] = le.fit_transform(X['model'])

In [13]:
# Create Column Transformer with 3 types of transformers
numerical_features = X.select_dtypes(exclude = 'object').columns
onehot_columns = ['seller_type','fuel_type','transmission_type']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop = 'first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, numerical_features)
        
    ],remainder='passthrough'
)

In [14]:
X = preprocessor.fit_transform(X)

In [15]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738694,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738516,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738339,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738162,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.737985,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.723327,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.723859,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.724036,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.724213,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [16]:
# Separate dataset into train and test

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((12328, 15), (3083, 15))

## Model Training And Model Selection

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [18]:
# Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [19]:
# Beginning Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boost Regressor": GradientBoostingRegressor()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, Y_train) # Training the model

    # Making predictions
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    # Evaluateing Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(Y_train, Y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(Y_test, Y_test_pred)

    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('=' * 35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 553850.0494
- Mean Absolute Error: 268104.1303
- R2 Score: 0.6218
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 502582.0834
- Mean Absolute Error: 279686.6479
- R2 Score: 0.6645


Lasso
Model performance for Training set
- Root Mean Squared Error: 553850.0538
- Mean Absolute Error: 268101.7491
- R2 Score: 0.6218
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 502581.1494
- Mean Absolute Error: 279682.7929
- R2 Score: 0.6645


Ridge
Model performance for Training set
- Root Mean Squared Error: 553850.6941
- Mean Absolute Error: 268061.4421
- R2 Score: 0.6218
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 502572.3576
- Mean Absolute Error: 279625.1576
- R2 Score: 0.6645


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 335460.8145
- Mean 

## Hyperparameter Tuning

In [20]:
#Initialize few parameter for Hyperparamter tuning


rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

gradient_params = {"loss": ['squared_error', 'huber', 'absolute_error'],
                "criterion": ['friedman_mse', 'squared_error', 'mse'],
                "min_samples_split": [2, 8, 15, 20],
                "n_estimators": [100, 200, 500],
                "max_depth": [5, 8, 15, None, 10],}

In [23]:
# Models list for Hyperparameter tuning

randomcv_models = [("RF", RandomForestRegressor(), rf_params),
                   ("GradientBoost", GradientBoostingRegressor(), gradient_params)]

In [24]:
# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
model_param = {}

for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator = model, param_distributions = params, n_iter = 100, cv = 3, verbose = 2, n_jobs = -1)
    random.fit(X_train, Y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=None, max_features=5, min_samples_split=8, n_estimators=100; total time=   1.0s
[CV] END max_depth=8, max_features=7, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=15, max_features=7, min_samples_split=20, n_estimators=200; total time=   2.1s
[CV] END max_depth=8, max_features=5, min_samples_split=15, n_estimators=1000; total time=   6.3s
[CV] END max_depth=None, max_features=5, min_samples_split=8, n_estimators=500; total time=   4.9s
[CV] END max_depth=8, max_features=5, min_samples_split=8, n_estimators=500; total time=   3.0s
[CV] END max_depth=8, max_features=5, min_samples_split=8, n_estimators=500; total time=   3.1s
[CV] END max_depth=15, max_features=8, min_samples_split=2, n_estimators=500; total time=   7.7s
[CV] END max_depth=None, max_features=5, min_samples_split=2, n_estimators=500; total time=   6.6s


Exception ignored in: <function ResourceTracker.__del__ at 0x104d1dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=10, max_features=auto, min_samples_split=20, n_estimators=1000; total time=   0.0s
[CV] END max_depth=10, max_features=8, min_samples_split=20, n_estimators=100; total time=   1.0s
[CV] END max_depth=8, max_features=auto, min_samples_split=20, n_estimators=1000; total time=   0.0s
[CV] END max_depth=8, max_features=auto, min_samples_split=20, n_estimators=1000; total time=   0.0s
[CV] END max_depth=8, max_features=auto, min_samples_split=20, n_estimators=1000; total time=   0.0s
[CV] END max_depth=8, max_features=5, min_samples_split=15, n_estimators=500; total time=   3.0s
[CV] END max_depth=15, max_features=7, min_samples_split=8, n_estimators=200; total time=   2.5s
[CV] END max_depth=10, max_features=5, min_samples_split=8, n_estimators=1000; total time=   7.3s
[CV] END max_depth=10, max_features=8, min_samples_split=8, n_estimators=100; total time=   1.1s
[CV] END max_depth=15, max_features=7, min_samples_split=2, n_estimators=500; total time=   6.8s
[CV] END ma

Exception ignored in: <function ResourceTracker.__del__ at 0x106949bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=10, max_features=8, min_samples_split=20, n_estimators=500; total time=   5.1s
[CV] END max_depth=None, max_features=auto, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=8, max_features=5, min_samples_split=20, n_estimators=200; total time=   1.2s
[CV] END max_depth=None, max_features=8, min_samples_split=20, n_estimators=500; total time=   6.0s
[CV] END max_depth=10, max_features=8, min_samples_split=2, n_estimators=500; total time=   5.5s
[CV] END max_depth=10, max_features=7, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV] END max_depth=None, max_features=8, min_samples_split=2, n_estimators=500; total time=   9.1s


Exception ignored in: <function ResourceTracker.__del__ at 0x1037d5bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=10, max_features=auto, min_samples_split=20, n_estimators=1000; total time=   0.0s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=200; total time=   0.9s
[CV] END max_depth=5, max_features=5, min_samples_split=8, n_estimators=200; total time=   0.8s
[CV] END max_depth=8, max_features=5, min_samples_split=15, n_estimators=500; total time=   3.0s
[CV] END max_depth=10, max_features=8, min_samples_split=8, n_estimators=200; total time=   2.1s
[CV] END max_depth=10, max_features=5, min_samples_split=8, n_estimators=1000; total time=   7.4s
[CV] END max_depth=None, max_features=7, min_samples_split=15, n_estimators=1000; total time=  11.6s
[CV] END max_depth=10, max_features=5, min_samples_split=15, n_estimators=100; total time=   0.7s
[CV] END max_depth=5, max_features=7, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END max_depth=10, max_features=7, min_samples_split=20, n_estimators=200; total time=   1.9s
[CV] END max_depth=5,

Exception ignored in: <function ResourceTracker.__del__ at 0x106339bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [27]:
# Retraining the models with best parameters
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators = 500, min_samples_split = 2, max_features = 8, max_depth = 15, n_jobs = -1),
    "GradientBoost Regressor":GradientBoostingRegressor(n_estimators = 200,
                                                         min_samples_split = 8, max_depth = 10, loss = 'huber', criterion = 'friedman_mse')
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, Y_train) # Training the model

    # Making predictions
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    # Evaluateing Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(Y_train, Y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(Y_test, Y_test_pred)

    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('=' * 35)
    print('\n')

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 131011.9921
- Mean Absolute Error: 49824.7443
- R2 Score: 0.9788
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 219243.0636
- Mean Absolute Error: 96044.4336
- R2 Score: 0.9361


GradientBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 58721.7064
- Mean Absolute Error: 28099.4391
- R2 Score: 0.9957
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 212040.3658
- Mean Absolute Error: 93339.0072
- R2 Score: 0.9403


