# Dataset Preparation


### Setup

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import warnings

warnings.filterwarnings('ignore')

seed = 1855

## Data Collection


In [27]:

# get absolute path of the current directory
path = './regression-problems/second-hand-cars/dataset/dataset.csv'

df = pd.read_csv(path)

# Removing the v.id column as it is not useful for the model
df = df.drop('v.id', axis=1)

# Converting the column names to snake case for convenience
df.columns = df.columns.str.lower().str.replace(' ', '_')

df.head()

Unnamed: 0,on_road_old,on_road_now,years,km,rating,condition,economy,top_speed,hp,torque,current_price
0,535651,798186,3,78945,1,2,14,177,73,123,351318.0
1,591911,861056,6,117220,5,9,9,148,74,95,285001.5
2,686990,770762,2,132538,2,8,15,181,53,97,215386.0
3,573999,722381,4,101065,4,3,11,197,54,116,244295.5
4,691388,811335,6,61559,3,9,12,160,53,105,531114.5


## Handling missing values


In [28]:
# Cheching if there are any missing values

df.isnull().sum()

on_road_old      0
on_road_now      0
years            0
km               0
rating           0
condition        0
economy          0
top_speed        0
hp               0
torque           0
current_price    0
dtype: int64

## Encoding categorical features with One-Hot Encoding


In [29]:
# There are no missing values in the dataset

## Encoding binary class label


# Model Building


In [72]:
from sklearn.model_selection import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.neighbors import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.preprocessing import *


## Splitting the dataset into training and testing sets


In [31]:
x = df.drop('current_price', axis=1)
y = df['current_price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

## Evaluation function


In [32]:
def evaluate(expected, predicted):
    mae = mean_absolute_error(expected, predicted)
    mse = mean_squared_error(expected, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(expected, predicted)
    print(f'MAE: {mae:.2f}')
    print(f'MSE: {mse:.2f}')
    print(f'RMSE: {rmse:.2f}')
    print(f'R2: {r2:.2f}')
    

## Model Building


In [35]:
model = LinearRegression()

## Model Evaluation


### Cross Validation

In [52]:
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error']
cross_validation_results = cross_validate(model, x_train, y_train, cv=10, scoring=scoring)

for metric, scores in cross_validation_results.items():
  if 'time' not in metric:
    name  = metric.replace('test_','' ).replace("_", ' ').replace('neg ', '').capitalize()
    print(f'{name}: {-np.mean(scores):.2f} += {np.std(scores):.2f}')
  
# relative_error = rmse/mean
relative_error = -cross_validation_results['test_neg_root_mean_squared_error'] / y_train.mean()
print(f"Relative error: {relative_error.mean():.3f}")

Mean absolute error: 7520.69 += 498.15
Mean squared error: 79942320.21 += 10021825.12
Root mean squared error: 8923.78 += 555.44
Relative error: 0.029


### Cross Validation with KFold

In [53]:
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error']
k_fold = KFold(n_splits=10, shuffle=True, random_state=seed)
cross_validation_results = cross_validate(model, x_train, y_train, cv=k_fold, scoring=scoring)

for metric, scores in cross_validation_results.items():
  if 'time' not in metric:
    name  = metric.replace('test_','' ).replace("_", ' ').replace('neg ', '').capitalize()
    print(f'{name}: {-np.mean(scores):.2f} += {np.std(scores):.2f}')
  
# relative_error = rmse/mean
relative_error = -cross_validation_results['test_neg_root_mean_squared_error'] / y_train.mean()
print(f"Relative error: {relative_error.mean():.3f}")

Mean absolute error: 7516.12 += 621.43
Mean squared error: 79992419.17 += 12915699.40
Root mean squared error: 8916.60 += 697.66
Relative error: 0.029


### Cross Validation with StratifiedKFold

In [55]:
# Cannot use stratified k-fold as the target variable is continuous

## Model Building with Hyperparameter Tuning


In [59]:
model = LinearRegression()
params = {
    'fit_intercept': [True, False],
    'n_jobs': [1, 2, 4, 10, 20]
}
k_fold = KFold(n_splits=10, shuffle=True, random_state=seed)

grid_search = GridSearchCV(model, param_grid=params, cv=k_fold, scoring='neg_root_mean_squared_error')

grid_search.fit(x_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {-grid_search.best_score_:.3f}')

Best parameters: {'fit_intercept': True, 'n_jobs': 1}
Best score: 8916.596



## Model Evaluation


In [60]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
evaluate(y_test, y_pred)

MAE: 6906.27
MSE: 67170456.40
RMSE: 8195.76
R2: 1.00


# Comparing Different Models Performance

## Model Building

In [73]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=seed),
    'KNN': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(random_state=seed),
    'Gradient Boosting': GradientBoostingRegressor(random_state=seed),
    'AdaBoost': AdaBoostRegressor(random_state=seed),
    'Extra Trees': ExtraTreesRegressor(random_state=seed),
    'Support Vector Regressor': SVR()
}

## Model Comparison

In [75]:

results = {}
for name, model in models.items():
  results[name] = -cross_val_score(model, x_train, y_train, cv=k_fold, scoring='neg_mean_absolute_error')
  
results = pd.DataFrame(results).transpose()

results['mean'] = results.mean(axis=1)
results['std'] = results.std(axis=1)
results['relative_error'] = results.mean(axis=1) / y_train.mean()

results = results.sort_values('mean', ascending=True)

results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std,relative_error
Linear Regression,6885.228433,8087.567559,7715.349384,6702.112536,7271.605676,7320.798216,7271.989696,7073.607282,7936.732675,8896.212989,7516.120445,621.434118,0.022325
Gradient Boosting,8729.405813,7685.164361,7854.240657,8972.042012,8263.902121,8218.960454,7817.244384,8731.898742,8478.324851,9218.923857,8397.010725,492.171113,0.024887
Extra Trees,10371.24375,11631.514125,10959.457562,9149.88525,10271.947875,10710.109187,12044.6005,10449.73625,11044.939438,13955.794125,11058.922806,1221.916906,0.03293
Random Forest,13989.930562,15929.236687,16090.19175,13610.482625,13377.478312,14130.301625,14910.692063,14945.538062,14397.45875,17038.822375,14842.013281,1125.750691,0.044057
AdaBoost,21421.9124,23745.614347,24821.600821,16470.947611,21482.676239,20500.736436,24333.785978,18969.757163,23510.156322,23500.696135,21875.788345,2518.648956,0.065167
KNN,24093.28125,25858.9975,20058.7175,20584.10125,18060.3075,21944.9325,21400.4275,18569.4825,24039.3775,25138.0575,21974.76825,2582.620396,0.065476
Decision Tree,22761.7625,20664.88125,25209.53125,24593.4125,22485.15625,26769.8125,21336.25,24367.36875,24438.1875,25993.53125,23861.989375,1887.967054,0.070853
Support Vector Regressor,96215.72624,118726.662215,114162.132132,104008.655387,102588.7105,110937.886476,103930.812743,101772.437702,114373.14349,110637.59006,107735.375695,6692.98679,0.319406
