## model training 

In [55]:
## importing required packages 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, accuracy_score, r2_score

In [56]:
## reading data

data = pd.read_csv('cleaned_details.csv')

data = data.iloc[:, 1:]
data.drop('Value Configuration', axis=1, inplace=True)
data.head()

Unnamed: 0,ignition_type,kilometers_driven,ownerNo,modelYear,centralVariantId,price,Insurance Validity,Engine Displacement,Color,Engine Type,...,Gear Box,Drive Type,Steering Type,No Door Numbers,Cargo Volumn,Turning Radius,Fuel_type,Body_type,Oem,Location
0,0,20000.0,1,2022,8654,11.5,1,998.0,8,369,...,7.0,12,0,5.0,392.0,5.3,4,7,13,1
1,0,20687.0,1,2015,4025,4.15,0,1196.0,57,278,...,5.0,16,2,5.0,540.0,4.5,4,5,20,1
2,0,30000.0,1,2021,8135,7.5,1,999.0,86,263,...,5.0,12,0,5.0,336.0,5.0,4,7,24,1
3,0,59247.0,1,2015,1579,3.98,0,1086.0,116,265,...,5.0,12,3,5.0,225.0,4.7,4,2,9,1
4,0,50000.0,1,2015,1341,5.5,1,1199.0,86,476,...,5.0,12,3,5.0,354.0,5.1,4,2,8,1


In [57]:
data['ignition_type'].unique()

array([0], dtype=int64)

In [58]:
## train test split

tar = data['price']
val = data.drop('price', axis=1)

In [59]:
train_data, test_data, train_lab, test_lab = train_test_split(val, tar, test_size=0.2, random_state=42)

In [60]:
## model building 

forest = RandomForestRegressor(n_estimators=1000, random_state=65, max_depth=10)

In [61]:
forest.fit(train_data, train_lab)

In [62]:
def performance(actual, predicted):
    print("mean squared error : " ,mean_squared_error(actual, predicted))
    print("mean absolute percentage error : ", mean_absolute_percentage_error(actual, predicted))
    print("r2 score : ", r2_score(actual, predicted))

In [63]:
## testing performance 

# predictions

train_pred = forest.predict(train_data)
test_pred = forest.predict(test_data)

In [64]:
## performance of train

performance(train_lab, train_pred)

mean squared error :  15.353282427220467
mean absolute percentage error :  0.3340102993290817
r2 score :  0.8966806171249272


In [65]:
## performance of test

performance(test_lab, test_pred)

mean squared error :  68.48537031191881
mean absolute percentage error :  0.46757389183044584
r2 score :  0.6601191846355976


## Feature Selection 

In [67]:
## feature_selection - selecting best features

ap = pd.DataFrame({
    'feature':train_data.columns,
    'score': forest.feature_importances_
}).sort_values('score', ascending=False)['feature'].to_list()

ap

['Gear Box',
 'modelYear',
 'Kerb Weight',
 'Width',
 'kilometers_driven',
 'centralVariantId',
 'Engine Displacement',
 'Wheel Base',
 'Turning Radius',
 'Length',
 'Height',
 'Location',
 'Engine Type',
 'Cargo Volumn',
 'ownerNo',
 'Oem',
 'Insurance Validity',
 'Color',
 'Drive Type',
 'Steering Type',
 'Body_type',
 'No of Cylinder',
 'No Door Numbers',
 'Fuel_type',
 'Turbo Charger',
 'ignition_type']

In [68]:
## using only important features 

data_new = data[['Gear Box', 'modelYear', 'Width', 'Kerb Weight', 'Wheel Base', 'Turning Radius', 'Engine Displacement', 'kilometers_driven', 'Length', 'centralVariantId', 'Height', 'Cargo Volumn', 'Engine Type', 'Oem', 'ownerNo', 'Location', 'Body_type', 'Color', 'No of Cylinder', 'price']]

In [69]:
## x and y data selection 

tar_new = data_new['price']
val_new = data_new.drop('price', axis=1)

## train test split

train_data_new, test_data_new, train_lab_new, test_lab_new = train_test_split(val_new, tar_new, test_size=0.2, random_state=43)

In [70]:
## building model with selected features

featured_forest = RandomForestRegressor(n_estimators=1000, random_state=65, max_depth=10)

featured_forest.fit(train_data_new, train_lab_new)

In [71]:
## predictions of model

train_pred_new = featured_forest.predict(train_data_new)
test_pred_new = featured_forest.predict(test_data_new)


In [72]:
## performance check

# performance of train

performance(train_lab_new, train_pred_new)


mean squared error :  14.616938904806087
mean absolute percentage error :  0.3245456181370912
r2 score :  0.908301452510639


In [73]:
## performance of test

performance(test_lab_new, test_pred_new)

mean squared error :  69.52820872987942
mean absolute percentage error :  0.7560639586417276
r2 score :  0.5623279298259831


## Decision Tree Regressor

In [75]:
## reading data

data = pd.read_csv('cleaned_details.csv')

data = data.iloc[:, 1:]
data.head()

Unnamed: 0,ignition_type,kilometers_driven,ownerNo,modelYear,centralVariantId,price,Insurance Validity,Engine Displacement,Color,Engine Type,...,Drive Type,Steering Type,No Door Numbers,Cargo Volumn,Value Configuration,Turning Radius,Fuel_type,Body_type,Oem,Location
0,0,20000.0,1,2022,8654,11.5,1,998.0,8,369,...,12,0,5.0,392.0,3,5.3,4,7,13,1
1,0,20687.0,1,2015,4025,4.15,0,1196.0,57,278,...,16,2,5.0,540.0,3,4.5,4,5,20,1
2,0,30000.0,1,2021,8135,7.5,1,999.0,86,263,...,12,0,5.0,336.0,8,5.0,4,7,24,1
3,0,59247.0,1,2015,1579,3.98,0,1086.0,116,265,...,12,3,5.0,225.0,8,4.7,4,2,9,1
4,0,50000.0,1,2015,1341,5.5,1,1199.0,86,476,...,12,3,5.0,354.0,8,5.1,4,2,8,1


In [76]:
## train test split

tar = data['price']
val = data.drop('price', axis=1)

In [77]:
train_data, test_data, train_lab, test_lab = train_test_split(val, tar, test_size=0.2, random_state=42)

In [78]:
tree = DecisionTreeRegressor(criterion='squared_error', max_depth=6, random_state=65)

In [79]:
tree.fit(train_data, train_lab)

In [80]:
## testing performance 

# predictions

train_pred = tree.predict(train_data)
test_pred = tree.predict(test_data)

In [81]:
## performance check

# train predictions 

performance(train_lab, train_pred)

mean squared error :  47.494345679347184
mean absolute percentage error :  0.6305897646884718
r2 score :  0.6803884440407639


In [82]:
## test predictions

performance(test_lab, test_pred)

mean squared error :  90.8510814071022
mean absolute percentage error :  0.6803986426154774
r2 score :  0.5491221046955533


## With Feature Selection

In [84]:
## using only important features 

data = data[['Gear Box', 'modelYear', 'Width', 'Kerb Weight', 'Wheel Base', 'Turning Radius', 'Engine Displacement', 'kilometers_driven', 'Length', 'centralVariantId', 'Height', 'Cargo Volumn', 'Engine Type', 'Oem', 'ownerNo', 'Location', 'Body_type', 'Value Configuration', 'Color', 'No of Cylinder', 'price']]

In [85]:
## x and y data selection 

tar = data['price']
val = data.drop('price', axis=1)

## train test split

train_data, test_data, train_lab, test_lab = train_test_split(val, tar, test_size=0.2, random_state=43)

In [86]:
## model building

featured_tree = DecisionTreeRegressor(criterion='squared_error', max_depth=6, random_state=65)

In [87]:
featured_tree.fit(train_data, train_lab)

In [88]:
## testing performance 

# predictions

train_pred_new = featured_tree.predict(train_data)
test_pred_new = featured_tree.predict(test_data)

In [89]:
## train perfromance

performance(train_lab, train_pred_new)

mean squared error :  45.0244691224557
mean absolute percentage error :  0.6071607288030686
r2 score :  0.717541514889191


In [90]:
## test_perfromance

performance(test_lab, test_pred_new)

mean squared error :  96.77178980451856
mean absolute percentage error :  1.0334583251520242
r2 score :  0.39083272312196227


## saving the model

In [92]:
## importing the package 

import pickle 

with open('model.pkl', 'wb') as file:
    pickle.dump(forest, file)

print('Model Saved')

Model Saved


In [93]:
## saving the selected features as csv

featured_data = data[['Gear Box', 'modelYear', 'Width', 'Kerb Weight', 'Wheel Base', 'Turning Radius', 'Engine Displacement', 'kilometers_driven', 'Length', 'centralVariantId', 'Height', 'Cargo Volumn', 'Engine Type', 'Oem', 'ownerNo', 'Location', 'Body_type', 'Value Configuration', 'Color', 'No of Cylinder', 'price']]

featured_data.to_csv("selected_features.csv")

print("DataFrame saved...")

DataFrame saved...
