# Model Training

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
# Read Cleaned.csv (path relative to the notebook's working directory) into a
# DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Cleaned.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Age
0,0,45654403,13328.0,1399.0,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005.0,6.0,Automatic,4x4,4,Left wheel,Silver,12.0,13.0
1,1,44731507,16621.0,1018.0,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000.0,6.0,Tiptronic,4x4,4,Left wheel,Black,8.0,12.0
2,2,45774419,8467.0,0.0,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000.0,4.0,Variator,Front,4,Right-hand drive,Black,2.0,17.0
3,3,45769185,3607.0,862.0,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966.0,4.0,Automatic,4x4,4,Left wheel,White,0.0,12.0
4,4,45809263,11726.0,446.0,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901.0,4.0,Automatic,Front,4,Left wheel,Silver,4.0,9.0


In [3]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15434 entries, 0 to 15433
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        15434 non-null  int64  
 1   ID                15434 non-null  int64  
 2   Price             15434 non-null  float64
 3   Levy              15434 non-null  float64
 4   Manufacturer      15434 non-null  object 
 5   Model             15434 non-null  object 
 6   Prod. year        15434 non-null  int64  
 7   Category          15434 non-null  object 
 8   Leather interior  15434 non-null  object 
 9   Fuel type         15434 non-null  object 
 10  Engine volume     15434 non-null  float64
 11  Mileage           15434 non-null  float64
 12  Cylinders         15434 non-null  float64
 13  Gear box type     15434 non-null  object 
 14  Drive wheels      15434 non-null  object 
 15  Doors             15434 non-null  object 
 16  Wheel             15434 non-null  object

### Preparing X and y values

In [4]:
# Feature matrix: everything in `df` except the columns listed below.
# 'Price' is excluded because it is the prediction target; 'Unnamed: 0' and
# 'ID' look like row/record identifiers; 'Prod. year' and 'Color' are left out
# of the model as well (in the rows previewed above, 'Age' tracks 'Prod. year').
excluded_columns = ['Unnamed: 0', 'ID', 'Price', 'Prod. year', 'Color']
X = df.drop(columns=excluded_columns)
X

Unnamed: 0,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Airbags,Age
0,1399.0,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005.0,6.0,Automatic,4x4,4,Left wheel,12.0,13.0
1,1018.0,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000.0,6.0,Tiptronic,4x4,4,Left wheel,8.0,12.0
2,0.0,HONDA,FIT,Hatchback,No,Petrol,1.3,200000.0,4.0,Variator,Front,4,Right-hand drive,2.0,17.0
3,862.0,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966.0,4.0,Automatic,4x4,4,Left wheel,0.0,12.0
4,446.0,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901.0,4.0,Automatic,Front,4,Left wheel,4.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15429,1850.0,MERCEDES-BENZ,E 350,Sedan,Yes,Diesel,3.5,122874.0,6.0,Automatic,Rear,4,Left wheel,12.0,15.0
15430,1055.0,MERCEDES-BENZ,E 350,Sedan,Yes,Diesel,3.5,107800.0,6.0,Automatic,Rear,4,Left wheel,12.0,10.0
15431,831.0,HYUNDAI,Sonata,Sedan,Yes,Petrol,2.4,161600.0,4.0,Tiptronic,Front,4,Left wheel,8.0,12.0
15432,836.0,HYUNDAI,Tucson,Jeep,Yes,Diesel,2.0,116365.0,4.0,Automatic,Front,4,Left wheel,4.0,13.0


In [5]:
# Target vector: the 'Price' column of `df`, sharing its row index with X.
y = df.loc[:, 'Price']
y

0        13328.0
1        16621.0
2         8467.0
3         3607.0
4        11726.0
          ...   
15429      706.0
15430     5802.0
15431    15681.0
15432    26108.0
15433     5331.0
Name: Price, Length: 15434, dtype: float64

In [6]:
# Label-encode every categorical feature column of X (X is modified in place).
#
# A separate LabelEncoder is fitted for each column and kept in
# `label_encoders`, keyed by column name, so each column's category <-> integer
# mapping stays available afterwards (`classes_`, `transform`,
# `inverse_transform`) instead of being overwritten by the next column's fit.
#
# NOTE: re-running this cell without first rebuilding X would refit the
# encoders on the already-encoded integers and lose the original category names.
# (LabelEncoder is imported in the notebook's import cell.)
cat_columns = ['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',
       'Gear box type', 'Drive wheels', 'Doors', 'Wheel']
label_encoders = {}
for col in cat_columns:
    # `label_encoder` finishes the loop bound to the encoder of the last
    # column, which is the state the previously shared instance ended in.
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder


# Display the encoded DataFrame
X

Unnamed: 0,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Airbags,Age
0,1399.0,31,992,4,1,2,3.5,186005.0,6.0,0,0,1,0,12.0,13.0
1,1018.0,8,515,4,0,5,3.0,192000.0,6.0,2,0,1,0,8.0,12.0
2,0.0,20,537,3,0,5,1.3,200000.0,4.0,3,1,1,1,2.0,17.0
3,862.0,16,518,4,1,2,2.5,168966.0,4.0,0,0,1,0,0.0,12.0
4,446.0,20,537,3,1,5,1.3,91901.0,4.0,0,1,1,0,4.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15429,1850.0,35,466,9,1,1,3.5,122874.0,6.0,0,2,1,0,12.0,15.0
15430,1055.0,35,466,9,1,1,3.5,107800.0,6.0,0,2,1,0,12.0,10.0
15431,831.0,22,1065,9,1,5,2.4,161600.0,4.0,2,1,1,0,8.0,12.0
15432,836.0,22,1142,4,1,1,2.0,116365.0,4.0,0,1,1,0,4.0,13.0


In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Create instances of different regression models.
# The decision tree and the random forest are randomized estimators, so both
# get a fixed seed (the same value used for the train/test split) to make
# their fitted state, and everything derived from them, repeat across runs.
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor(random_state=42)
forest_model = RandomForestRegressor(random_state=42)

# Fit the models on the training data
linear_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
forest_model.fit(X_train, y_train)


In [10]:
# Hyperparameter grids, one per estimator
linear_params = {}  # Linear Regression has nothing to tune, so its grid is empty
tree_params = {'max_depth': [5, 10, 15]}
forest_params = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15]}

# 5-fold cross-validated grid searches, scored by negated mean squared error
linear_gs = GridSearchCV(
    estimator=linear_model,
    param_grid=linear_params,
    scoring='neg_mean_squared_error',
    cv=5,
)
tree_gs = GridSearchCV(
    estimator=tree_model,
    param_grid=tree_params,
    scoring='neg_mean_squared_error',
    cv=5,
)
forest_gs = GridSearchCV(
    estimator=forest_model,
    param_grid=forest_params,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Run every search on the training split
linear_gs.fit(X_train, y_train)
tree_gs.fit(X_train, y_train)
forest_gs.fit(X_train, y_train)


In [11]:
# Best estimator found by each grid search
best_linear_model = linear_gs.best_estimator_
best_tree_model = tree_gs.best_estimator_
best_forest_model = forest_gs.best_estimator_

# Test-set predictions of each tuned estimator
linear_predictions = best_linear_model.predict(X_test)
tree_predictions = best_tree_model.predict(X_test)
forest_predictions = best_forest_model.predict(X_test)

# Test-set mean squared error of each tuned estimator
linear_mse = mean_squared_error(y_test, linear_predictions)
tree_mse = mean_squared_error(y_test, tree_predictions)
forest_mse = mean_squared_error(y_test, forest_predictions)

# Report which estimator reached the lowest test MSE.
# Note that `best_model` holds the winning MSE value, not an estimator.
# Ties are resolved in the order linear -> tree -> forest.
best_model = min(linear_mse, tree_mse, forest_mse)
verdicts = (
    (linear_mse, "Linear Regression is the best model."),
    (tree_mse, "Decision Tree Regression is the best model."),
)
message = next(
    (text for mse, text in verdicts if mse == best_model),
    "Random Forest Regression is the best model.",
)
print(message)


Random Forest Regression is the best model.


In [12]:
# Final model: a random forest with default hyperparameters, fitted on the
# training split. The fixed seed (same value as the train/test split) makes the
# fitted forest, the score below and the pickled artifact repeat across runs.
# NOTE(review): this does not reuse the tuned `best_forest_model` produced by
# the grid search; confirm that training with defaults is intended.
model = RandomForestRegressor(random_state=42)
model.fit(X_train,y_train)
# Score of the final model on the held-out test split (R^2 for regressors).
model.score(X_test,y_test)

0.813161795908304

### Make a mapping from each original category value to its encoded value

In [13]:
# One empty lookup table per categorical column, in `cat_columns` order.
map_list = [dict() for _ in cat_columns]
n = len(map_list)  # number of lookup tables (equals len(cat_columns))


In [14]:
# Fill each lookup table with {original category value: encoded integer}.
# `df` still holds the original values and `X` the encoded integers for the
# same rows, so the two columns are paired position by position in a single
# pass. This avoids one label-based lookup per row and does not depend on the
# index being 0..n-1.
for i, col in enumerate(cat_columns):
    map_list[i].update(zip(df[col], X[col].to_numpy()))

# Prediction pipeline

In [15]:
# Example car to price.
# Categorical fields are translated to their integer codes through `map_list`,
# whose entries follow the order of `cat_columns` (0 = Manufacturer ... 8 = Wheel);
# numeric fields are used as given.
Levy = 0
Manufacturer = map_list[0]['TOYOTA']
Model = map_list[1]['Camry']
Category = map_list[2]['Jeep']
Leather_interior = map_list[3]['Yes']
Fuel_type = map_list[4]['Petrol']

Engine_volume = 2
Mileage = 20000
Cylinders = 1

Gear_box_type = map_list[5]['Automatic']
Drive_wheels = map_list[6]['Front']
Doors = map_list[7]['4']
Wheel = map_list[8]['Left wheel']

Airbags = 4
Age = 5



In [16]:
#Helper function

def PREDICT(List):
    """Predict the price for a single car with the notebook's fitted `model`.

    Parameters
    ----------
    List : iterable of numbers
        One value per feature, in the column order of `X`. Categorical
        features must already be encoded as integers.

    Returns
    -------
    The first (and only) element of `model.predict` for that single row.

    Raises
    ------
    ValueError
        If the number of values differs from the number of columns in `X`.
        A short list used to be zero-padded silently, producing a prediction
        for a car nobody asked about.
    """
    feature_names = list(X.columns)
    values = np.asarray(list(List), dtype=float)
    if len(values) != len(feature_names):
        raise ValueError(
            f"PREDICT expected {len(feature_names)} feature values "
            f"({', '.join(map(str, feature_names))}), got {len(values)}"
        )

    # The model is fitted on a DataFrame, so predict on a one-row DataFrame
    # carrying the same column names rather than on a bare array.
    row = pd.DataFrame([values], columns=feature_names)
    return model.predict(row)[0]

In [17]:
# Price the example car; values are listed in the column order of X.
example_features = [
    Levy, Manufacturer, Model, Category, Leather_interior,
    Fuel_type, Engine_volume, Mileage, Cylinders, Gear_box_type,
    Drive_wheels, Doors, Wheel, Airbags, Age,
]
PREDICT(example_features)



43313.13

# Save the trained model

In [18]:
# Serialize the fitted model to random_forest_model.pkl in the working directory.
# Only unpickle this file from a trusted source: loading a pickle can run code.
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)