### ✅ Importing libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler

import warnings
warnings.filterwarnings("ignore")

### ✅ Getting raw data from the GitHub repository

In [2]:
# Location of the raw cars CSV that the next cell downloads.
Raw_Data_Url = (
    "https://raw.githubusercontent.com/"
    "Alireza-Esp/CPP_Model/main/Data/CarsData.csv"
)

In [3]:
# Download the CSV and parse it into a DataFrame (needs network access).
Raw_Data = pd.read_csv(filepath_or_buffer=Raw_Data_Url)

In [4]:
# Show the first 20 rows as a quick sanity check of the loaded columns.
Raw_Data.iloc[:20]

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Manufacturer
0,I10,2017,7495,Manual,11630,Petrol,145,60.1,1.0,hyundi
1,Polo,2017,10989,Manual,9200,Petrol,145,58.9,1.0,volkswagen
2,2 Series,2019,27990,Semi-Auto,1614,Diesel,145,49.6,2.0,BMW
3,Yeti Outdoor,2017,12495,Manual,30960,Diesel,150,62.8,2.0,skoda
4,Fiesta,2017,7999,Manual,19353,Petrol,125,54.3,1.2,ford
5,C-HR,2019,26791,Automatic,2373,Hybrid,135,74.3,1.8,toyota
6,Kuga,2019,17990,Manual,7038,Petrol,145,34.4,1.5,ford
7,Tiguan,2019,27490,Semi-Auto,3000,Petrol,145,30.4,2.0,volkswagen
8,Fiesta,2018,9891,Manual,31639,Petrol,145,65.7,1.0,ford
9,A Class,2017,17498,Manual,9663,Diesel,30,62.8,2.1,merc


### ✅ Preprocessing Raw Data

#### 🔹 Fixing some problems in dataset

In [5]:
# Remove the stray leading character in front of every model name.
# lstrip() only removes whitespace, so (unlike slicing with [1:]) re-running
# this cell can never eat real characters of the name.
# NOTE(review): assumes the stray prefix is whitespace — confirm against the CSV.
Raw_Data["model"] = Raw_Data["model"].str.lstrip()

In [6]:
# Remove five rows by index label, rebinding the name rather than mutating
# the frame in place.
# NOTE(review): the reason these particular rows are removed is not recorded
# here — document it.
Raw_Data = Raw_Data.drop(
    index=[32875, 35888, 16899, 87943, 52951],
)

In [7]:
# Renumber the rows 0..n-1 after the drop above.
# drop=True discards the old labels instead of inserting them as an extra
# "index" column, which also makes this cell safe to re-run.
Raw_Data = Raw_Data.reset_index(drop=True)

In [8]:
# Keep only the columns used for modelling, ordered as: categorical features,
# numerical features, then the target (price). Any other column is dropped.
# A single ordered selection replaces the copy / delete / per-column loop.
column_order = [
    "Manufacturer", "model", "transmission", "fuelType",
    "year", "engineSize", "mileage", "mpg", "tax",
    "price",
]
Raw_Data = Raw_Data.loc[:, column_order].copy()

Raw_Data

Unnamed: 0,Manufacturer,model,transmission,fuelType,year,engineSize,mileage,mpg,tax,price
0,hyundi,I10,Manual,Petrol,2017,1.0,11630,60.1,145,7495
1,volkswagen,Polo,Manual,Petrol,2017,1.0,9200,58.9,145,10989
2,BMW,2 Series,Semi-Auto,Diesel,2019,2.0,1614,49.6,145,27990
3,skoda,Yeti Outdoor,Manual,Diesel,2017,2.0,30960,62.8,150,12495
4,ford,Fiesta,Manual,Petrol,2017,1.2,19353,54.3,125,7999
...,...,...,...,...,...,...,...,...,...,...
97702,ford,Fiesta,Automatic,Petrol,2017,1.0,8337,54.3,145,10447
97703,BMW,3 Series,Manual,Diesel,2014,2.0,25372,61.4,30,14995
97704,ford,Fiesta,Manual,Petrol,2017,1.2,19910,54.3,125,8950
97705,vauxhall,Astra,Automatic,Petrol,2017,1.4,24468,50.4,125,10700


#### 🔹 Splitting Raw Data into Categorical and Numerical X and y sets

In [9]:
# Categorical inputs (one-hot encoded in a later cell).
X_cat = Raw_Data[["Manufacturer", "model", "transmission", "fuelType"]]
# Numerical inputs.
X_num = Raw_Data[["year", "engineSize", "mileage", "mpg", "tax"]]
# Target as a 1-D array of shape (n_samples,). Selecting with ["price"] (a
# list) would give an (n_samples, 1) column vector, which scikit-learn
# regressors only accept with a DataConversionWarning.
y = Raw_Data["price"].to_numpy()

#### 🔹 Encoding Categorical features

In [10]:
# One-hot encode the categorical columns and turn the sparse result into a
# dense DataFrame with default integer column names.
# NOTE: X_cat is overwritten by its own encoding, so this cell must be run
# exactly once after X_cat is created.
ohe = OneHotEncoder()
ohe.fit(X_cat)
X_cat = pd.DataFrame(ohe.transform(X_cat).toarray())

#### 🔹 Forming X

0 - 208 : Manufacturer, model, transmission, fuelType [Encoded] \
209 : year \
210 : engineSize \
211 : mileage \
212 : mpg \
213 : tax 

In [11]:
# Put the encoded categorical columns and the numerical columns side by side.
# join() aligns rows on the index, so both frames must share the same row labels.
X = X_cat.join(other=X_num, how="left").to_numpy()
X

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1630e+04, 6.0100e+01,
        1.4500e+02],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.2000e+03, 5.8900e+01,
        1.4500e+02],
       [0.0000e+00, 1.0000e+00, 0.0000e+00, ..., 1.6140e+03, 4.9600e+01,
        1.4500e+02],
       ...,
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 1.9910e+04, 5.4300e+01,
        1.2500e+02],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4468e+04, 5.0400e+01,
        1.2500e+02],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0586e+04, 4.8700e+01,
        1.5000e+02]])

#### 🔹 Splitting X and y into train and test sets

In [12]:
# Hold out 18% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.18,
    random_state=0,
)

#### 🔹 Scaling Data

In [13]:
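# Disabled alternative: per-sample Normalizer scaling (the StandardScaler cell below is used instead).
#no = Normalizer()
#X_train = no.fit_transform(X_train)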

In [14]:
# Learn the scaling statistics from the training rows only, then apply them
# to the training matrix. The same fitted scaler is reused for the test set.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)

### ✅ Training Model

In [15]:
# Seed the forest so retraining reproduces the same trees and the same score
# (0 matches the seed already used for the train/test split).
CPP_Model = RandomForestRegressor(random_state=0)
# np.ravel gives scikit-learn the 1-D target it expects, whether y_train is
# already flat or an (n_samples, 1) column vector.
CPP_Model.fit(X_train, np.ravel(y_train))

#### 🔹 Predicting on Test Data

In [16]:
# Scale the test features with the scaler fitted on the training set, then predict.
X_test_scaled = ss.transform(X_test)
y_pred = CPP_Model.predict(X_test_scaled)

#### 🔹 Evaluating Model (R2-Score)

In [17]:
# Report R² as a percentage with one decimal place.
# Percent formatting avoids float artefacts from `round(3) * 100`
# (e.g. 56.99999999999999) and does not require the score to be a NumPy
# scalar with a `.round` method.
print(f"{r2_score(y_test, y_pred):.1%}")

95.7%
