# Car Price Prediction

### 1. Import Module

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

### 2. Cleaning Data

In [4]:
df = pd.read_csv("../../datasets/car_price_dataset.csv")
df.head(10)

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867
5,Toyota,Camry,2007,2.7,Petrol,Automatic,157889,4,4,7242
6,Honda,Civic,2010,3.4,Electric,Automatic,139584,3,1,11208
7,Kia,Sportage,2001,4.7,Electric,Semi-Automatic,157495,2,2,7950
8,Kia,Sportage,2014,2.6,Hybrid,Manual,98700,3,4,9926
9,Toyota,RAV4,2005,3.1,Petrol,Manual,107724,2,5,6545


In [12]:
df.duplicated().sum()

np.int64(0)

In [13]:
df.isnull().sum()

Brand           0
Model           0
Year            0
Engine_Size     0
Fuel_Type       0
Transmission    0
Mileage         0
Doors           0
Owner_Count     0
Price           0
dtype: int64

### 3. EDA

In [5]:
df.shape

(10000, 10)

In [6]:
df.columns

Index(['Brand', 'Model', 'Year', 'Engine_Size', 'Fuel_Type', 'Transmission',
       'Mileage', 'Doors', 'Owner_Count', 'Price'],
      dtype='object')

In [7]:
df.dtypes

Brand            object
Model            object
Year              int64
Engine_Size     float64
Fuel_Type        object
Transmission     object
Mileage           int64
Doors             int64
Owner_Count       int64
Price             int64
dtype: object

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         10000 non-null  object 
 1   Model         10000 non-null  object 
 2   Year          10000 non-null  int64  
 3   Engine_Size   10000 non-null  float64
 4   Fuel_Type     10000 non-null  object 
 5   Transmission  10000 non-null  object 
 6   Mileage       10000 non-null  int64  
 7   Doors         10000 non-null  int64  
 8   Owner_Count   10000 non-null  int64  
 9   Price         10000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 781.4+ KB


In [9]:
df.describe()

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2011.5437,3.00056,149239.1118,3.4971,2.9911,8852.9644
std,6.897699,1.149324,86322.348957,1.110097,1.422682,3112.59681
min,2000.0,1.0,25.0,2.0,1.0,2000.0
25%,2006.0,2.0,74649.25,3.0,2.0,6646.0
50%,2012.0,3.0,149587.0,3.0,3.0,8858.5
75%,2017.0,4.0,223577.5,4.0,4.0,11086.5
max,2023.0,5.0,299947.0,5.0,5.0,18301.0


### 4. Data Preprocessing

In [17]:
le = LabelEncoder()
sc = StandardScaler()
df['Brand'] = le.fit_transform(df['Brand'])
df['Model'] = le.fit_transform(df['Model'])
df['Fuel_Type'] = le.fit_transform(df['Fuel_Type'])
df['Transmission'] = le.fit_transform(df['Transmission'])
df['Year'] = sc.fit_transform(df[['Year']])
df['Mileage'] = sc.fit_transform(df[['Mileage']])
df['Price'] = sc.fit_transform(df[['Price']])

In [19]:
X = np.array(df.drop(columns=['Price']))
y = np.array(df['Price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### 4. Model Training

In [20]:
lr = LinearRegression().fit(X_train, y_train)
dt = DecisionTreeRegressor().fit(X_train, y_train)
rf = RandomForestRegressor().fit(X_train, y_train)

### 5. Model Evaluating

In [32]:
y_pred1 = lr.predict(X_test)
y_pred2 = dt.predict(X_test)
y_pred3 = rf.predict(X_test)

mse1 = mean_squared_error(y_test, y_pred1)
mse2 = mean_squared_error(y_test, y_pred2)
mse3 = mean_squared_error(y_test, y_pred3)
r2_1 = r2_score(y_test, y_pred1)
r2_2 = r2_score(y_test, y_pred2)
r2_3 = r2_score(y_test, y_pred3)

print(f"MSE LINEAR REGRESSION: {np.around(mse1, 3)}")
print(f"MSE Decision Tree: {np.around(mse2, 3)}")
print(f"MSE Random Forest: {np.around(mse3, 3)}")
print(f"R2 Score Linear Regression: {np.around(r2_1, 3) * 100} %")
print(f"R2 Decision Tree: {np.around(r2_2, 2) * 100} %")
print(f"R2 Random Forest: {np.around(r2_3, 2) * 100} %")

MSE LINEAR REGRESSION: 0.082
MSE Decision Tree: 0.069
MSE Random Forest: 0.026
R2 Score Linear Regression: 91.8 %
R2 Decision Tree: 93.0 %
R2 Random Forest: 97.0 %


In [36]:
df.head(10)

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,6,24,1.226021,4.2,0,1,1.630075,3,5,-0.113083
1,2,19,0.066156,2.0,2,0,-1.666895,2,3,1.040674
2,7,16,1.226021,4.2,0,0,0.952303,4,2,0.744764
3,0,22,1.66097,2.0,1,1,0.135915,2,1,0.940431
4,9,17,-1.238692,2.6,2,2,1.591543,3,3,-1.923238
5,8,7,-0.65876,2.7,3,0,0.10021,4,4,-0.517589
6,4,8,-0.22381,3.4,1,0,-0.111855,3,1,0.756652
7,6,26,-1.528659,4.7,1,2,0.095645,2,2,-0.290115
8,6,26,0.356122,2.6,2,1,-0.585499,3,4,0.344757
9,8,23,-0.948726,3.1,3,1,-0.480955,2,5,-0.741529


### 6. Model Testing

In [48]:
new_data = pd.DataFrame({
    'Brand': ['Toyota', 'Mercedes'],
    'Model': ['Camry', 'GLA'],
    'Year': [2019, 2017],
    'Engine_Size': [2.8, 3.0],
    'Fuel_Type': ['Diesel', 'Hybrid'],
    'Transmission': ['Automatic', 'Automatic'],
    'Mileage': [3000, 53000],
    'Doors': [4, 4],
    'Owner_Count': [1, 1]
})

In [49]:
new_data['Brand'] = le.fit_transform(new_data['Brand'])
new_data['Model'] = le.fit_transform(new_data['Model'])
new_data['Fuel_Type'] = le.fit_transform(new_data['Fuel_Type'])
new_data['Transmission'] = le.fit_transform(new_data['Transmission'])
new_data['Mileage'] = sc.fit_transform(new_data[['Mileage']])

In [50]:
X_new = np.array(new_data)

In [51]:
hasil = rf.predict(X_new)

In [54]:
hasil = hasil.reshape(-1, 1)

In [55]:
sc.inverse_transform(hasil)

array([[68333.33736409],
       [50139.90115508]])