# CAR PRICE DATASET

# Importing all the libabries needed for examining the dataset

In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, VotingClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor

import warnings
warnings.filterwarnings("ignore")


# 1.) Collect the Dataset

In [2]:
df = pd.read_csv("car_price_dataset.csv")

In [3]:
df

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867
...,...,...,...,...,...,...,...,...,...,...
9995,Kia,Optima,2004,3.7,Diesel,Semi-Automatic,5794,2,4,8884
9996,Chevrolet,Impala,2002,1.4,Electric,Automatic,168000,2,1,6240
9997,BMW,3 Series,2010,3.0,Petrol,Automatic,86664,5,1,9866
9998,Ford,Explorer,2002,1.4,Hybrid,Automatic,225772,4,1,4084


# Explanation of all the columns in the dataset

In [4]:
# Brand (Object): Manufacturer of the car (e.g., Kia, Chevrolet, Audi).
# Model (Object): Car model.
# Year (Integer): Manufacturing year.
# Engine_Size (Float): Engine capacity in liters.
# Fuel_Type (Object): Type of fuel (Diesel, Hybrid, Electric, etc.).
# Transmission (Object): Type of transmission (Manual, Automatic, Semi-Automatic).
# Mileage (Integer): Distance traveled by the car (in km).
# Doors (Integer): Number of doors.
# Owner_Count (Integer): Number of previous owners.
# Price (Integer): Price of the car.

# 2.) Data Cleaning and Preprocessing 

In [5]:
df.head()

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867


In [6]:
df.tail()

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
9995,Kia,Optima,2004,3.7,Diesel,Semi-Automatic,5794,2,4,8884
9996,Chevrolet,Impala,2002,1.4,Electric,Automatic,168000,2,1,6240
9997,BMW,3 Series,2010,3.0,Petrol,Automatic,86664,5,1,9866
9998,Ford,Explorer,2002,1.4,Hybrid,Automatic,225772,4,1,4084
9999,Volkswagen,Tiguan,2001,2.1,Diesel,Manual,157882,3,3,3342


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         10000 non-null  object 
 1   Model         10000 non-null  object 
 2   Year          10000 non-null  int64  
 3   Engine_Size   10000 non-null  float64
 4   Fuel_Type     10000 non-null  object 
 5   Transmission  10000 non-null  object 
 6   Mileage       10000 non-null  int64  
 7   Doors         10000 non-null  int64  
 8   Owner_Count   10000 non-null  int64  
 9   Price         10000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 781.4+ KB


In [8]:
df.shape

(10000, 10)

In [9]:
df.columns


Index(['Brand', 'Model', 'Year', 'Engine_Size', 'Fuel_Type', 'Transmission',
       'Mileage', 'Doors', 'Owner_Count', 'Price'],
      dtype='object')

In [10]:
df.describe()

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2011.5437,3.00056,149239.1118,3.4971,2.9911,8852.9644
std,6.897699,1.149324,86322.348957,1.110097,1.422682,3112.59681
min,2000.0,1.0,25.0,2.0,1.0,2000.0
25%,2006.0,2.0,74649.25,3.0,2.0,6646.0
50%,2012.0,3.0,149587.0,3.0,3.0,8858.5
75%,2017.0,4.0,223577.5,4.0,4.0,11086.5
max,2023.0,5.0,299947.0,5.0,5.0,18301.0


In [11]:
df.sample(10)

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
3103,Ford,Explorer,2017,4.1,Electric,Automatic,268157,3,5,11436
7015,Toyota,Camry,2023,4.8,Diesel,Manual,129245,5,4,13215
989,Kia,Sportage,2002,2.1,Diesel,Manual,170511,5,1,3389
6693,Ford,Focus,2003,1.3,Electric,Manual,73415,3,3,6831
9355,Chevrolet,Malibu,2019,4.2,Hybrid,Automatic,167792,5,4,13144
2713,Chevrolet,Malibu,2023,2.8,Electric,Manual,186725,3,2,12065
3321,Toyota,Camry,2016,2.2,Electric,Manual,259242,2,5,7915
3582,Mercedes,E-Class,2014,4.3,Electric,Automatic,95694,4,4,14186
6292,Hyundai,Tucson,2008,2.5,Hybrid,Manual,51676,3,2,8966
8691,Volkswagen,Golf,2011,4.2,Electric,Automatic,290139,2,2,9297


In [12]:
df.isnull().sum()

Brand           0
Model           0
Year            0
Engine_Size     0
Fuel_Type       0
Transmission    0
Mileage         0
Doors           0
Owner_Count     0
Price           0
dtype: int64

In [13]:
df["Brand"].unique()

array(['Kia', 'Chevrolet', 'Mercedes', 'Audi', 'Volkswagen', 'Toyota',
       'Honda', 'BMW', 'Hyundai', 'Ford'], dtype=object)

In [14]:
numerical = ["Engine_Size", "Doors", "Owner_Count", "Price"]
categorical = ["Brand", "Model", "Fuel_Type", "Transmission"]


# 2.) ColumnTransfer

In [15]:
Transformer = ColumnTransformer([
    ("num_tr", StandardScaler(), numerical),  
    ("cat_tr", OneHotEncoder(sparse_output=False), categorical)  
])

In [16]:
transformed_data = Transformer.fit_transform(df)

In [17]:
feature_names = Transformer.get_feature_names_out()
transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

In [18]:
transformed_df

Unnamed: 0,num_tr__Engine_Size,num_tr__Doors,num_tr__Owner_Count,num_tr__Price,cat_tr__Brand_Audi,cat_tr__Brand_BMW,cat_tr__Brand_Chevrolet,cat_tr__Brand_Ford,cat_tr__Brand_Honda,cat_tr__Brand_Hyundai,...,cat_tr__Model_Tiguan,cat_tr__Model_Tucson,cat_tr__Model_X5,cat_tr__Fuel_Type_Diesel,cat_tr__Fuel_Type_Electric,cat_tr__Fuel_Type_Hybrid,cat_tr__Fuel_Type_Petrol,cat_tr__Transmission_Automatic,cat_tr__Transmission_Manual,cat_tr__Transmission_Semi-Automatic
0,1.043657,-0.447821,1.412122,-0.113083,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.870607,-1.348689,0.006256,1.040674,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.043657,0.453046,-0.696677,0.744764,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.870607,-1.348689,-1.399610,0.940431,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.348535,-0.447821,0.006256,-1.923238,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.608597,-1.348689,0.709189,0.009971,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
9996,-1.392679,-1.348689,-1.399610,-0.839523,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
9997,-0.000487,1.353914,-1.399610,0.325479,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
9998,-1.392679,0.453046,-1.399610,-1.532226,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


# 3.) Test_train_Split

In [19]:
X = transformed_df.drop(["num_tr__Price"], axis=1) 
y = transformed_df["num_tr__Price"] 


In [20]:
pca = PCA(n_components = 7)
x_pca = pca.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .1, random_state = 88)

# 4.) Applying Models

# LogisiticRegression

In [50]:
threshold = y_train.median() 
y_train_class = (y_train > threshold).astype(int) 
y_test_class = (y_test > threshold).astype(int)  

lr = LogisticRegression()
lr.fit(X_train, y_train_class)


In [51]:
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)


print(f"Train Accuracy: {accuracy_score(y_train_class, y_pred_train):.2f}")
print(f"Test Accuracy: {accuracy_score(y_test_class, y_pred_test):.2f}")

Train Accuracy: 0.67
Test Accuracy: 0.67


# LinearRegression

In [52]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [57]:
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)


In [58]:
train_mse = mean_squared_error(y_train, y_pred_train)  
test_mse = mean_squared_error(y_test, y_pred_test)     
train_r2 = r2_score(y_train, y_pred_train)             
test_r2 = r2_score(y_test, y_pred_test) 

print(f"Train MSE: {train_mse:.2f}")
print(f"Test MSE: {test_mse:.2f}")
print(f"Train R²: {train_r2:.2f}")
print(f"Test R²: {test_r2:.2f}")

Train MSE: 0.75
Test MSE: 0.72
Train R²: 0.26
Test R²: 0.24


# SVR

In [61]:
svr = SVR()
svr.fit(X_train, y_train)

In [62]:
y_pred_train = svr.predict(X_train)
y_pred_test = svr.predict(X_test)

In [64]:

train_mse = mean_squared_error(y_train, y_pred_train)  
test_mse = mean_squared_error(y_test, y_pred_test)     
train_r2 = r2_score(y_train, y_pred_train)             
test_r2 = r2_score(y_test, y_pred_test)                  

print(f"Train MSE: {train_mse:.2f}")
print(f"Test MSE: {test_mse:.2f}")
print(f"Train R²: {train_r2:.2f}")
print(f"Test R²: {test_r2:.2f}")

Train MSE: 0.65
Test MSE: 0.77
Train R²: 0.36
Test R²: 0.18


# DescisionTreeRegressor

In [72]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

In [73]:
y_pred_train_dt = dt.predict(X_train)
y_pred_test_dt = dt.predict(X_test)

In [74]:
train_mse_dt = mean_squared_error(y_train, y_pred_train_dt)
test_mse_dt = mean_squared_error(y_test, y_pred_test_dt)
train_r2_dt = r2_score(y_train, y_pred_train_dt)
test_r2_dt = r2_score(y_test, y_pred_test_dt)

# Print the results
print(f"Train MSE: {train_mse_dt:.2f}")
print(f"Test MSE: {test_mse_dt:.2f}")
print(f"Train R²: {train_r2_dt:.2f}")
print(f"Test R²: {test_r2_dt:.2f}")

Train MSE: 0.74
Test MSE: 0.74
Train R²: 0.26
Test R²: 0.22


# RandomForestRegressor

In [76]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [77]:
y_pred_train_rf = rf.predict(X_train)
y_pred_test_rf = rf.predict(X_test)

In [78]:
train_mse_rf = mean_squared_error(y_train, y_pred_train_rf)
test_mse_rf = mean_squared_error(y_test, y_pred_test_rf)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)

print(f"Train MSE: {train_mse_rf:.2f}")
print(f"Test MSE: {test_mse_rf:.2f}")
print(f"Train R²: {train_r2_rf:.2f}")
print(f"Test R²: {test_r2_rf:.2f}")

Train MSE: 0.13
Test MSE: 0.82
Train R²: 0.87
Test R²: 0.13
