In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [4]:
# Load the car-sales table; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="car_sales_data.csv")

In [5]:
# Preview the first rows (up to 20) to sanity-check column names and value formats.
df.head(n=20)

Unnamed: 0,Manufacturer,Model,Engine size,Fuel type,Year of manufacture,Mileage,Price
0,Ford,Fiesta,1.0,Petrol,2002,127300,3074
1,Porsche,718 Cayman,4.0,Petrol,2016,57850,49704
2,Ford,Mondeo,1.6,Diesel,2014,39190,24072
3,Toyota,RAV4,1.8,Hybrid,1988,210814,1705
4,VW,Polo,1.0,Petrol,2006,127869,4101
5,Ford,Focus,1.4,Petrol,2018,33603,29204
6,Ford,Mondeo,1.8,Diesel,2010,86686,14350
7,Toyota,Prius,1.4,Hybrid,2015,30663,30297
8,VW,Polo,1.2,Petrol,2012,73470,9977
9,Ford,Focus,2.0,Diesel,1992,262514,1049


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Engine size,Year of manufacture,Mileage,Price
count,50000.0,50000.0,50000.0,50000.0
mean,1.773058,2004.20944,112497.3207,13828.90316
std,0.734108,9.645965,71632.515602,16416.681336
min,1.0,1984.0,630.0,76.0
25%,1.4,1996.0,54352.25,3060.75
50%,1.6,2004.0,100987.5,7971.5
75%,2.0,2012.0,158601.0,19026.5
max,5.0,2022.0,453537.0,168081.0


In [7]:
# Dataset dimensions as (number of rows, number of columns).
df.shape

(50000, 7)

In [8]:
# Count missing values per column; all zeros means no imputation step is needed.
df.isna().sum()

Manufacturer           0
Model                  0
Engine size            0
Fuel type              0
Year of manufacture    0
Mileage                0
Price                  0
dtype: int64

In [14]:
# Feature matrix: every descriptive column of a car, in the original column order.
X = df.loc[
    :,
    [
        "Manufacturer",
        "Model",
        "Engine size",
        "Fuel type",
        "Year of manufacture",
        "Mileage",
    ],
]
# Regression target: the sale price.
y = df.loc[:, "Price"]

In [15]:
# Feature groups used by the preprocessing step:
#   - categorical: string-valued columns that get encoded
#   - numerical:   numeric columns that are used as-is
categorical = [
    "Manufacturer",
    "Model",
    "Fuel type",
]
numerical = [
    "Engine size",
    "Year of manufacture",
    "Mileage",
]

In [16]:
# Column-wise preprocessing:
#   "cat" one-hot encodes the categorical columns; categories not seen during
#         fitting are ignored at transform time instead of raising an error.
#   "num" forwards the numerical columns unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numerical),
    ]
)

In [17]:
# End-to-end estimator: preprocessing followed by a random-forest regressor.
# Chaining both in one Pipeline means raw DataFrames can be passed to fit/predict.
# The fixed random_state keeps the 100-tree forest reproducible between runs.
model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", RandomForestRegressor(n_estimators=100, random_state=42)),
    ]
)

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Fit the preprocessing + forest pipeline on the training portion only.
model.fit(X=X_train, y=y_train)

In [20]:
# Predict prices for the held-out rows; these are compared against y_test below.
y_pred = model.predict(X=X_test)

In [21]:
# Score the held-out predictions: mean absolute error (in price units)
# and the coefficient of determination.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE:", mae)
print("R² Score:", r2)

MAE: 288.460149
R² Score: 0.998513978172311


In [22]:
# Example prediction: describe a single car as one record with the same
# column names the model was trained on, then ask the pipeline for its price.
new_car = pd.DataFrame(
    [
        {
            "Manufacturer": "Toyota",
            "Model": "Prius",
            "Engine size": 1.8,
            "Fuel type": "Hybrid",
            "Year of manufacture": 2017,
            "Mileage": 45000,
        }
    ]
)
pred_price = model.predict(new_car)
print("Predicted Price:", pred_price[0])

Predicted Price: 35987.64


In [23]:
# Second example: a one-row frame for a different car, using the training column names.
another_car = pd.DataFrame(
    [
        {
            "Manufacturer": "VW",
            "Model": "Polo",
            "Engine size": 1.2,
            "Fuel type": "Petrol",
            "Year of manufacture": 2015,
            "Mileage": 65000,
        }
    ]
)

# The pipeline returns one prediction per input row; show the only one.
pred_price2 = model.predict(another_car)
print("Predicted Price:", pred_price2[0])

Predicted Price: 12916.2
