In [72]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [73]:
# 📌 Step 2: Load the dataset and separate the features from the target

car_sales_data = pd.read_csv("./data/car-sales-extended.csv")

# Features: every column except the target. "Doors" is cast to str so the
# door count is carried as a label rather than as a numeric quantity.
x = car_sales_data.drop(columns="Price").astype({"Doors": str})

# Target: the "Price" column.
y = car_sales_data.loc[:, "Price"]

# Preview the first rows of the features and the target together.
x.head(), y.head()

(     Make Colour  Odometer (KM) Doors
 0   Honda  White          35431     4
 1     BMW   Blue         192714     5
 2   Honda  White          84714     4
 3  Toyota  White         154365     4
 4  Nissan   Blue         181577     3,
 0    15323
 1    19943
 2    28343
 3    13434
 4    14043
 Name: Price, dtype: int64)

In [74]:
# 📌 Step 3: Hold out 20% of the rows for testing (seeded so the split is repeatable)

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes in the order: train features, train target, test features, test target.
tuple(split.shape for split in (x_train, y_train, x_test, y_test))

((800, 4), (800,), (200, 4), (200,))

In [75]:
# Step 4. Define preprocessing (OneHotEncoder + RobustScaler)

categorical_features = ["Make", "Colour", "Doors"]
numerical_features = ["Odometer (KM)"]

# RobustScaler centres on the median and scales by the interquartile range,
# which makes it less sensitive to outliers than StandardScaler.
scaler = RobustScaler()

# handle_unknown="ignore": a category that was not seen while fitting (for
# example one that only occurs in the test split or in new data) is encoded as
# all zeros instead of raising an error at transform/predict time.
one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    [
        ("one_hot", one_hot, categorical_features),
        ("scaler", scaler, numerical_features),
    ],
    remainder="passthrough",  # columns not listed above are passed through unchanged
)

In [76]:
# 📌 Step 5: Fit a Random Forest regressor behind the preprocessing step

# Chain preprocessing and the estimator so a single fit/predict call runs both.
_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        (
            "model",
            RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
        ),
    ]
)

# Optional sanity check (left disabled): inspect the encoded/scaled training
# features on their own, without training the model.
# x_train_transformed = preprocessor.fit_transform(x_train)
# feature_names = preprocessor.get_feature_names_out()
# df_transformed = pd.DataFrame(x_train_transformed, columns=feature_names)
# print(df_transformed.head())

# Fit on the training split, then predict prices for the held-out test split.
y_pred = _pipeline.fit(x_train, y_train).predict(x_test)

In [78]:
# Step 6. Performance Evaluation

# Score the test-split predictions against the true prices.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units
r2 = r2_score(y_test, y_pred)

# Assemble the report first, then emit it with one print call.
report_lines = [
    f"🔹 Regression Performance:",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R² Score: {r2:.2f}",
]
print("\n".join(report_lines))

🔹 Regression Performance:
Mean Absolute Error (MAE): 5836.99
Root Mean Squared Error (RMSE): 7642.34
R² Score: 0.36


In [79]:
# Preview the first rows of the feature frame `x` (all columns except "Price").
x.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [63]:
# Preview the first values of the target Series `y` (the "Price" column).
# NOTE(review): this cell's execution count (63) is lower than the cells above
# it, so its output may be stale — confirm with Restart Kernel → Run All.
y.head()

0    15323
1    19943
2    28343
3    13434
4    14043
Name: Price, dtype: int64