In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [4]:
# Read the car listings CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='car_price_dataset.csv')

In [5]:
# Bare last expression: the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0,Brand,Year,Fuel_Type,Transmission,Owner_Type,Kilometers_Driven,Price
0,Honda,2021,Hybrid,Manual,Second,69887,702587.5
1,Hyundai,2020,Hybrid,Automatic,Second,24477,599452.5
2,BMW,2005,Petrol,Automatic,Second,153338,1252076.0
3,Hyundai,2005,Electric,Automatic,Second,132815,426315.5
4,Hyundai,2022,Electric,Automatic,First,179598,570409.0
...,...,...,...,...,...,...,...
995,Chevrolet,2005,Diesel,Manual,Second,23074,489755.0
996,Hyundai,2017,Diesel,Manual,First,151621,564534.5
997,Toyota,2018,Petrol,Manual,Third,34963,713528.5
998,Toyota,2021,Petrol,Manual,Third,57265,697305.5


In [6]:
# Split the frame into the target (y) and the predictors (X).
#
# The CSV carries its saved row index as an "Unnamed: 0" column. It is only a
# row number, but any non-categorical column left in X is passed straight to
# the regressor by the ColumnTransformer's remainder='passthrough', and the
# fitted pipeline would then demand that column again at predict time.
# Exclude it from the features; errors="ignore" keeps this cell working if the
# CSV is later re-exported without the index column.
y = df["Price"]
X = df.drop(columns=["Price"]).drop(columns=["Unnamed: 0"], errors="ignore")

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Names of the columns that are treated as categorical features.
categorical_features = [
    'Brand',
    'Fuel_Type',
    'Transmission',
    'Owner_Type',
]

In [9]:
# One-hot encode the categorical columns; every other column in the input is
# forwarded to the next pipeline step unchanged (remainder='passthrough').
# handle_unknown='ignore' lets transform() accept categories not seen during fit.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_encoder, categorical_features)],
    remainder='passthrough',
)

In [10]:
# Chain the preprocessing step and the regressor so they are fitted and applied as one estimator.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])

In [11]:
# Fit the whole pipeline (encoder + forest) on the training split only.
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
# Predict on the held-out rows and score the predictions against the true prices.
y_pred = model.predict(X_test)
mae, r2 = (
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

In [13]:
# Report both hold-out metrics, one per line.
print(
    f"Mean Absolute Error: ₹{mae:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Mean Absolute Error: ₹33660.14
R² Score: 0.9838


In [14]:
import pickle
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Persist the pipeline that was already fitted and evaluated earlier in this
# notebook. This cell previously rebuilt and refitted an identical copy of it,
# which doubled the training time and allowed the saved model to drift from
# the evaluated one if either copy was edited.
#
# NOTE: only unpickle this file from a trusted source, and load it with the
# same scikit-learn version that wrote it.
MODEL_PATH = 'car_price_model.pkl'

with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model, f)

print(f"Model saved as {MODEL_PATH}")

Model saved as car_price_model.pkl
