# Car Price Prediction

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [2]:
# Load dataset
df = pd.read_csv("car.csv")

In [3]:
df

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,city,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


In [4]:
df.size

2709

# Data Preprocessing

In [5]:
# Drop irrelevant column
df = df.drop('Car_Name', axis=1)

In [6]:
df

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...
296,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


In [7]:
# Derive new feature: car age
df['Car_Age'] = 2025 - df['Year']
df.drop('Year', axis=1, inplace=True)

In [8]:
# Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

In [9]:
df

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Car_Age,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,3.35,5.59,27000,0,11,False,True,False,True
1,4.75,9.54,43000,0,12,True,False,False,True
2,7.25,9.85,6900,0,8,False,True,False,True
3,2.85,4.15,5200,0,14,False,True,False,True
4,4.60,6.87,42450,0,11,True,False,False,True
...,...,...,...,...,...,...,...,...,...
296,9.50,11.60,33988,0,9,True,False,False,True
297,4.00,5.90,60000,0,10,False,True,False,True
298,3.35,11.00,87934,0,16,False,True,False,True
299,11.50,12.50,9000,0,8,True,False,False,True


# Feature & Target Splitting

In [10]:
X = df.drop('Selling_Price', axis=1)
y = df['Selling_Price']

In [11]:
X

Unnamed: 0,Present_Price,Kms_Driven,Owner,Car_Age,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,5.59,27000,0,11,False,True,False,True
1,9.54,43000,0,12,True,False,False,True
2,9.85,6900,0,8,False,True,False,True
3,4.15,5200,0,14,False,True,False,True
4,6.87,42450,0,11,True,False,False,True
...,...,...,...,...,...,...,...,...
296,11.60,33988,0,9,True,False,False,True
297,5.90,60000,0,10,False,True,False,True
298,11.00,87934,0,16,False,True,False,True
299,12.50,9000,0,8,True,False,False,True


In [12]:
y

0       3.35
1       4.75
2       7.25
3       2.85
4       4.60
       ...  
296     9.50
297     4.00
298     3.35
299    11.50
300     5.30
Name: Selling_Price, Length: 301, dtype: float64

# Train-Test Splitting

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train.shape

(240, 8)

In [16]:
y_train.shape

(240,)

In [17]:
X_test.shape

(61, 8)

In [18]:
y_test.shape

(61,)

# Model Training using Random Forest

In [19]:
# Hyperparameter tuning setup
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=10,
                               cv=3, verbose=1, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit model

In [20]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [21]:
best_model = rf_random.best_estimator_

In [22]:
best_model

# Model Evaluation

In [25]:
y_pred = best_model.predict(X_test)
y_pred

array([ 0.41802406, 11.07344948,  4.89056501,  0.22031476,  7.82781783,
        6.30914392,  1.08128312,  0.59530024,  0.45614189,  6.58582077,
        8.10790579,  1.02720837,  8.00266139,  0.45909963,  5.29971257,
        2.76431336,  1.12580667, 13.84004222,  0.462818  ,  1.53620717,
        0.32894976,  7.87179697,  4.69352943,  2.81109974,  0.525197  ,
        3.5123588 ,  5.33813003,  3.12765317,  1.16539108,  1.17261721,
        0.41547175, 10.09558249,  0.44384317,  2.64557996,  7.86197951,
        4.28232202,  6.01894533,  6.46873835,  2.51018618,  7.09638466,
        4.26168369,  3.44742577,  4.91756415,  0.54114389,  6.16026515,
        0.73257097,  8.46583472,  6.68020124,  2.96484226,  3.64468213,
        5.04871852,  1.49012503, 23.06607556, 20.59917037,  6.44317996,
       10.20820798,  5.05456085,  9.11715992,  2.73071687,  6.78030575,
        0.23872894])

In [26]:
mse = mean_squared_error(y_test, y_pred)
mse

0.9521376310562976

In [27]:
rmse = np.sqrt(mse)
rmse

0.9757753999032245

In [28]:
r2 = r2_score(y_test, y_pred)
r2

0.9586666606954283

In [29]:
print(f"Model Evaluation:\nRMSE: {rmse:.2f}\nR² Score: {r2:.2f}")

Model Evaluation:
RMSE: 0.98
R² Score: 0.96


# Save Model for Deployment

In [30]:
joblib.dump(best_model, "car_price_model.pkl")
print("✅ Model saved as 'car_price_model.pkl'")

✅ Model saved as 'car_price_model.pkl'
