# Second Hand Car Price Prediction


## Importing Libraries and Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
dataset = pd.read_csv("car_details.csv")
dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0


## First Sight at Dataset

In [3]:
dataset.isnull().sum()

Unnamed: 0,0
name,0
year,0
selling_price,0
km_driven,0
fuel,0
seller_type,0
transmission,0
owner,0
mileage,221
engine,221


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 762.1+ KB


## Data Preprocessing


### Dropping NaN Values

In [5]:
# Instead of replacing NaN values with values, we drop rows that include NaN values. Baceuse we have small portion of Nan Values
dataset = dataset.dropna()

### Converting Mileage Column to Float

In [6]:
dataset['mileage'].unique()
def convert_mileage(value):
    if "km/kg" in value:
        return float(value.replace("km/kg", "").strip()) * 1.33
    elif "kmpl" in value:
        return float(value.replace("kmpl", "").strip())
    else:
        return None

dataset['mileage'] = dataset['mileage'].apply(convert_mileage)

### Converting Engine Column to Float

In [7]:
dataset["engine"].unique()
dataset["engine"] = dataset["engine"].str.replace("CC", "").astype(float)


### Converting Max_Power Column to Float

In [8]:
dataset["max_power"].unique()
dataset["max_power"] = dataset["max_power"].str.replace("bhp", "").str.strip()
dataset["max_power"] = dataset["max_power"].replace("", np.nan)  # Replace empty strings with NaN
dataset["max_power"] = pd.to_numeric(dataset["max_power"], errors="coerce")
dataset["max_power"].fillna(dataset["max_power"].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset["max_power"].fillna(dataset["max_power"].median(), inplace=True)


### Ensuring All Types Changed

In [10]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7907 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7907 non-null   object 
 1   year           7907 non-null   int64  
 2   selling_price  7907 non-null   int64  
 3   km_driven      7907 non-null   int64  
 4   fuel           7907 non-null   object 
 5   seller_type    7907 non-null   object 
 6   transmission   7907 non-null   object 
 7   owner          7907 non-null   object 
 8   mileage        7907 non-null   float64
 9   engine         7907 non-null   float64
 10  max_power      7907 non-null   float64
 11  seats          7907 non-null   float64
dtypes: float64(4), int64(3), object(5)
memory usage: 803.1+ KB


## Encoder


In [11]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for col in ["fuel", "seller_type", "transmission", "owner"]:
    dataset[col] = label_encoder.fit_transform(dataset[col])

In [12]:
dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,1,1,1,0,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,3,1,1,4,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,1,1,1,0,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,3,1,1,0,16.1,1298.0,88.2,5.0


In [13]:
dataset["brand"] = dataset["name"].str.split(" ").str[0]
dataset["model"] = dataset["name"].str.split(" ").str[1]

In [14]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

hot_encoder = OneHotEncoder(sparse_output=False)

encoded = hot_encoder.fit_transform(dataset[["brand", "model"]])

encoded_df = pd.DataFrame(encoded, columns=hot_encoder.get_feature_names_out(["brand", "model"]))

dataset = pd.concat([dataset, encoded_df], axis=1)

dataset = dataset.drop(["brand", "model"], axis=1)

dataset.head()


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,...,model_XUV500,model_Xcent,model_Xenon,model_Xylo,model_Yaris,model_Yeti,model_Zen,model_Zest,model_i10,model_i20
0,Maruti Swift Dzire VDI,2014.0,450000.0,145500.0,1.0,1.0,1.0,0.0,23.4,1248.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Skoda Rapid 1.5 TDI Ambition,2014.0,370000.0,120000.0,1.0,1.0,1.0,2.0,21.14,1498.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Honda City 2017-2020 EXi,2006.0,158000.0,140000.0,3.0,1.0,1.0,4.0,17.7,1497.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Hyundai i20 Sportz Diesel,2010.0,225000.0,127000.0,1.0,1.0,1.0,0.0,23.0,1396.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Maruti Swift VXI BSIII,2007.0,130000.0,120000.0,3.0,1.0,1.0,0.0,16.1,1298.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
dataset = dataset.dropna()

## Building Machine Learning Modal

In [16]:
X = dataset.drop(["selling_price", "name"], axis=1)
y = dataset["selling_price"]


In [17]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Model Score:", model.score(X_test, y_test))

Mean Absolute Error: 68960.88155396619
Mean Squared Error: 14323703787.545122
Root Mean Squared Error: 119681.67690814297
Model Score: 0.9810639025776314
