# Car Price Prediction

### Understanding the Data

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the input CSV. Set the CAR_DATA_CSV environment variable to
# point at your own copy of the file; the original author's local path is kept
# as the fallback so existing setups continue to work unchanged.
DATA_PATH = os.environ.get(
    "CAR_DATA_CSV",
    r"E:\1. Tech\Machine Learning Projects\Car Price Predictor\car_data.csv",
)
car_dataset = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to confirm the CSV parsed as expected
car_dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
# Count missing values per column
car_dataset.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [5]:
# Show each column's dtype and non-null count, plus overall memory usage
car_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [6]:
# Summary statistics for the numeric columns (year, selling_price, km_driven)
car_dataset.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


### Encoding the Data

In [7]:
# Select features and target by column name rather than by position, so the
# selection does not silently change if the CSV's column order ever does.
# The features mix integers and strings, so X comes out as an object array;
# the two string columns (fuel, transmission) are label-encoded later.
FEATURE_COLUMNS = ["year", "km_driven", "fuel", "transmission"]
X = car_dataset[FEATURE_COLUMNS].to_numpy()
# Target: the price the car was sold for.
Y = car_dataset["selling_price"].to_numpy()

In [8]:
# Inspect the target array (selling prices)
Y

array([ 60000, 135000, 600000, ..., 110000, 865000, 225000], dtype=int64)

In [9]:
# Inspect the feature matrix; columns are year, km_driven, fuel, transmission
X

array([[2007, 70000, 'Petrol', 'Manual'],
       [2007, 50000, 'Petrol', 'Manual'],
       [2012, 100000, 'Diesel', 'Manual'],
       ...,
       [2009, 83000, 'Petrol', 'Manual'],
       [2016, 90000, 'Diesel', 'Manual'],
       [2016, 40000, 'Petrol', 'Manual']], dtype=object)

In [10]:
from sklearn.preprocessing import LabelEncoder


In [11]:
# Encode the fuel column (column 2 of X) as integer labels.
# The encoder is fitted on the untouched DataFrame column rather than on
# X[:, 2]: this cell overwrites X[:, 2] with the integer codes, so fitting on
# X would make a second run of the cell learn the integers as its classes and
# break later calls such as lb.transform(['Petrol']).
lb = LabelEncoder()
X[:, 2] = lb.fit_transform(car_dataset["fuel"])

In [12]:
# Encode the transmission column (column 3 of X) as integer labels.
# The encoder is fitted on the untouched DataFrame column rather than on
# X[:, 3]: this cell overwrites X[:, 3] with the integer codes, so fitting on
# X would make a second run of the cell learn the integers as its classes and
# break later calls such as lb1.transform(['Manual']).
lb1 = LabelEncoder()
X[:, 3] = lb1.fit_transform(car_dataset["transmission"])

In [13]:
# Fuel is now encoded as the integers 0-4, i.e. five distinct fuel categories
# (in the output below, Petrol is 4 and Diesel is 1)
X[:,2]

array([4, 4, 1, ..., 4, 1, 4], dtype=object)

In [14]:
# Transmission is now encoded as integers (Manual is 1 in the output below)
X[:,3]

array([1, 1, 1, ..., 1, 1, 1], dtype=object)

### Splitting the Data into Train and Test Sets

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

### Training the Model

In [17]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 300 trees; the fixed seed makes repeated fits reproducible
regressor = RandomForestRegressor(
    n_estimators=300,
    random_state=0,
)

In [18]:
# Fit the forest on the training split only; the test split stays unseen
regressor.fit(X_train, Y_train)

RandomForestRegressor(n_estimators=300, random_state=0)

In [19]:
# For a regressor, .score() returns the coefficient of determination (R^2) on
# the given data; it is not a classification accuracy.
r2_test = regressor.score(X_test, Y_test)
# Keep the original variable name so any later cell that reads `accuracy`
# still works.
accuracy = r2_test
accuracy

0.7579783473129759

### Predicting the Price for a Sample Input

In [27]:
# One sample car, described in the same column order as X:
# [year, km_driven, fuel, transmission]
sample_year, sample_km = 2007, 7000
sample_fuel, sample_transmission = 'Petrol', 'Manual'

# Reuse the already-fitted encoders so the sample gets the same integer codes
# the model saw during training.
test_data = [
    sample_year,
    sample_km,
    lb.transform([sample_fuel])[0],
    lb1.transform([sample_transmission])[0],
]


In [28]:
# Check the encoded sample: [year, km_driven, fuel code, transmission code]
test_data

[2007, 7000, 4, 1]

In [29]:
# Wrap the single sample in a list so predict() receives a 2-D input
# (one row, four features); the result is the predicted selling price
regressor.predict([test_data])

array([223331.38888889])