# Part 1: Data preprocessing

dataset: https://www.kaggle.com/nehalbirla/vehicle-dataset-from-cardekho

## Importing the libraries and dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dataset = pd.read_csv('car data.csv')

## Exploratory Data Analysis

In [3]:
dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
dataset.shape

(301, 9)

In [5]:
# check the columns
dataset.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [6]:
# information about the dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


The result of dataset.info() tells us there are total number of rows and columns, and the name of each column, with the data type It also contains no of not null values in each column

In [7]:
# columns with categorical values
dataset.select_dtypes(include=['object']).columns

Index(['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission'], dtype='object')

In [8]:
len(dataset.select_dtypes(include=['object']).columns)

4

In [6]:
# columns with numerical values
dataset.select_dtypes(include=['int64', 'float64']).columns

Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner'], dtype='object')

In [7]:
len(dataset.select_dtypes(include=['int64', 'float64']).columns)

5

## Dealing with the missing data

In [12]:
# check if there are any null values
dataset.isnull().values.any() # this function returns true and false

False

In [9]:
# check how many null values
dataset.isnull().values.sum()

0

## Encoding the categorical data

In [9]:
dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [10]:
dataset.shape

(301, 9)

In [11]:
# check the number of unique values in each column
print(dataset['Car_Name'].nunique())
print(dataset['Fuel_Type'].nunique())
print(dataset['Seller_Type'].nunique())
print(dataset['Transmission'].nunique())
print(dataset['Owner'].nunique())

98
3
2
2
3


In [12]:
# final dataset
dataset = dataset.drop(columns='Car_Name')# drop the car name column

In [13]:
dataset.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [14]:
# add a column as current year
dataset['Current Year']=2023

In [15]:
dataset.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current Year
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2023
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,2023
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2023
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2023
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,2023


In [16]:
dataset['years_old'] = dataset['Current Year'] - dataset['Year']

In [17]:
dataset.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current Year,years_old
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2023,9
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,2023,10
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2023,6
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2023,12
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,2023,9


In [18]:
dataset = dataset.drop(columns=['Year', 'Current Year'])

In [19]:
dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,years_old
0,3.35,5.59,27000,Petrol,Dealer,Manual,0,9
1,4.75,9.54,43000,Diesel,Dealer,Manual,0,10
2,7.25,9.85,6900,Petrol,Dealer,Manual,0,6
3,2.85,4.15,5200,Petrol,Dealer,Manual,0,12
4,4.6,6.87,42450,Diesel,Dealer,Manual,0,9


In [20]:
#dataset = pd.get_dummies(data=dataset, drop_first=True)
dataset = pd.get_dummies(data=dataset)

In [25]:
dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,years_old,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual
0,3.35,5.59,27000,0,9,0,0,1,1,0,0,1
1,4.75,9.54,43000,0,10,0,1,0,1,0,0,1
2,7.25,9.85,6900,0,6,0,0,1,1,0,0,1
3,2.85,4.15,5200,0,12,0,0,1,1,0,0,1
4,4.6,6.87,42450,0,9,0,1,0,1,0,0,1


In [21]:
dataset.shape

(301, 12)

## Splitting the dataset into train and test set

In [22]:
dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,years_old,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual
0,3.35,5.59,27000,0,9,0,0,1,1,0,0,1
1,4.75,9.54,43000,0,10,0,1,0,1,0,0,1
2,7.25,9.85,6900,0,6,0,0,1,1,0,0,1
3,2.85,4.15,5200,0,12,0,0,1,1,0,0,1
4,4.6,6.87,42450,0,9,0,1,0,1,0,0,1


In [25]:
x = dataset.iloc[:, 1:].values

In [26]:
x.shape
x

array([[5.5900e+00, 2.7000e+04, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [9.5400e+00, 4.3000e+04, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [9.8500e+00, 6.9000e+03, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       ...,
       [1.1000e+01, 8.7934e+04, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [1.2500e+01, 9.0000e+03, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.9000e+00, 5.4640e+03, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]])

In [23]:
y = dataset.iloc[:, 0].values

In [24]:
y.shape

(301,)

In [27]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [28]:
x_train.shape

(240, 11)

In [29]:
x_test.shape

(61, 11)

In [30]:
y_train.shape

(240,)

In [32]:
y_test.shape

(61,)

# Part 2: Building the model

## 1) Multiple linear regression

In [33]:
# linear regression model
from sklearn.linear_model import LinearRegression
regressor_lr = LinearRegression()
regressor_lr.fit(x_train, y_train)

LinearRegression()

In [34]:
y_pred = regressor_lr.predict(x_test)

In [45]:
# np.set_printoptions(precision=2)
# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [40]:
# R^2 (coefficient of determination) regression score function
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8481334543521506

## 2)  Random forest regression

In [41]:
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor()
regressor_rf.fit(x_train, y_train)

RandomForestRegressor()

In [42]:
y_pred = regressor_rf.predict(x_test)

In [43]:
# R^2 (coefficient of determination) regression score function
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9288055522800874

# Part 4: Final model (Random forest)

In [44]:
y_pred = regressor_rf.predict(x_test)

In [45]:
# R^2 (coefficient of determination) regression score function
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9288055522800874

# Part 5: Predict price for a car

In [67]:
dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,years_old,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual
0,3.35,5.59,27000,0,9,0,0,1,1,0,0,1
1,4.75,9.54,43000,0,10,0,1,0,1,0,0,1
2,7.25,9.85,6900,0,6,0,0,1,1,0,0,1
3,2.85,4.15,5200,0,12,0,0,1,1,0,0,1
4,4.6,6.87,42450,0,9,0,1,0,1,0,0,1


In [47]:
single_obs = [[9.54, 3500, 0, 4, 1, 0, 0, 1,0,1,0]]

In [48]:
new_pred = regressor_rf.predict(single_obs)

In [98]:
print(' The predicted selling price for the car is: {} '.format(new_pred))

 The predicted selling price for the car is: [8.08010645] 
