# Electric-Car Price Prediction Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Preparation

In [18]:
# create a Data Frame
dataset = pd.read_csv("Electric cars.csv")
df = pd.DataFrame(dataset)
df

Unnamed: 0,Year,BEV average price (USD),Global Sales Volume,Mileage (Km),Lithium Ion Battery Price (USD),Average price of new car
0,2010,64032,50000.0,127,1191,37500
1,2011,51736,60000.0,139,924,37311
2,2012,52084,80000.0,160,726,36874
3,2013,56028,150000.0,189,668,37826
4,2014,44776,224700.0,210,592,37519
5,2015,42340,380100.0,211,384,38240
6,2016,46284,506880.0,233,295,38455
7,2017,44776,846210.0,267,221,38350
8,2018,41412,1436580.0,304,181,38365
9,2019,42804,1708500.0,336,157,40546


In [19]:
# make columns uniform by setting them all to lowercase letters and replacing spaces with underscores
df.columns = df.columns.str.lower().str.replace(" ", "_")
df

Unnamed: 0,year,bev_average_price_(usd),global_sales_volume,mileage_(km),lithium_ion_battery_price_(usd),_average_price_of_new_car
0,2010,64032,50000.0,127,1191,37500
1,2011,51736,60000.0,139,924,37311
2,2012,52084,80000.0,160,726,36874
3,2013,56028,150000.0,189,668,37826
4,2014,44776,224700.0,210,592,37519
5,2015,42340,380100.0,211,384,38240
6,2016,46284,506880.0,233,295,38455
7,2017,44776,846210.0,267,221,38350
8,2018,41412,1436580.0,304,181,38365
9,2019,42804,1708500.0,336,157,40546


In [20]:
# list of all columns whose values are objects
strings = list(df.dtypes[df.dtypes == "object"].index)
strings

[]

No object data type column

In [21]:
# nicely prepared data frame
df.head()

Unnamed: 0,year,bev_average_price_(usd),global_sales_volume,mileage_(km),lithium_ion_battery_price_(usd),_average_price_of_new_car
0,2010,64032,50000.0,127,1191,37500
1,2011,51736,60000.0,139,924,37311
2,2012,52084,80000.0,160,726,36874
3,2013,56028,150000.0,189,668,37826
4,2014,44776,224700.0,210,592,37519


## Exploratory Data Analysis

In [23]:
# check for some information about the dataset such as unique values, number of unique values per column e.t.c.
for col in df.columns:
    print(col)
    print(df[col].unique()[:5])
    print(df[col].nunique())

year
[2010 2011 2012 2013 2014]
13
bev_average_price_(usd)
[64032 51736 52084 56028 44776]
12
global_sales_volume
[ 50000.  60000.  80000. 150000. 224700.]
12
mileage_(km)
[127 139 160 189 210]
13
lithium_ion_battery_price_(usd)
[1191  924  726  668  592]
13
_average_price_of_new_car
[37500 37311 36874 37826 37519]
13


## X and y values

In [4]:
dataset = dataset.dropna(axis=0)

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 0 to 11
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Year                             12 non-null     int64  
 1   BEV average price (USD)          12 non-null     int64  
 2   Global Sales Volume              12 non-null     float64
 3   Mileage (Km)                     12 non-null     int64  
 4   Lithium Ion Battery Price (USD)  12 non-null     int64  
 5    Average price of new car        12 non-null     int64  
dtypes: float64(1), int64(5)
memory usage: 672.0 bytes


In [6]:
X = dataset.iloc[:, :5].values
y = dataset.iloc[:, 5].values

In [7]:
X

array([[2.01000e+03, 6.40320e+04, 5.00000e+04, 1.27000e+02, 1.19100e+03],
       [2.01100e+03, 5.17360e+04, 6.00000e+04, 1.39000e+02, 9.24000e+02],
       [2.01200e+03, 5.20840e+04, 8.00000e+04, 1.60000e+02, 7.26000e+02],
       [2.01300e+03, 5.60280e+04, 1.50000e+05, 1.89000e+02, 6.68000e+02],
       [2.01400e+03, 4.47760e+04, 2.24700e+05, 2.10000e+02, 5.92000e+02],
       [2.01500e+03, 4.23400e+04, 3.80100e+05, 2.11000e+02, 3.84000e+02],
       [2.01600e+03, 4.62840e+04, 5.06880e+05, 2.33000e+02, 2.95000e+02],
       [2.01700e+03, 4.47760e+04, 8.46210e+05, 2.67000e+02, 2.21000e+02],
       [2.01800e+03, 4.14120e+04, 1.43658e+06, 3.04000e+02, 1.81000e+02],
       [2.01900e+03, 4.28040e+04, 1.70850e+06, 3.36000e+02, 1.57000e+02],
       [2.02000e+03, 5.46000e+04, 2.26800e+06, 3.38000e+02, 1.37000e+02],
       [2.02100e+03, 5.29000e+04, 4.79250e+06, 3.49000e+02, 1.32000e+02]])

In [8]:
y

array([37500, 37311, 36874, 37826, 37519, 38240, 38455, 38350, 38365,
       40546, 44021, 49185], dtype=int64)

## Splitting the dataset

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Fitting MLR in our model

In [10]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Make Prediction

In [11]:
prediction = regressor.predict(X_test)
prediction

array([38821.5738429 , 49866.67577636, 36746.36740653])

In [12]:
print("Train score: ", regressor.score(X_train, y_train))
print("Test score: ", regressor.score(X_test, y_test))

Train score:  0.9416130967641753
Test score:  0.9857675790296068


## Predicting and outputing the prediction results using a given set of data

In [13]:
prediction = regressor.predict([[2021, 52900, 4792500.0, 349, 132]])
print(prediction)

[49866.67577636]
