# Problem Statement
As EV adoption accelerates worldwide, governments need to understand which vehicles qualify for clean-fuel incentives and how EV adoption varies across geographies. Your task is to use Washington State’s EV registration dataset to build machine learning models that predict:

__The electric range of an EV given its features.__

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
# Read the Electric Vehicle Population CSV export into a DataFrame
data = pd.read_csv(filepath_or_buffer="Electric_Vehicle_Population_Data.csv")

In [3]:
data

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,5YJ3E1EB5K,Yakima,Yakima,WA,98901.0,2019.0,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220.0,0.0,15.0,347724772.0,POINT (-120.50729 46.60464),PACIFICORP,5.307700e+10
1,1C4RJXU67R,Kitsap,Port Orchard,WA,98367.0,2024.0,JEEP,WRANGLER,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,21.0,0.0,35.0,272165288.0,POINT (-122.68471 47.50524),PUGET SOUND ENERGY INC,5.303509e+10
2,KNDCD3LD0N,Snohomish,Lynnwood,WA,98036.0,2022.0,KIA,NIRO,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,26.0,0.0,32.0,203182584.0,POINT (-122.29245 47.82557),PUGET SOUND ENERGY INC,5.306105e+10
3,5UXKT0C37H,King,Auburn,WA,98001.0,2017.0,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,14.0,0.0,30.0,349010287.0,POINT (-122.23035 47.3074),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),5.303303e+10
4,1N4AZ0CP1D,Skagit,Mount Vernon,,98273.0,2013.0,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,75.0,0.0,40.0,131684150.0,POINT (-122.33891 48.41644),PUGET SOUND ENERGY INC,5.305795e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257630,2C4RC1L73L,Snohomish,Mukilteo,WA,98275.0,2020.0,CHRYSLER,PACIFICA,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,32.0,0.0,21.0,161576068.0,POINT (-122.29196 47.89908),PUGET SOUND ENERGY INC,5.306104e+10
257631,7SAYGDEE8N,King,Kirkland,WA,98033.0,2022.0,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0.0,0.0,48.0,194978738.0,POINT (-122.2066 47.67887),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),5.303302e+10
257632,WBAJB1C53K,Clark,Vancouver,WA,98683.0,2019.0,BMW,530E,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,15.0,55700.0,18.0,278137940.0,POINT (-122.49212 45.60365),BONNEVILLE POWER ADMINISTRATION||PUD NO 1 OF C...,5.301104e+10
257633,JTDKARFP9K,Whatcom,Bellingham,WA,98229.0,2019.0,TOYOTA,PRIUS PRIME (PHEV),Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,25.0,0.0,40.0,177812232.0,POINT (-122.45486 48.7449),PUGET SOUND ENERGY INC||PUD NO 1 OF WHATCOM CO...,5.307300e+10


### Basic checks

In [4]:
data.shape # To check number rows and columns

(257635, 17)

In [5]:
data.info() # To know informations about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257635 entries, 0 to 257634
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         257628 non-null  object 
 1   County                                             257624 non-null  object 
 2   City                                               257624 non-null  object 
 3   State                                              257634 non-null  object 
 4   Postal Code                                        257624 non-null  float64
 5   Model Year                                         257634 non-null  float64
 6   Make                                               257631 non-null  object 
 7   Model                                              257631 non-null  object 
 8   Electric Vehicle Type                              257631 non-null  object

In [6]:
data.describe()

Unnamed: 0,Postal Code,Model Year,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,2020 Census Tract
count,257624.0,257634.0,257630.0,257631.0,257034.0,257634.0,257627.0
mean,98177.613631,2021.713194,43.127889,705.273278,28.897158,240400800.0,52975300000.0
std,2535.936895,3.023824,81.633111,6997.377799,14.882193,66029380.0,1597188000.0
min,1469.0,2000.0,0.0,0.0,1.0,4385.0,1001020000.0
25%,98052.0,2020.0,0.0,0.0,17.0,213432100.0,53033010000.0
50%,98133.0,2023.0,0.0,0.0,32.0,258634000.0,53033030000.0
75%,98380.0,2024.0,35.0,0.0,42.0,273827200.0,53053070000.0
max,99577.0,2026.0,337.0,845000.0,49.0,479254800.0,66010950000.0


In [7]:
data.describe(include="O")

Unnamed: 0,VIN (1-10),County,City,State,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Vehicle Location,Electric Utility
count,257628,257624,257624,257634,257631,257631,257631,257633,257619,257627
unique,15513,226,830,50,46,179,2,3,1029,76
top,7SAYGDEE7P,King,Seattle,WA,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,POINT (-122.13158 47.67858),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA)
freq,1194,128272,40534,257037,107534,53559,205092,157670,6268,92161


### EDA

In [8]:
from ydata_profiling import ProfileReport

# Build an exploratory profiling report titled "EDA" over the full frame
Profile = ProfileReport(
    data,
    title="EDA",
    explorative=True,
)

In [9]:
Profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                           | 0/17 [00:00<?, ?it/s][A
  6%|████▉                                                                              | 1/17 [00:07<01:55,  7.24s/it][A
 12%|█████████▊                                                                         | 2/17 [00:07<00:46,  3.09s/it][A
 18%|██████████████▋                                                                    | 3/17 [00:07<00:26,  1.86s/it][A
 24%|███████████████████▌                                                               | 4/17 [00:08<00:16,  1.26s/it][A
 53%|███████████████████████████████████████████▉                                       | 9/17 [00:09<00:04,  1.68it/s][A
 76%|██████████████████████████████████████████████████████████████▋                   | 13/17 [00:10<00:01,  2.74it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:11<00:00,  1.48it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [10]:
data.isnull().sum()

VIN (1-10)                                             7
County                                                11
City                                                  11
State                                                  1
Postal Code                                           11
Model Year                                             1
Make                                                   4
Model                                                  4
Electric Vehicle Type                                  4
Clean Alternative Fuel Vehicle (CAFV) Eligibility      2
Electric Range                                         5
Base MSRP                                              4
Legislative District                                 601
DOL Vehicle ID                                         1
Vehicle Location                                      16
Electric Utility                                       8
2020 Census Tract                                      8
dtype: int64

In [11]:
data.duplicated().sum()

0

### Data Cleaning

In [12]:
data.columns

Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
       'Make', 'Model', 'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
       'Base MSRP', 'Legislative District', 'DOL Vehicle ID',
       'Vehicle Location', 'Electric Utility', '2020 Census Tract'],
      dtype='object')

We can remove features that are not necessary for our modeling.

In [13]:
# Columns excluded from modeling (identifiers, price, and administrative /
# geographic fields); removed from `data` in place.
columns_to_drop = [
    'VIN (1-10)',
    'Base MSRP',
    'Legislative District',
    'DOL Vehicle ID',
    '2020 Census Tract',
    'County',
    'City',
    'State',
    'Postal Code',
]
data.drop(columns=columns_to_drop, inplace=True)

In [14]:
data.shape

(257635, 8)

In [15]:
data.head()

Unnamed: 0,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Vehicle Location,Electric Utility
0,2019.0,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220.0,POINT (-120.50729 46.60464),PACIFICORP
1,2024.0,JEEP,WRANGLER,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,21.0,POINT (-122.68471 47.50524),PUGET SOUND ENERGY INC
2,2022.0,KIA,NIRO,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,26.0,POINT (-122.29245 47.82557),PUGET SOUND ENERGY INC
3,2017.0,BMW,X5,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,14.0,POINT (-122.23035 47.3074),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA)
4,2013.0,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,75.0,POINT (-122.33891 48.41644),PUGET SOUND ENERGY INC


In [16]:
data.isnull().sum()

Model Year                                            1
Make                                                  4
Model                                                 4
Electric Vehicle Type                                 4
Clean Alternative Fuel Vehicle (CAFV) Eligibility     2
Electric Range                                        5
Vehicle Location                                     16
Electric Utility                                      8
dtype: int64

In [17]:
data['Model Year'].median()

2023.0

In [21]:
# Impute the handful of missing values: median for numeric columns, most
# frequent value (mode) for categorical ones.
#
# The fill is done with ONE frame-level fillna on a {column: value} dict rather
# than `data[col].fillna(..., inplace=True)`. The per-column in-place form is
# chained assignment: it warns in pandas 2.x and silently leaves `data`
# unchanged under Copy-on-Write (the default from pandas 3).
#
# NOTE(review): 'Electric Range' is the prediction target; imputing the target
# is questionable — consider dropping those few rows instead. Kept here so the
# row count stays the same as before.
fill_values = {
    'Model Year': data['Model Year'].median(),
    'Make': data['Make'].mode()[0],
    'Model': data['Model'].mode()[0],
    'Electric Vehicle Type': data['Electric Vehicle Type'].mode()[0],
    'Clean Alternative Fuel Vehicle (CAFV) Eligibility': data['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].mode()[0],
    'Electric Range': data['Electric Range'].median(),
    'Vehicle Location': data['Vehicle Location'].mode()[0],
    'Electric Utility': data['Electric Utility'].mode()[0],
}
data = data.fillna(fill_values)

In [22]:
data.isnull().sum()

Model Year                                           0
Make                                                 0
Model                                                0
Electric Vehicle Type                                0
Clean Alternative Fuel Vehicle (CAFV) Eligibility    0
Electric Range                                       0
Vehicle Location                                     0
Electric Utility                                     0
dtype: int64

In [23]:
data['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].value_counts()

Clean Alternative Fuel Vehicle (CAFV) Eligibility
Eligibility unknown as battery range has not been researched    157672
Clean Alternative Fuel Vehicle Eligible                          76156
Not eligible due to low battery range                            23807
Name: count, dtype: int64

### Preprocessing

In [25]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the text columns listed below.
# A separate fitted encoder is kept per column in `encoders`, so the codes can
# later be decoded (inverse_transform) or applied to new rows (transform).
# Refitting one shared encoder would discard every mapping except the last.
label_columns = ['Make', 'Model', 'Vehicle Location', 'Electric Utility']
encoders = {}
for col in label_columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    encoders[col] = encoder
data.head()


Unnamed: 0,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Vehicle Location,Electric Utility
0,2019.0,39,100,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220.0,301,65
1,2024.0,19,170,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,21.0,636,73
2,2022.0,20,106,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,26.0,472,73
3,2017.0,5,172,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,14.0,447,74
4,2013.0,31,95,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,75.0,503,73


In [26]:
data['Electric Vehicle Type'].value_counts()

Electric Vehicle Type
Battery Electric Vehicle (BEV)            205096
Plug-in Hybrid Electric Vehicle (PHEV)     52539
Name: count, dtype: int64

In [27]:
# Binary flag for the vehicle type: 1 = BEV, 0 = PHEV
ev_type_codes = {
    'Battery Electric Vehicle (BEV)': 1,
    'Plug-in Hybrid Electric Vehicle (PHEV)': 0,
}
data['Electric Vehicle Type'] = data['Electric Vehicle Type'].map(ev_type_codes)

In [28]:
data

Unnamed: 0,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Vehicle Location,Electric Utility
0,2019.0,39,100,1,Clean Alternative Fuel Vehicle Eligible,220.0,301,65
1,2024.0,19,170,0,Not eligible due to low battery range,21.0,636,73
2,2022.0,20,106,0,Not eligible due to low battery range,26.0,472,73
3,2017.0,5,172,0,Not eligible due to low battery range,14.0,447,74
4,2013.0,31,95,1,Clean Alternative Fuel Vehicle Eligible,75.0,503,73
...,...,...,...,...,...,...,...,...
257630,2020.0,9,113,0,Clean Alternative Fuel Vehicle Eligible,32.0,471,73
257631,2022.0,39,103,1,Eligibility unknown as battery range has not b...,0.0,439,74
257632,2019.0,5,3,0,Not eligible due to low battery range,15.0,564,36
257633,2019.0,41,117,0,Not eligible due to low battery range,25.0,549,75


In [34]:
# Preview of the eligibility label collapsed to 1 (eligible) / 0 (otherwise).
# replace() returns a new frame that is only displayed here; `data` itself is
# not modified by this cell.
cafv_codes = {
    "Clean Alternative Fuel Vehicle Eligible": 1,
    "Not eligible due to low battery range": 0,
    "Eligibility unknown as battery range has not been researched": 0,
}
data.replace({"Clean Alternative Fuel Vehicle (CAFV) Eligibility": cafv_codes})

Unnamed: 0,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Vehicle Location,Electric Utility,CAFV_Binary
0,2019.0,39,100,1,1,220.0,301,65,1
1,2024.0,19,170,0,0,21.0,636,73,0
2,2022.0,20,106,0,0,26.0,472,73,0
3,2017.0,5,172,0,0,14.0,447,74,0
4,2013.0,31,95,1,1,75.0,503,73,1
...,...,...,...,...,...,...,...,...,...
257630,2020.0,9,113,0,1,32.0,471,73,1
257631,2022.0,39,103,1,0,0.0,439,74,Eligibility unknown as battery range has not b...
257632,2019.0,5,3,0,0,15.0,564,36,0
257633,2019.0,41,117,0,0,25.0,549,75,0


In [36]:
# Remove the CAFV eligibility columns before modeling.
# No visible cell in this notebook creates 'CAFV_Binary', so a strict drop
# raises KeyError after Restart Kernel -> Run All. errors="ignore" drops
# whichever of the listed columns exist and makes the cell safe to re-run.
drop_columns = ['Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'CAFV_Binary']
data = data.drop(columns=drop_columns, errors="ignore")

In [37]:
data.head()

Unnamed: 0,Model Year,Make,Model,Electric Vehicle Type,Electric Range,Vehicle Location,Electric Utility
0,2019.0,39,100,1,220.0,301,65
1,2024.0,19,170,0,21.0,636,73
2,2022.0,20,106,0,26.0,472,73
3,2017.0,5,172,0,14.0,447,74
4,2013.0,31,95,1,75.0,503,73


### Modeling

In [38]:
# Separate the regression target from the predictor columns
y = data['Electric Range']
x = data.drop(columns=['Electric Range'])

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
x_train.shape

(206108, 6)

In [42]:
y_train.shape

(206108,)

In [43]:
x_test.shape

(51527, 6)

## Linear Regression

In [44]:
from sklearn.linear_model import LinearRegression

# Fit a linear-regression baseline on the training split (fit returns the estimator)
model = LinearRegression().fit(x_train, y_train)
model

In [45]:
y_pred = model.predict(x_test)

In [46]:
y_pred

array([ 18.30036671,  16.44113545, 137.53877492, ...,   1.42785308,
        78.41305176, 130.34583394])

In [51]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from math import sqrt

In [52]:
RMSE= np.sqrt(mean_squared_error(y_test,y_pred))

In [53]:
RMSE

67.6492209639447

### RandomForest

In [50]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples, and therefore the reported RMSE, are reproducible across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)

In [54]:
y_pred_rf = rf.predict(x_test)

In [55]:
y_pred_rf

array([  0.        ,   0.        , 200.        , ...,   0.        ,
       302.46933333,  38.        ])

In [56]:
RMSE_rf= np.sqrt(mean_squared_error(y_test,y_pred_rf))

In [57]:
RMSE_rf

6.476271043044902

### GradientBoosting

In [58]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same trees and report the same RMSE.
GB = GradientBoostingRegressor(random_state=42)
GB.fit(x_train, y_train)

In [59]:
y_pred_GB = GB.predict(x_test)

In [60]:
y_pred_GB

array([  1.97175969,  -0.30241955, 194.54648118, ...,  -0.30241955,
       280.05530933,  39.98428106])

In [61]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_GB))
rmse

14.211656646833543

### DecisionTree

In [64]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree with default hyper-parameters.
# random_state is fixed (same seed as the train/test split): scikit-learn
# permutes features at each split, so without a seed equally good splits can
# be resolved differently from run to run.
dtree = DecisionTreeRegressor(random_state=42)
dtree.fit(x_train, y_train)

In [65]:
dtree_pred = dtree.predict(x_test)
dtree_pred

array([  0.        ,   0.        , 200.        , ...,   0.        ,
       303.33333333,  38.        ])

In [66]:
rmse = np.sqrt(mean_squared_error(y_test, dtree_pred))
rmse

7.057051950419554

## Conclusion

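Comparing all these models by test-set RMSE, the RandomForest regressor performs best (RMSE ≈ 6.48), followed by the decision tree (≈ 7.06), gradient boosting (≈ 14.21) and linear regression (≈ 67.65). Note that more than half of the records have an Electric Range of 0 (the median is 0), so these scores largely reflect how well each model reproduces those zero entries.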