# Electric Vehicle Population

https://catalog.data.gov/dataset/electric-vehicle-population-data

# predict the `Electric Vehicle Type` based on the rest of the features

# import libraries

In [1]:
import time
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport


# pd.options.display.max_columns = None

# print head of the data

In [2]:
df = pd.read_csv('dataset.csv')

# Generate a report of the data

In [None]:
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
profile.to_notebook_iframe()

# print info of the data

In [3]:
display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186879 entries, 0 to 186878
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         186879 non-null  object 
 1   County                                             186876 non-null  object 
 2   City                                               186876 non-null  object 
 3   State                                              186879 non-null  object 
 4   Postal Code                                        186876 non-null  float64
 5   Model Year                                         186879 non-null  int64  
 6   Make                                               186879 non-null  object 
 7   Model                                              186879 non-null  object 
 8   Electric Vehicle Type                              186879 non-null  object

None

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,WBY8P6C58K,King,Seattle,WA,98115.0,2019,BMW,I3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,153,0,43.0,259254397,POINT (-122.3008235 47.6862671),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033000000.0
1,5YJSA1DN4D,Kitsap,Bremerton,WA,98312.0,2013,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,208,69900,35.0,127420940,POINT (-122.6961203 47.5759584),PUGET SOUND ENERGY INC,53035080000.0
2,5YJSA1E26J,King,Kent,WA,98042.0,2018,TESLA,MODEL S,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,249,0,47.0,170287183,POINT (-122.1145138 47.3581107),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033030000.0
3,WBY2Z2C54E,King,Bellevue,WA,98004.0,2014,BMW,I8,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,14,0,41.0,205545868,POINT (-122.202397 47.619252),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0
4,5YJXCDE23J,King,Bellevue,WA,98004.0,2018,TESLA,MODEL X,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,238,0,41.0,237977386,POINT (-122.202397 47.619252),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0


# how many records does the data have?

In [4]:
display("Number of records: ", df.shape[0])

'Number of records: '

186879

# how many features does the data have?

In my project, I will consider all the features except the `Electric Vehicle
Type` as the feature that I will predict. the rest of the features will be
used to predict the `Electric Vehicle Type`.

In [5]:
display("Number of features: ", df.shape[1] - 1)

'Number of features: '

16

# How many many different classes exist in the dataset?

In [6]:
display("Number of classes: ", len(df['Electric Vehicle Type'].unique()))
display("Number of examples per class:\n",
        df['Electric Vehicle Type'].value_counts())

'Number of classes: '

2

'Number of examples per class:\n'

Battery Electric Vehicle (BEV)            146297
Plug-in Hybrid Electric Vehicle (PHEV)     40582
Name: Electric Vehicle Type, dtype: int64

# show how many NULL values exist in the dataset

In [7]:
display("Number of NULL values per feature:\n", df.isnull().sum())

'Number of NULL values per feature:\n'

VIN (1-10)                                             0
County                                                 3
City                                                   3
State                                                  0
Postal Code                                            3
Model Year                                             0
Make                                                   0
Model                                                  0
Electric Vehicle Type                                  0
Clean Alternative Fuel Vehicle (CAFV) Eligibility      0
Electric Range                                         0
Base MSRP                                              0
Legislative District                                 403
DOL Vehicle ID                                         0
Vehicle Location                                       8
Electric Utility                                       3
2020 Census Tract                                      3
dtype: int64

# Which featiures are not numerical?

In [8]:
display("Non numerical features:\n",
        df.select_dtypes(exclude=[np.number]).columns)

'Non numerical features:\n'

Index(['VIN (1-10)', 'County', 'City', 'State', 'Make', 'Model',
       'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Vehicle Location',
       'Electric Utility'],
      dtype='object')

# Find the best correlated Features in the Dataset

In [9]:
df_tmp = df.copy()
label_encoder = LabelEncoder()
for column in df_tmp.select_dtypes(include=['object']).columns:
    df_tmp[column] = label_encoder.fit_transform(df_tmp[column].astype(str))
df_tmp = (df_tmp - df_tmp.mean()) / df_tmp.std()
correlation = df_tmp.corr()

# Find the best-correlated features in pairs
for column in correlation.columns:
    display(correlation[column].sort_values(ascending=False).head(2))

VIN (1-10)    1.00000
Model Year    0.19042
Name: VIN (1-10), dtype: float64

County              1.000000
Electric Utility    0.117813
Name: County, dtype: float64

City                     1.000000
Electric Vehicle Type    0.046245
Name: City, dtype: float64

State                1.000000
2020 Census Tract    0.998154
Name: State, dtype: float64

Postal Code          1.000000
2020 Census Tract    0.490491
Name: Postal Code, dtype: float64

Model Year                                           1.00000
Clean Alternative Fuel Vehicle (CAFV) Eligibility    0.37193
Name: Model Year, dtype: float64

Make     1.000000
Model    0.220237
Name: Make, dtype: float64

Model                    1.000000
Electric Vehicle Type    0.289535
Name: Model, dtype: float64

Electric Vehicle Type    1.000000
Model                    0.289535
Name: Electric Vehicle Type, dtype: float64

Clean Alternative Fuel Vehicle (CAFV) Eligibility    1.00000
Model Year                                           0.37193
Name: Clean Alternative Fuel Vehicle (CAFV) Eligibility, dtype: float64

Electric Range    1.00000
Base MSRP         0.10903
Name: Electric Range, dtype: float64

Base MSRP         1.00000
Electric Range    0.10903
Name: Base MSRP, dtype: float64

Legislative District    1.00000
Electric Utility        0.20764
Name: Legislative District, dtype: float64

DOL Vehicle ID    1.000000
Model Year        0.156393
Name: DOL Vehicle ID, dtype: float64

Vehicle Location        1.000000
Legislative District    0.159224
Name: Vehicle Location, dtype: float64

Electric Utility        1.00000
Legislative District    0.20764
Name: Electric Utility, dtype: float64

2020 Census Tract    1.000000
State                0.998154
Name: 2020 Census Tract, dtype: float64

# remove the features that have high correlation with each other

the `2020 Census Tract` have the most correlation with the `State`. we will
remove the `2020 Census Tract` feature.

In [10]:
df = df.drop(columns=['2020 Census Tract'])

# remove `DOL Vehicle ID` since it's not useful for the prediction, all the
values are unique

In [11]:
df = df.drop(columns=['DOL Vehicle ID'])

# Drop the `State` feature since it's not useful for the prediction, all the
values are the same

In [12]:
df = df.drop(columns=['State'])

# transform the following features with LabelEncoder

`VIN (1-10)`, `County`, `City`, `State`, `Postal Code`, `Model Year`, `Make`,
`Model`, `Electric Vehicle Type`, `Electric Range`, `Base MSRP`, `Legislative
District`, `Electric Utility`.

In [13]:
label_encoder = LabelEncoder()

for column in ['VIN (1-10)', 'County', 'City', 'Postal Code', 'Model Year',
               'Make', 'Model', 'Electric Vehicle Type', 'Electric Range',
               'Base MSRP', 'Legislative District', 'Electric Utility']:
    df[column] = label_encoder.fit_transform(df[column].astype(str))

# transform the 'Clean Alternative Fuel Vehicle (CAFV) Eligibility` with
OneHotEncoder

In [14]:
one_hot_encoder = OneHotEncoder()
transformed_data = one_hot_encoder.fit_transform(
    df[['Clean Alternative Fuel Vehicle (CAFV) Eligibility']]).toarray()
column_names = one_hot_encoder.categories_[0]
df = pd.concat(
    [df, pd.DataFrame(transformed_data, columns=column_names)], axis=1)

df = df.drop(columns=['Clean Alternative Fuel Vehicle (CAFV) Eligibility'])

# transform the 'Vehicle Location' by splitting the values for the latitude
and longitude. i.e. `POINT (-122.1207376 47.6705374` to `-122.1207376` and
`47.6705374`

In [15]:
cleaned_data = df['Vehicle Location'].str.extract(r'POINT \((?P<Longitude>[-\d.]+) (?P<Latitude>[-\d.]+)')
df['Longitude'] = cleaned_data['Longitude']
df['Latitude'] = cleaned_data['Latitude']
df['Longitude'] = df['Longitude'].astype(float)
df['Latitude'] = df['Latitude'].astype(float)

df = df.drop(columns=['Vehicle Location'])

# drop the rest of the rows that have NULL values

In [16]:
df.dropna(inplace=True)

# simple impute

In [17]:
imputer = SimpleImputer(strategy='mean')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# display the head and info of the data

In [18]:
display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186871 entries, 0 to 186870
Data columns (total 17 columns):
 #   Column                                                        Non-Null Count   Dtype  
---  ------                                                        --------------   -----  
 0   VIN (1-10)                                                    186871 non-null  float64
 1   County                                                        186871 non-null  float64
 2   City                                                          186871 non-null  float64
 3   Postal Code                                                   186871 non-null  float64
 4   Model Year                                                    186871 non-null  float64
 5   Make                                                          186871 non-null  float64
 6   Model                                                         186871 non-null  float64
 7   Electric Vehicle Type                                   

None

Unnamed: 0,VIN (1-10),County,City,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Electric Range,Base MSRP,Legislative District,Electric Utility,Clean Alternative Fuel Vehicle Eligible,Eligibility unknown as battery range has not been researched,Not eligible due to low battery range,Longitude,Latitude
0,9281.0,83.0,581.0,395.0,16.0,4.0,66.0,0.0,19.0,0.0,37.0,57.0,1.0,0.0,0.0,-122.300824,47.686267
1,3574.0,85.0,59.0,490.0,10.0,34.0,82.0,0.0,30.0,23.0,28.0,73.0,1.0,0.0,0.0,-122.69612,47.575958
2,3687.0,83.0,299.0,362.0,15.0,34.0,82.0,0.0,45.0,0.0,41.0,74.0,1.0,0.0,0.0,-122.114514,47.358111
3,8989.0,83.0,44.0,333.0,11.0,4.0,70.0,1.0,14.0,0.0,35.0,74.0,0.0,0.0,1.0,-122.202397,47.619252
4,4268.0,83.0,44.0,333.0,15.0,34.0,83.0,0.0,41.0,0.0,35.0,74.0,1.0,0.0,0.0,-122.202397,47.619252


# normalize the data

In [19]:
scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])

# display the head and info of the data

In [20]:
display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186871 entries, 0 to 186870
Data columns (total 17 columns):
 #   Column                                                        Non-Null Count   Dtype  
---  ------                                                        --------------   -----  
 0   VIN (1-10)                                                    186871 non-null  float64
 1   County                                                        186871 non-null  float64
 2   City                                                          186871 non-null  float64
 3   Postal Code                                                   186871 non-null  float64
 4   Model Year                                                    186871 non-null  float64
 5   Make                                                          186871 non-null  float64
 6   Model                                                         186871 non-null  float64
 7   Electric Vehicle Type                                   

None

Unnamed: 0,VIN (1-10),County,City,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Electric Range,Base MSRP,Legislative District,Electric Utility,Clean Alternative Fuel Vehicle Eligible,Eligibility unknown as battery range has not been researched,Not eligible due to low battery range,Longitude,Latitude
0,2.017198,-0.446484,0.80202,-0.48338,-0.55571,-1.857056,-0.494005,-0.526672,-0.159064,-0.124105,0.760239,-0.22352,1.325175,-1.058752,-0.349137,-0.15307,0.300534
1,-0.266451,-0.400086,-1.572625,0.2226,-2.563898,0.734896,0.073182,-0.526672,0.203881,9.288278,0.122318,0.618811,1.325175,-1.058752,-0.349137,-0.37459,0.16621
2,-0.221234,-0.446484,-0.480834,-0.728616,-0.890408,0.734896,0.073182,-0.526672,0.698807,-0.124105,1.04376,0.671457,1.325175,-1.058752,-0.349137,-0.048665,-0.099065
3,1.900355,-0.446484,-1.640862,-0.944126,-2.2292,-1.857056,-0.352208,1.898714,-0.32404,-0.124105,0.618479,0.671457,-0.754618,-1.058752,2.864202,-0.097913,0.218929
4,0.011253,-0.446484,-1.640862,-0.944126,-0.890408,0.734896,0.108631,-0.526672,0.566827,-0.124105,0.618479,0.671457,1.325175,-1.058752,-0.349137,-0.097913,0.218929


# train data with 5 different classification models

- KNeighborsClassifier
- DecisionTreeClassifier
- RandomForestClassifier
- AdaBoostClassifier
- Naive Bayes

I will save the accuracy and the time it took to train the model for each
model in a dictionary.

# split the data to X and y

In [21]:
X = df.drop(columns=['Electric Vehicle Type'])
y = df['Electric Vehicle Type'].astype(int)  # Ensure y is categorical

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# KNeighborsClassifier

In [22]:
knn = KNeighborsClassifier()
start_time = time.time()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
end_time = time.time()
knn_accuracy = accuracy_score(y_test, knn_pred)

# display(f"KNeighborsClassifier Accuracy: {knn_accuracy}")
# display(f"Training Time: {end_time - start_time} seconds")

models_info_dict = {}

models_info_dict['KNeighborsClassifier'] = {
    'accuracy': knn_accuracy, 'time': end_time - start_time
}

# DecisionTreeClassifier

In [23]:
dt = DecisionTreeClassifier()
start_time = time.time()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
end_time = time.time()
dt_accuracy = accuracy_score(y_test, dt_pred)

# display(f"DecisionTreeClassifier Accuracy: {dt_accuracy}")
# display(f"Training Time: {end_time - start_time} seconds")

models_info_dict['DecisionTreeClassifier'] = {
    'accuracy': dt_accuracy, 'time': end_time - start_time
}

# RandomForestClassifier

In [24]:
rf = RandomForestClassifier()
start_time = time.time()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
end_time = time.time()
rf_accuracy = accuracy_score(y_test, rf_pred)

# display(f"RandomForestClassifier Accuracy: {rf_accuracy}")
# display(f"Training Time: {end_time - start_time} seconds")

models_info_dict['RandomForestClassifier'] = {
    'accuracy': rf_accuracy, 'time': end_time - start_time
}

# AdaBoostClassifier

In [25]:
ada = AdaBoostClassifier()
start_time = time.time()
ada.fit(X_train, y_train)
ada_pred = ada.predict(X_test)
end_time = time.time()
ada_accuracy = accuracy_score(y_test, ada_pred)

# display(f"AdaBoostClassifier Accuracy: {ada_accuracy}")
# display(f"Training Time: {end_time - start_time} seconds")

models_info_dict['AdaBoostClassifier'] = {
    'accuracy': ada_accuracy, 'time': end_time - start_time
}



# Naive Bayes

In [26]:
nb = LogisticRegression()
start_time = time.time()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
end_time = time.time()
nb_accuracy = accuracy_score(y_test, nb_pred)

display(f"Naive Bayes Accuracy: {nb_accuracy}")
display(f"Training Time: {end_time - start_time} seconds")

models_info_dict['Naive Bayes'] = {
    'accuracy': nb_accuracy, 'time': end_time - start_time
}

'Naive Bayes Accuracy: 0.9837859531772575'

'Training Time: 0.21153473854064941 seconds'

# display the models info

In [27]:
display("With random split we reached the following results:")
display(models_info_dict)

'With random split we reached the following results:'

{'KNeighborsClassifier': {'accuracy': 0.9972709030100334,
  'time': 6.462297439575195},
 'DecisionTreeClassifier': {'accuracy': 0.9999732441471572,
  'time': 0.2932779788970947},
 'RandomForestClassifier': {'accuracy': 0.9999197324414716,
  'time': 9.46071195602417},
 'AdaBoostClassifier': {'accuracy': 0.9999464882943144,
  'time': 6.043435335159302},
 'Naive Bayes': {'accuracy': 0.9837859531772575, 'time': 0.21153473854064941}}

# 5-fold approach to measure the performance of the system

In [28]:
# KFold
models_info_dict = {}

kf = KFold(n_splits=5)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
ada = AdaBoostClassifier()
nb = LogisticRegression()

t1 = time.time()
knn_scores = cross_val_score(knn, X, y, cv=kf)
t2 = time.time()
dt_scores = cross_val_score(dt, X, y, cv=kf)
t3 = time.time()
rf_scores = cross_val_score(rf, X, y, cv=kf)
t4 = time.time()
ada_scores = cross_val_score(ada, X, y, cv=kf)
t5 = time.time()
nb_scores = cross_val_score(nb, X, y, cv=kf)
t6 = time.time()

display("With 5-fold approach we reached the following results:")

models_info_dict['KNeighborsClassifier'] = {
    'accuracy': knn_scores.mean(), 'time': t2 - t1
}
models_info_dict['DecisionTreeClassifier'] = {
    'accuracy': dt_scores.mean(), 'time': t3 - t2
}
models_info_dict['RandomForestClassifier'] = {
    'accuracy': rf_scores.mean(), 'time': t4 - t3
}
models_info_dict['AdaBoostClassifier'] = {
    'accuracy': ada_scores.mean(), 'time': t5 - t4
}
models_info_dict['Naive Bayes'] = {
    'accuracy': nb_scores.mean(), 'time': t6 - t5
}

display(models_info_dict)



'With 5-fold approach we reached the following results:'

{'KNeighborsClassifier': {'accuracy': 0.9967410618116113,
  'time': 32.992400884628296},
 'DecisionTreeClassifier': {'accuracy': 0.9999678924038727,
  'time': 1.3860268592834473},
 'RandomForestClassifier': {'accuracy': 0.999930433350819,
  'time': 42.670209884643555},
 'AdaBoostClassifier': {'accuracy': 0.9999250817507136,
  'time': 29.012213945388794},
 'Naive Bayes': {'accuracy': 0.9844224051452792, 'time': 0.9665944576263428}}

# 10 best features from the dataset

In [29]:
best_features = SelectKBest(score_func=f_classif, k=10)
fit = best_features.fit(X, y)
display("10 best features from the dataset:", X.columns[fit.get_support()])
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)
feature_scores = pd.concat([df_columns, df_scores], axis=1)
feature_scores.columns = ['Feature', 'Score']
display(feature_scores.nlargest(10, 'Score'))

'10 best features from the dataset:'

Index(['Postal Code', 'Model Year', 'Make', 'Model', 'Electric Range',
       'Legislative District', 'Electric Utility',
       'Clean Alternative Fuel Vehicle Eligible',
       'Eligibility unknown as battery range has not been researched',
       'Not eligible due to low battery range'],
      dtype='object')

Unnamed: 0,Feature,Score
13,Not eligible due to low battery range,146236.874242
12,Eligibility unknown as battery range has not b...,84322.956212
7,Electric Range,61209.992531
5,Make,32668.850515
6,Model,17095.93784
4,Model Year,4775.48967
11,Clean Alternative Fuel Vehicle Eligible,4305.603226
3,Postal Code,1636.980849
9,Legislative District,936.559519
10,Electric Utility,769.817448
