# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Loading and Preprocessing

In [5]:
df = pd.read_csv("C:/Users/AmalDev/OneDrive/Desktop/Electric_Vehicle_Population_Data.csv")

## Preprocessing

In [8]:
# Check Rows and Columns
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])

Rows: 247344
Columns: 17


In [10]:
# Clean and View Column Names
df.columns = df.columns.str.strip()
print(df.columns.tolist())

['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year', 'Make', 'Model', 'Electric Vehicle Type', 'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range', 'Base MSRP', 'Legislative District', 'DOL Vehicle ID', 'Vehicle Location', 'Electric Utility', '2020 Census Tract']


In [12]:
# Head of the data
display(df.head())

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,1N4BZ0CP5G,King,Seattle,WA,98125.0,2016,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,84.0,0.0,46.0,349455557,POINT (-122.30253 47.72656),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033000000.0
1,KNDJX3AEXG,King,Renton,WA,98058.0,2016,KIA,SOUL,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,93.0,31950.0,11.0,210641315,POINT (-122.08747 47.4466),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033030000.0
2,5YJ3E1EB2J,King,Seattle,WA,98115.0,2018,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,215.0,0.0,43.0,171417494,POINT (-122.31676 47.68156),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033000000.0
3,1C4RJXN64R,Kitsap,Bremerton,WA,98312.0,2024,JEEP,WRANGLER,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,21.0,0.0,26.0,262542927,POINT (-122.65223 47.57192),PUGET SOUND ENERGY INC,53035080000.0
4,5YJ3E1EB1J,Thurston,Olympia,WA,98512.0,2018,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,215.0,0.0,35.0,110432815,POINT (-122.9131 47.01359),PUGET SOUND ENERGY INC,53067010000.0


In [14]:
# Tail of the data
display(df.tail())

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
247339,7G2CEHED0R,King,Newcastle,WA,98056.0,2024,TESLA,CYBERTRUCK,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0.0,0.0,41.0,278417947,POINT (-122.1805 47.50006),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033030000.0
247340,1C4RJYC65R,Pierce,Puyallup,WA,98374.0,2024,JEEP,GRAND CHEROKEE,Plug-in Hybrid Electric Vehicle (PHEV),Not eligible due to low battery range,25.0,0.0,25.0,271451197,POINT (-122.27575 47.13959),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53053070000.0
247341,7SAYGDEE0P,King,Bothell,WA,98011.0,2023,TESLA,MODEL Y,Battery Electric Vehicle (BEV),Eligibility unknown as battery range has not b...,0.0,0.0,1.0,249563667,POINT (-122.20563 47.76144),PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033020000.0
247342,KNDPYDAH9P,Kitsap,Bainbridge Island,WA,98110.0,2023,KIA,SPORTAGE,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,34.0,0.0,23.0,238968376,POINT (-122.521 47.62728),PUGET SOUND ENERGY INC,53035090000.0
247343,KNDPZDAH8P,Whatcom,Bellingham,WA,98226.0,2023,KIA,SPORTAGE,Plug-in Hybrid Electric Vehicle (PHEV),Clean Alternative Fuel Vehicle Eligible,34.0,0.0,42.0,253768637,POINT (-122.49756 48.7999),PUGET SOUND ENERGY INC||PUD NO 1 OF WHATCOM CO...,53073000000.0


In [18]:
# Dataset Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247344 entries, 0 to 247343
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         247344 non-null  object 
 1   County                                             247340 non-null  object 
 2   City                                               247340 non-null  object 
 3   State                                              247344 non-null  object 
 4   Postal Code                                        247340 non-null  float64
 5   Model Year                                         247344 non-null  int64  
 6   Make                                               247344 non-null  object 
 7   Model                                              247344 non-null  object 
 8   Electric Vehicle Type                              247344 non-null  object

In [20]:
# Descriptive Statistics
print(df.describe())

         Postal Code     Model Year  Electric Range      Base MSRP  \
count  247340.000000  247344.000000   247324.000000  247324.000000   
mean    98180.773999    2021.572450       44.645659     740.206308   
std      2496.155753       3.014246       82.945315    6971.140984   
min      1731.000000    2000.000000        0.000000       0.000000   
25%     98052.000000    2020.000000        0.000000       0.000000   
50%     98125.000000    2023.000000        0.000000       0.000000   
75%     98382.000000    2024.000000       37.000000       0.000000   
max     99577.000000    2026.000000      337.000000  845000.000000   

       Legislative District  DOL Vehicle ID  2020 Census Tract  
count         246791.000000    2.473440e+05       2.473400e+05  
mean              29.039924    2.382157e+08       5.297516e+10  
std               14.857603    6.709585e+07       1.606887e+09  
min                1.000000    4.385000e+03       1.001020e+09  
25%               18.000000    2.089133e+08 

In [22]:
# Check Missing and Duplicate Values
print("Missing values:\n", df.isnull().sum())
print("Duplicate rows:", df.duplicated().sum())

Missing values:
 VIN (1-10)                                             0
County                                                 4
City                                                   4
State                                                  0
Postal Code                                            4
Model Year                                             0
Make                                                   0
Model                                                  0
Electric Vehicle Type                                  0
Clean Alternative Fuel Vehicle (CAFV) Eligibility      0
Electric Range                                        20
Base MSRP                                             20
Legislative District                                 553
DOL Vehicle ID                                         0
Vehicle Location                                      11
Electric Utility                                       4
2020 Census Tract                                      4
dtype: int64
D

In [24]:
# Check Value Counts for Key Categorical Columns
print("Electric Vehicle Type:\n", df['Electric Vehicle Type'].value_counts())
print("\nTop 10 Makes:\n", df['Make'].value_counts().head(10))
print("\nModel Year Distribution:\n", df['Model Year'].value_counts().sort_index())

Electric Vehicle Type:
 Electric Vehicle Type
Battery Electric Vehicle (BEV)            197146
Plug-in Hybrid Electric Vehicle (PHEV)     50198
Name: count, dtype: int64

Top 10 Makes:
 Make
TESLA        105001
CHEVROLET     17840
NISSAN        15892
FORD          13270
KIA           11978
BMW           10370
TOYOTA        10245
HYUNDAI        8048
RIVIAN         7491
VOLVO          6428
Name: count, dtype: int64

Model Year Distribution:
 Model Year
2000        8
2002        2
2003        1
2008       17
2010       23
2011      656
2012     1462
2013     4168
2014     3364
2015     4613
2016     5302
2017     8755
2018    14504
2019    11055
2020    12390
2021    20794
2022    29337
2023    59088
2024    49799
2025    20820
2026     1186
Name: count, dtype: int64


In [26]:
# Unique Value Counts
print("Unique value count per column:\n", df.nunique())

Unique value count per column:
 VIN (1-10)                                            14792
County                                                  215
City                                                    807
State                                                    49
Postal Code                                             998
Model Year                                               21
Make                                                     46
Model                                                   177
Electric Vehicle Type                                     2
Clean Alternative Fuel Vehicle (CAFV) Eligibility         3
Electric Range                                          111
Base MSRP                                                31
Legislative District                                     49
DOL Vehicle ID                                       247344
Vehicle Location                                        997
Electric Utility                                         75
2020 Cen

In [28]:
# Remove Repeated Header Rows (if any)
df = df[df['VIN (1-10)'] != 'VIN (1-10)']
df.reset_index(drop=True, inplace=True)

### Remove Duplicate Records

In [33]:
df.drop_duplicates(inplace=True)

### Handle Missing Values

In [36]:
df['Electric Range'] = pd.to_numeric(df['Electric Range'], errors='coerce')
df['Base MSRP'] = pd.to_numeric(df['Base MSRP'], errors='coerce')

df['Electric Range'].fillna(df['Electric Range'].median(), inplace=True)
df['Base MSRP'].fillna(df['Base MSRP'].median(), inplace=True)
df.fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Electric Range'].fillna(df['Electric Range'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Base MSRP'].fillna(df['Base MSRP'].median(), inplace=True)
  df.fillna("Unknown", inplace=True)


### Fix Data Types

In [39]:
df['Model Year'] = df['Model Year'].astype(int)
df['Postal Code'] = df['Postal Code'].astype(str).str[:5]

### Clean Text Columns

In [42]:
text_cols = ['Make', 'Model', 'City', 'County', 'Electric Vehicle Type', 'Electric Utility']
for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()

### Feature Engineering

In [45]:
current_year = datetime.now().year
df['Vehicle Age'] = current_year - df['Model Year']

df['Range Category'] = pd.cut(df['Electric Range'], bins=[0, 80, 200, 1000],
                              labels=['Short', 'Medium', 'Long'])

df['Is CAFV Eligible'] = df['Clean Alternative Fuel Vehicle (CAFV) Eligibility']\
    .apply(lambda x: 1 if 'eligible' in str(x).lower() else 0)

### Drop Irrelevant Columns

In [48]:
df.drop(columns=['VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location', '2020 Census Tract'],
        errors='ignore', inplace=True)

### Encode Categorical Variables

In [53]:
le = LabelEncoder()
categorical_cols = ['Make', 'Model', 'City', 'County', 'Electric Utility']
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

### Outlier Detection (IQR Method)

In [56]:
numeric_cols = ['Electric Range', 'Base MSRP', 'Vehicle Age']
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

### Skewness Handling (Log Transform)

In [60]:
for col in numeric_cols:
    if df[col].skew() > 1:
        df[col] = np.log1p(df[col])

### Scale Numeric Features

In [63]:
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

### Define Features and Target

In [66]:
X = df.drop(columns=['Electric Vehicle Type'])
y = df['Electric Vehicle Type']

### Train/Test Split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)