In [1]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
# Toy EV registration records used as stand-in data for the rest of the notebook.
# Each tuple in `sample_rows` is one record whose fields follow `sample_columns`.
sample_columns = [
    "Date",
    "Country",
    "State",
    "Vehicle Primary Use",
    "Battery Electric Vehicles (BEVs)",
    "Plug-In Hybrid Electric Vehicles (PHEVs)",
    "Electric Vehicle (EV) Total",
    "Non-Electric Vehicle Total",
    "Total Vehicles",
    "Percent Electric Vehicles",
    "EV_Population",
    "EV_Percentage",
    "Charging_Stations",
    "GDP_Per_Capita",
]
sample_rows = [
    ("2022-09-30", "Brazil", "CA", "Passenger", 7, 0, 7, 460, 467, 1.50, 2500000, 4.1, 50000, 65000),
    ("2022-12-31", "Russia", "VA", "Passenger", 1, 2, 3, 188, 191, 1.57, 7000000, 5.3, 80000, 12000),
    ("2020-01-31", "India", "MN", "Passenger", 0, 1, 1, 32, 33, 3.03, 1500000, 6.0, 20000, 47000),
    ("2022-06-30", "China", "WA", "Truck", 0, 0, 0, 3575, 3575, 0.00, 500000, 0.8, 8000, 2500),
    ("2021-07-31", "South Africa", "CO", "Passenger", 0, 1, 1, 83, 84, 1.19, 400000, 80.0, 12000, 75000),
]

# Pivot the records into a {column name: list of values} mapping, keeping column order.
data = {
    name: list(values)
    for name, values in zip(sample_columns, zip(*sample_rows))
}

# Materialise the mapping as a DataFrame and persist it next to the notebook.
df = pd.DataFrame(data)
df.to_csv("EV_County_Data.csv", index=False)

# Plain-text preview of the first rows.
print("Sample data:", df.head(), sep="\n")


Sample data:
         Date       Country State Vehicle Primary Use  \
0  2022-09-30        Brazil    CA           Passenger   
1  2022-12-31        Russia    VA           Passenger   
2  2020-01-31         India    MN           Passenger   
3  2022-06-30         China    WA               Truck   
4  2021-07-31  South Africa    CO           Passenger   

   Battery Electric Vehicles (BEVs)  Plug-In Hybrid Electric Vehicles (PHEVs)  \
0                                 7                                         0   
1                                 1                                         2   
2                                 0                                         1   
3                                 0                                         0   
4                                 0                                         1   

   Electric Vehicle (EV) Total  Non-Electric Vehicle Total  Total Vehicles  \
0                            7                         460             467   
1

In [5]:
# Dimensions of the frame as a (row count, column count) tuple
df.shape

(5, 14)

In [6]:
# Per-column dtype and non-null count, plus the frame's class, index range and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 14 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Date                                      5 non-null      object 
 1   Country                                   5 non-null      object 
 2   State                                     5 non-null      object 
 3   Vehicle Primary Use                       5 non-null      object 
 4   Battery Electric Vehicles (BEVs)          5 non-null      int64  
 5   Plug-In Hybrid Electric Vehicles (PHEVs)  5 non-null      int64  
 6   Electric Vehicle (EV) Total               5 non-null      int64  
 7   Non-Electric Vehicle Total                5 non-null      int64  
 8   Total Vehicles                            5 non-null      int64  
 9   Percent Electric Vehicles                 5 non-null      float64
 10  EV_Population                             

In [7]:
# Boolean mask of the whole frame: True wherever a cell is missing
df.isna()

Unnamed: 0,Date,Country,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles,EV_Population,EV_Percentage,Charging_Stations,GDP_Per_Capita
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
# Number of missing cells in each column
df.isna().sum()

Unnamed: 0,0
Date,0
Country,0
State,0
Vehicle Primary Use,0
Battery Electric Vehicles (BEVs),0
Plug-In Hybrid Electric Vehicles (PHEVs),0
Electric Vehicle (EV) Total,0
Non-Electric Vehicle Total,0
Total Vehicles,0
Percent Electric Vehicles,0


In [9]:
# IQR fences for 'Percent Electric Vehicles': quartiles -> IQR -> limits at 1.5 * IQR
pct_ev = df['Percent Electric Vehicles']
quartiles = pct_ev.quantile([0.25, 0.75])
Q1 = quartiles.iloc[0]
Q3 = quartiles.iloc[1]
IQR = Q3 - Q1

# Anything strictly outside [lower_bound, upper_bound] counts as an outlier
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print('lower_bound:', lower_bound)
print('upper_bound:', upper_bound)

# Select the rows that fall outside the fences and report how many there are
outside_fences = pct_ev.lt(lower_bound) | pct_ev.gt(upper_bound)
outliers = df.loc[outside_fences]
print("Number of outliers in 'Percent Electric Vehicles':", len(outliers))

lower_bound: 0.6199999999999998
upper_bound: 2.14
Number of outliers in 'Percent Electric Vehicles': 2


In [11]:
# Parse "Date" into datetime values; strings that cannot be parsed become NaT
# (errors='coerce') instead of raising.
date_parsed = pd.to_datetime(df['Date'], errors='coerce')

# Build the cleaned frame in a single chain so the result is an independent
# DataFrame rather than a filtered slice. Assigning a column onto a boolean-filtered
# slice is what triggers pandas' SettingWithCopyWarning.
#   1. swap in the parsed dates,
#   2. drop rows whose date failed to parse or whose target (EV Total) is missing,
#   3. fill missing states with a placeholder label.
df = (
    df.assign(Date=date_parsed)
      .dropna(subset=['Date', 'Electric Vehicle (EV) Total'])
      .assign(State=lambda frame: frame['State'].fillna('Unknown'))
)

# Confirm remaining nulls
print("Missing after fill:")
print(df[['State']].isnull().sum())

df.head()

Missing after fill:
State    0
dtype: int64


Unnamed: 0,Date,Country,State,Vehicle Primary Use,Battery Electric Vehicles (BEVs),Plug-In Hybrid Electric Vehicles (PHEVs),Electric Vehicle (EV) Total,Non-Electric Vehicle Total,Total Vehicles,Percent Electric Vehicles,EV_Population,EV_Percentage,Charging_Stations,GDP_Per_Capita
0,2022-09-30,Brazil,CA,Passenger,7,0,7,460,467,1.5,2500000,4.1,50000,65000
1,2022-12-31,Russia,VA,Passenger,1,2,3,188,191,1.57,7000000,5.3,80000,12000
2,2020-01-31,India,MN,Passenger,0,1,1,32,33,3.03,1500000,6.0,20000,47000
3,2022-06-30,China,WA,Truck,0,0,0,3575,3575,0.0,500000,0.8,8000,2500
4,2021-07-31,South Africa,CO,Passenger,0,1,1,83,84,1.19,400000,80.0,12000,75000


In [12]:
# Cap the outliers - it keeps all the data while reducing the skew from extreme values.
# Values above upper_bound are pulled down to upper_bound and values below lower_bound
# are raised to lower_bound; everything in between (and any NaN) is left unchanged.
# `assign` returns a new frame, so this does not write into a filtered slice of an
# earlier frame (which would raise SettingWithCopyWarning).
pct_ev_col = 'Percent Electric Vehicles'
df = df.assign(**{pct_ev_col: df[pct_ev_col].clip(lower=lower_bound, upper=upper_bound)})

# Re-run the outlier check against the same fences to confirm the capping worked
outliers = df[(df[pct_ev_col] < lower_bound) | (df[pct_ev_col] > upper_bound)]
print("Number of outliers in 'Percent Electric Vehicles':", outliers.shape[0])
assert outliers.empty, "capping should leave no value outside the IQR fences"

Number of outliers in 'Percent Electric Vehicles': 0
