In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the county-level EV population CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='Electric_Vehicle_Population_By_County.csv')
df.head(n=5)  # Output: preview of the first 5 rows of the dataset

In [None]:
# Structural overview of the frame, followed by a per-column missing-value tally
df.info()  # Output: column names, non-null counts, and dtypes
df.isna().sum()  # Output: number of missing values in each column

In [None]:
# Flag EV-percentage outliers with the 1.5 x IQR rule
# Both quartiles are taken in one quantile call and unpacked in order.
Q1, Q3 = df['Percent Electric Vehicles'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# A row is an outlier when its value lies strictly outside [lower_bound, upper_bound].
outliers = df[
    df['Percent Electric Vehicles'].lt(lower_bound)
    | df['Percent Electric Vehicles'].gt(upper_bound)
]
print('Outliers found:', len(outliers))  # Output: count of rows outside the bounds

In [None]:
# Date cleanup and missing handling
# Parse the date column; values that cannot be parsed become NaT (errors='coerce')
# so they are removed together with the other missing dates below.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
# Drop rows that lack a valid date or an EV total. The explicit .copy() makes the
# result an independent frame, so the column assignments that follow never write
# to a slice of the previous frame (which can raise SettingWithCopyWarning).
df = df.dropna(subset=['Date', 'Electric Vehicle (EV) Total']).copy()
# Rows with missing location fields are kept and labelled explicitly.
df['County'] = df['County'].fillna('Unknown')
df['State'] = df['State'].fillna('Unknown')

In [None]:
# Cap outliers
# Clamp EV percentage into [lower_bound, upper_bound]; the bounds are the IQR
# limits computed in the outlier-detection cell. Series.clip is the built-in
# clamp (replacing a nested np.where) and leaves missing values as NaN.
df['Percent Electric Vehicles'] = df['Percent Electric Vehicles'].clip(
    lower=lower_bound, upper=upper_bound
)

# Sanity check: re-apply the outlier filter; no row should fall outside the bounds now.
outliers = df[(df['Percent Electric Vehicles'] < lower_bound) | (df['Percent Electric Vehicles'] > upper_bound)]
print('Outliers after capping:', outliers.shape[0])  # Output: should be 0

In [None]:
# (Optional) Visual check of the capped distribution
# sns.boxplot returns the Axes it drew on, so the title is set directly on it.
sns.boxplot(y='Percent Electric Vehicles', data=df).set_title('Capped EV Percentage')
plt.show()  # Output: boxplot of EV percentage with extreme values capped