# Week 1 Assignment: Electric Vehicle Data Analysis

This notebook cleans Electric Vehicle registration data, caps outliers, engineers features, and then trains, evaluates, and saves a Random Forest regression model.

## 1. Import Libraries

In [None]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from google.colab import files


## 2. Upload Dataset

In [None]:
# Prompt for a file upload. `uploaded` is used as a mapping keyed by the
# uploaded file name, so read whichever file was actually uploaded instead
# of assuming one fixed name. If nothing was uploaded, fall back to the
# original default file name so an already-present copy can still be read.
uploaded = files.upload()
csv_name = next(iter(uploaded), "3ae033f50fa345051652.csv")
df = pd.read_csv(csv_name)
df.head()

## 3. Data Overview

In [None]:
# Structural summary of the raw frame: dimensions, per-column dtype and
# non-null counts (via info), and the number of missing values per column.
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))
df.info()
missing_counts = df.isna().sum()
print("Missing values:\n", missing_counts)

## 4. Handling Outliers in 'Percent Electric Vehicles'

In [None]:
# Cap outliers in the EV-percentage column using the 1.5 * IQR rule:
# values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are pulled back to the
# nearest bound rather than dropped, so no rows are lost here.
pct_col = 'Percent Electric Vehicles'

Q1 = df[pct_col].quantile(0.25)
Q3 = df[pct_col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Print outlier bounds and count
print('Lower bound:', lower_bound)
print('Upper bound:', upper_bound)
# Explicit comparisons (not `between`) so missing values are not counted
# as outliers: NaN compares False on both sides.
outlier_mask = (df[pct_col] < lower_bound) | (df[pct_col] > upper_bound)
outliers = df[outlier_mask]
print("Outliers count:", outliers.shape[0])

# Cap the outliers; clip leaves in-range and missing values untouched.
df[pct_col] = df[pct_col].clip(lower=lower_bound, upper=upper_bound)

## 5. Handling Missing Values

In [None]:
# Convert 'Date' column to datetime; values that cannot be parsed become NaT
# (errors='coerce') and are removed together with other missing dates below.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Remove rows with a missing date or a missing target in one pass.
# .copy() makes the result an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['Date', 'Electric Vehicle (EV) Total']).copy()

# Fill other missing values with an explicit placeholder category
df['County'] = df['County'].fillna('Unknown')
df['State'] = df['State'].fillna('Unknown')
print("Missing after fill:\n", df[['County', 'State']].isnull().sum())

## 6. Feature Engineering

In [None]:
# Label encode the categorical columns with one encoder per column.
# Refitting a single shared encoder would overwrite the County mapping when
# State is fitted, making it impossible to decode County codes later.
label_encoders = {}
for column in ['County', 'State']:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Kept for backward compatibility: `le` previously ended up fitted on 'State'.
le = label_encoders['State']

## 7. Model Building - Random Forest

In [None]:
# Features are every column except the target and the raw 'Date' column.
# NOTE(review): only 'County' and 'State' are encoded in this notebook; if any
# other column is still non-numeric, fit() will fail — confirm against df.info().
X = df.drop(columns=['Electric Vehicle (EV) Total', 'Date'])
y = df['Electric Vehicle (EV) Total']

# Hold out 20% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well: without random_state the bootstrap samples and
# feature choices differ on every run, so the metrics would not be reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

## 8. Evaluation Metrics

In [None]:
# Score the hold-out predictions with the usual regression metrics.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE computed above

# Assemble the report line by line, then emit it with a single print.
report_lines = [
    f'MAE: {mae}',
    f'MSE: {mse}',
    f'RMSE: {rmse}',
    f'R2 Score: {r2}',
]
print('\n'.join(report_lines))

## 9. Save the Model

In [None]:
# Persist the fitted model to disk so it can be reloaded with joblib.load.
joblib.dump(value=model, filename='rf_model.joblib')