In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# -------------------------------
# Load Data
# -------------------------------
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Remove rows where Electric Range = 0
df = df[df["Electric Range"] > 0]

# Drop ID columns (they add noise)
drop_cols = ["VIN", "DOL Vehicle ID", "Vehicle ID"]
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# -------------------------------
# Features & Target
# -------------------------------
X = df.drop("Electric Range", axis=1)
y = df["Electric Range"]

# -------------------------------
# Train Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------------
# Identify column types
# -------------------------------
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# -------------------------------
# Preprocessing pipelines
# -------------------------------
preprocess = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_cols),

        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median"))
        ]), numeric_cols)
    ]
)

# -------------------------------
# Full Model
# -------------------------------
model = Pipeline([
    ("preprocess", preprocess),
    ("regressor", LinearRegression())
])

model.fit(X_train, y_train)

# -------------------------------
# Evaluate
# -------------------------------
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2:", r2_score(y_test, y_pred))


MAE: 91.82802785230105
MSE: 9899.432913058294
RMSE: 99.495893950747
R2: -0.0001399242210036178


In [18]:
print(df.columns)

Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
       'Make', 'Model', 'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
       'Base MSRP', 'Legislative District', 'Vehicle Location',
       'Electric Utility', '2020 Census Tract'],
      dtype='object')


In [19]:
clean_cols = [
    "VIN (1-10)", "County", "City", "State", "Postal Code",
    "Legislative District", "Vehicle Location",
    "Electric Utility", "2020 Census Tract"
]

df = df.drop(columns=[col for col in clean_cols if col in df.columns])

# remove rows with Electric Range = 0
df = df[df["Electric Range"] > 0]

# Features and target
X = df.drop("Electric Range", axis=1)
y = df["Electric Range"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing
categorical_cols = ["Make", "Model", "Electric Vehicle Type"]
numeric_cols = ["Model Year", "Base MSRP"]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_cols),

        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median"))
        ]), numeric_cols)
    ]
)

from sklearn.ensemble import RandomForestRegressor

model = Pipeline([
    ("preprocess", preprocess),
    ("regressor", RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        random_state=42
    ))
])

model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2:", r2_score(y_test, y_pred))


MAE: 1.4794162819507497
MSE: 39.45996641110107
RMSE: 6.281716836271838
R2: 0.9960133587284477
