In [2]:
from src.loader import Loader
loader = Loader.cleaned()
df = loader.get_sample(100000)
df

Unnamed: 0,number_of_motorist_injured,vehicle_type_code1,number_of_pedestrians_injured,crash_time,latitude,contributing_factor_vehicle_1,number_of_pedestrians_killed,longitude,number_of_cyclist_injured,number_of_persons_killed,crash_date,number_of_persons_injured,number_of_cyclist_killed,number_of_motorist_killed
656,1,sedan,0,40,40.72654,Driver Inattention/Distraction,0,-73.715889,0,0,2021-04-15,1,0,0
5913,0,sedan,0,706,40.699993,Other,0,-73.853912,0,0,2021-04-21,0,0,0
8247,0,sedan,0,1365,40.62146,Unspecified,0,-74.071747,0,0,2021-09-15,0,0,0
7785,2,sedan,0,853,40.703163,Other,0,-73.81662,0,0,2021-04-27,2,0,0
9547,1,station wagon/sport utility vehicle,0,0,40.694317,Unspecified,0,-73.737526,0,0,2021-04-30,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603159,4,sedan,0,470,40.632236,View Obstructed/Limited,0,-74.087151,0,0,2021-11-22,4,0,0
99435,0,pick-up truck,0,1050,40.654591,Unspecified,0,-73.864487,0,0,2021-11-13,0,0,0
600881,1,station wagon/sport utility vehicle,0,1352,40.607105,Traffic Control Disregarded,0,-73.968643,0,0,2021-11-17,1,0,0
602471,0,sedan,0,975,40.61132,Other,0,-74.114716,0,0,2021-11-19,0,0,0


Things we can do:

   -    scaling on latitude and longitude
   -    collapse number of * killed or injured
   -    dummies for categoricals
   -    crash date to day of year (maybe roll it)

In [4]:
df.columns

Index(['latitude', 'longitude', 'number_of_pedestrians_killed',
       'vehicle_type_code1', 'number_of_persons_injured',
       'number_of_cyclist_killed', 'number_of_motorist_injured',
       'number_of_persons_killed', 'contributing_factor_vehicle_1',
       'number_of_pedestrians_injured', 'crash_date', 'crash_time',
       'number_of_motorist_killed', 'number_of_cyclist_injured'],
      dtype='object')

In [12]:
from matplotlib import pyplot as plt
rf=df[[ 'number_of_pedestrians_killed', 'number_of_persons_injured',
       'number_of_cyclist_killed', 'number_of_motorist_injured',
       'number_of_persons_killed',
       'number_of_pedestrians_injured',
       'number_of_motorist_killed', 'number_of_cyclist_injured']].sum(axis=1)
rf

656       2
5913      0
8247      0
7785      4
9547      2
         ..
603159    8
99435     0
600881    2
602471    0
604614    2
Length: 100000, dtype: Int64

In [11]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load your DataFrame (replace this with actual loading code)

# Feature Engineering
df["day_of_year"] = pd.to_datetime(df["crash_date"]).dt.dayofyear

def roll_day_of_year(day):
    return (day - 172) % 365  # Rolling to start around summer (June 21)
df["day_of_year"] = df["day_of_year"].apply(roll_day_of_year)

df["total_killed"] = df[["number_of_pedestrians_killed", "number_of_cyclist_killed", "number_of_motorist_killed", "number_of_persons_killed"]].sum(axis=1)
df["total_injured"] = df[["number_of_pedestrians_injured", "number_of_cyclist_injured", "number_of_motorist_injured", "number_of_persons_injured"]].sum(axis=1)

categorical_features = ["vehicle_type_code1", "contributing_factor_vehicle_1"]
numerical_features = ["latitude", "longitude", "day_of_year", "total_killed", "total_injured"]

target = "total_killed"  # Change to your target variable
X = df[numerical_features + categorical_features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define five different pipelines
pipelines = {
    "Pipeline 1: Standard Scaling & OneHot": ColumnTransformer([
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]),
    "Pipeline 2: MinMax Scaling & OneHot": ColumnTransformer([
        ("num", StandardScaler(with_mean=False), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]),
    "Pipeline 3: No Scaling, Just Dummies": ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ], remainder='passthrough'),
    "Pipeline 4: Log Transform & Scaling": ColumnTransformer([
        ("num", Pipeline([
            ("scaler", StandardScaler())
        ]), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]),
    "Pipeline 5: Polynomial Features & Scaling": ColumnTransformer([
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
}

# Evaluate pipelines
results = {}
for name, transformer in pipelines.items():
    model = Pipeline([
        ("preprocessor", transformer),
        ("regressor", LinearRegression())
    ])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

# Print results
for pipeline, mse in results.items():
    print(f"{pipeline}: MSE = {mse:.4f}")


Pipeline 1: Standard Scaling & OneHot: MSE = 0.0000
Pipeline 2: MinMax Scaling & OneHot: MSE = 0.0000
Pipeline 3: No Scaling, Just Dummies: MSE = 0.0000
Pipeline 4: Log Transform & Scaling: MSE = 0.0000
Pipeline 5: Polynomial Features & Scaling: MSE = 0.0000
