In [17]:
import pandas as pd
import numpy as np
from joblib import dump
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from modules.utils import convert_lv95_to_wgs84

In [18]:
# Read the merged accident table from the cleaned-data folder; every later
# cell in this notebook derives its data from this frame.
accidents = pd.read_csv('../data/clean/merged.csv')

In [19]:
# Row count per severity category; this column is used as the target `y` below.
accidents['AccidentSeverityCategory'].value_counts()

AccidentSeverityCategory
as4    43036
as3    12128
as2     2341
as1       74
Name: count, dtype: int64

In [20]:
# Export the accident coordinates plus road type for the inference side.
# An explicit copy is handed to the helper so it works on an independent frame:
# passing a column selection of `accidents` directly can raise
# SettingWithCopyWarning if the helper assigns columns in place.
# NOTE(review): the helper's internals are not visible here; x / y are
# presumably LV95 coordinates, going by the helper's name.
locations_lv95 = accidents[['x', 'y', 'RoadType']].copy()
locations = convert_lv95_to_wgs84(locations_lv95)
locations.to_csv('../data/inference/locations.csv', index=False)



In [19]:
# Columns excluded from the feature matrix; the last entry is the target itself.
drop_features = ['x', 'y', 'year', 'dateTime', 'AccidentSeverityCategory']

# `columns=` already names the axis, so the redundant `axis=1` is not passed.
X = accidents.drop(columns=drop_features)
y = accidents['AccidentSeverityCategory']

# Fixed seed so the split is reproducible across runs.
rs = 1

# 80/20 split, stratified on the target so both parts keep the same class
# proportions (the severity categories are heavily imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rs, stratify=y)

# Sanity check: the two parts together account for every row.
assert len(X_train) + len(X_test) == len(X)

In [20]:
# List the feature columns that remain after removing `drop_features`.
X_train.columns

Index(['AccidentType', 'AccidentInvolvingPedestrian',
       'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle', 'RoadType',
       'month', 'weekday', 'hour', 'traffic_volume_0_period_0',
       'traffic_volume_1_period_0', 'traffic_volume_2_period_0',
       'traffic_volume_0_period_1', 'traffic_volume_1_period_1',
       'traffic_volume_2_period_1', 'pedestrian_volume_0_period_0',
       'pedestrian_volume_1_period_0', 'pedestrian_volume_2_period_0',
       'pedestrian_volume_0_period_1', 'pedestrian_volume_1_period_1',
       'pedestrian_volume_2_period_1', 'temperature_2m_period_0',
       'precipitation_period_0', 'snowfall_period_0', 'snow_depth_period_0',
       'surface_pressure_period_0', 'cloud_cover_period_0',
       'temperature_2m_period_1', 'precipitation_period_1',
       'snowfall_period_1', 'snow_depth_period_1', 'surface_pressure_period_1',
       'cloud_cover_period_1', 'temperature_2m_period_2',
       'precipitation_period_2', 'snowfall_period_2', 'snow_dep

In [21]:
# Numeric columns follow a regular naming scheme, so they are generated rather
# than spelled out. The resulting order is identical to the hand-written list:
# volume counters first (grouped by kind, then period, then counter index),
# followed by the weather variables for periods 0..3.
volume_features = [
    f'{kind}_volume_{counter}_period_{period}'
    for kind in ('traffic', 'pedestrian')
    for period in range(2)
    for counter in range(3)
]

weather_variables = (
    'temperature_2m',
    'precipitation',
    'snowfall',
    'snow_depth',
    'surface_pressure',
    'cloud_cover',
)
weather_features = [
    f'{variable}_period_{period}'
    for period in range(4)
    for variable in weather_variables
]

numerical_features = volume_features + weather_features

# Columns that are one-hot encoded by the preprocessor.
categorical_features = [
    'AccidentType',
    'AccidentInvolvingPedestrian',
    'AccidentInvolvingBicycle',
    'AccidentInvolvingMotorcycle',
    'RoadType',
    'month',
    'weekday',
    'hour',
]

In [22]:
# Numeric columns are standardised with StandardScaler's default settings.
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns are one-hot encoded. With handle_unknown='ignore', a
# category not seen during fit encodes as all zeros instead of raising.
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Output column order follows the transformer order: numeric block first,
# then the one-hot block.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    # Always return a dense array. With the default threshold (0.3) the
    # transformer switches to a sparse matrix once the stacked output is
    # sparse enough, and the later pd.DataFrame(..., columns=...) wrapping
    # needs a dense 2-D array.
    sparse_threshold=0,
)

In [23]:
# Fit the preprocessor on the training rows only, then apply the fitted
# transform to the test rows.
#
# The raw rows are re-selected from the untouched `X` using the row labels
# that y_train / y_test kept from the split, instead of reading X_train /
# X_test. This cell rebinds those two names to the transformed arrays, so
# reading them here would make a second run of the cell fail.
# Assumes `X` has unique row labels (true for the default index of read_csv).
X_train = preprocessor.fit_transform(X.loc[y_train.index])
X_test = preprocessor.transform(X.loc[y_test.index])

# Persist the fitted preprocessor so inference can apply the same transform.
dump(preprocessor, '../data/inference/preprocessor.pkl')

['../data/inference/preprocessor.pkl']

In [33]:
# Ask the fitted encoder for the names of the one-hot columns it produced.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_onehot_features = onehot_encoder.get_feature_names_out(categorical_features)

# Same order as the preprocessor output: numeric block, then one-hot block.
processed_features = [*numerical_features, *cat_onehot_features]

# Wrap the transformed arrays in labelled frames.
# NOTE: the frames get a fresh 0..n-1 index, so they line up with
# y_train / y_test by row position only, not by index label.
X_train = pd.DataFrame(data=X_train, columns=processed_features)
X_test = pd.DataFrame(data=X_test, columns=processed_features)

In [34]:
# Collapse the severity categories into a binary target:
# 1 = any category other than 'as4', 0 = 'as4' (the majority class;
# presumably the least severe one -- confirm against the data dictionary).
#
# The labels are read from the untouched `y` via the row labels of each split
# rather than from y_train / y_test themselves. Otherwise a second run of this
# cell would compare the already-binarised ints with 'as4' and silently turn
# every label into 1.
y_train = (y.loc[y_train.index] != 'as4').astype(int)
y_test = (y.loc[y_test.index] != 'as4').astype(int)

In [35]:
# Write the preprocessed feature frames for the training stage. The index is
# omitted, so rows are identified by position only.
X_train.to_csv("../data/training/X_train.csv", index=False)
X_test.to_csv("../data/training/X_test.csv", index=False)

In [36]:

# Write the binary targets next to the feature files. The index is omitted,
# so each row pairs with the feature row at the same position.
y_train.to_csv("../data/training/y_train.csv", index=False)
y_test.to_csv("../data/training/y_test.csv", index=False)

In [37]:
# Display the preprocessed training frame: numeric columns first, followed by
# the one-hot columns.
X_train

Unnamed: 0,traffic_volume_0_period_0,traffic_volume_1_period_0,traffic_volume_2_period_0,traffic_volume_0_period_1,traffic_volume_1_period_1,traffic_volume_2_period_1,pedestrian_volume_0_period_0,pedestrian_volume_1_period_0,pedestrian_volume_2_period_0,pedestrian_volume_0_period_1,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,-0.343188,-0.259128,-0.527920,-0.110299,0.170852,-0.081870,0.150599,0.209563,0.466944,-0.279617,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.409243,-0.141882,0.324138,0.220024,-0.091723,0.021292,0.635557,-0.670434,0.151812,1.089513,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.356221,1.177136,-0.973373,-0.521110,-0.647451,-1.170223,-0.676681,-0.720720,-0.714800,-0.666545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.741547,-0.732997,-0.489072,-0.725046,-0.752259,-0.396512,0.065019,-0.695577,2.095124,-0.368908,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.659526,0.060856,0.010767,-1.016028,-0.798570,-0.902003,-0.676681,-0.720720,-0.714800,-0.666545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46058,0.516119,-0.034406,0.583122,0.391628,-0.103910,0.508729,0.435869,-0.419006,-0.609756,0.821640,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46059,1.768815,1.949006,0.210185,1.814207,1.819202,0.225036,0.350288,0.008421,-0.058275,-0.041508,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46060,-1.218765,-1.226408,-1.076967,-1.219964,-1.225116,-1.175381,-0.676681,-0.720720,-0.609756,-0.666545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46061,-0.110229,0.446791,-1.035529,-0.123186,0.666310,-0.984532,-0.305831,-0.670434,-0.688539,-0.577254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Read the training features back from the CSV written earlier in this
# notebook, so the file on disk can be inspected (the next cell checks it for
# missing values).
new = pd.read_csv("../data/training/X_train.csv")

In [40]:
# Count missing values per column in the reloaded frame.
new.isna().sum()

traffic_volume_0_period_0    0
traffic_volume_1_period_0    0
traffic_volume_2_period_0    0
traffic_volume_0_period_1    0
traffic_volume_1_period_1    0
                            ..
hour_19                      0
hour_20                      0
hour_21                      0
hour_22                      0
hour_23                      0
Length: 99, dtype: int64