In [9]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [10]:
# Load the merged accident table from the clean-data folder.
merged_csv = '../data/clean/merged.csv'
accidents = pd.read_csv(merged_csv)

In [11]:
# List the columns of the loaded frame before choosing features and target.
accidents.columns

Index(['AccidentType', 'AccidentSeverityCategory',
       'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
       'AccidentInvolvingMotorcycle', 'RoadType', 'x', 'y', 'year', 'month',
       'weekday', 'hour', 'dateTime', 'traffic_volume_0_period_0',
       'traffic_volume_1_period_0', 'traffic_volume_2_period_0',
       'traffic_volume_0_period_1', 'traffic_volume_1_period_1',
       'traffic_volume_2_period_1', 'pedestrian_volume_0_period_0',
       'pedestrian_volume_1_period_0', 'pedestrian_volume_2_period_0',
       'pedestrian_volume_0_period_1', 'pedestrian_volume_1_period_1',
       'pedestrian_volume_2_period_1', 'temperature_2m_period_0',
       'precipitation_period_0', 'snowfall_period_0', 'snow_depth_period_0',
       'surface_pressure_period_0', 'cloud_cover_period_0',
       'temperature_2m_period_1', 'precipitation_period_1',
       'snowfall_period_1', 'snow_depth_period_1', 'surface_pressure_period_1',
       'cloud_cover_period_1', 'temperature_2m_period_2'

In [12]:
# Columns excluded from the feature matrix. 'AccidentSeverityCategory' is the
# target, so it must not remain in X.
drop_features = ['x', 'y', 'year', 'dateTime', 'AccidentSeverityCategory']

# `columns=` already selects the axis, so no extra `axis` argument is needed.
X = accidents.drop(columns=drop_features)
y = accidents['AccidentSeverityCategory']

# Fixed seed so the split is reproducible from run to run.
rs = 1

# 80/20 split, stratified on the target so both splits keep the same class
# proportions. `train_test_split` is imported from sklearn.model_selection in
# the imports cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rs, stratify=y)

In [13]:
# Volume counters follow the pattern '<kind>_volume_<sensor>_period_<period>':
# three sensors over two periods, traffic first and then pedestrian.
volume_features = [
    f'{kind}_volume_{sensor}_period_{period}'
    for kind in ('traffic', 'pedestrian')
    for period in range(2)
    for sensor in range(3)
]

# Weather measurements follow '<measure>_period_<period>' over four periods.
weather_measures = (
    'temperature_2m',
    'precipitation',
    'snowfall',
    'snow_depth',
    'surface_pressure',
    'cloud_cover',
)
weather_features = [
    f'{measure}_period_{period}'
    for period in range(4)
    for measure in weather_measures
]

# Columns that are scaled (order: volumes, then weather grouped by period).
numerical_features = volume_features + weather_features

# Columns that are one-hot encoded.
categorical_features = [
    'AccidentType',
    'AccidentInvolvingPedestrian',
    'AccidentInvolvingBicycle',
    'AccidentInvolvingMotorcycle',
    'RoadType',
    'month',
    'weekday',
    'hour',
]

In [14]:
# Numerical columns: standardise only. There is no imputation step, so
# missing values are still NaN after scaling.
# NOTE(review): SimpleImputer is imported but unused; confirm that the
# downstream models are meant to receive NaN values.
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),                          # Scale features
])

# Categorical columns: one-hot encode. Categories not seen during fit are
# encoded as all zeros instead of raising an error.
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),    # Encode categorical variables
])

# sparse_threshold=0 makes the combined output always a dense array. With the
# default threshold (0.3) the result silently becomes a scipy sparse matrix
# once the one-hot block dominates, which breaks the later
# pd.DataFrame(..., columns=...) wrapping.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    sparse_threshold=0,
)

In [17]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transform to the test split.
# NOTE: this rebinds X_train/X_test to the transformed output, so re-running
# the cell would feed already-transformed data back into the preprocessor;
# re-run the train/test split cell first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [22]:
# Recover the generated one-hot column names from the fitted encoder that
# lives inside the 'cat' branch of the preprocessor.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_onehot_features = onehot_encoder.get_feature_names_out(categorical_features)

# Output columns follow the transformer order: numerical first, then one-hot.
processed_features = [*numerical_features, *cat_onehot_features]

# Wrap the transformed arrays in labelled DataFrames.
X_train = pd.DataFrame(data=X_train, columns=processed_features)
X_test = pd.DataFrame(data=X_test, columns=processed_features)

In [23]:
# Preview the processed training features (scaled numerical columns followed
# by the one-hot columns).
X_train

Unnamed: 0,traffic_volume_0_period_0,traffic_volume_1_period_0,traffic_volume_2_period_0,traffic_volume_0_period_1,traffic_volume_1_period_1,traffic_volume_2_period_1,pedestrian_volume_0_period_0,pedestrian_volume_1_period_0,pedestrian_volume_2_period_0,pedestrian_volume_0_period_1,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,-0.343217,-0.259156,-0.527950,-0.110380,0.170775,-0.081952,0.076391,0.881698,0.106843,0.400832,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.409218,-0.141909,0.324113,0.219947,-0.091804,0.021212,0.551258,2.161693,1.402487,0.546245,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.356202,1.177116,-0.973406,-0.521198,-0.647542,-1.170323,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.741578,-0.733028,-0.489102,-0.725137,-0.752352,-0.396599,-0.119143,-0.147863,-0.005822,-0.442565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.659556,0.060830,0.010740,-1.016124,-0.798663,-0.902099,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46058,0.516096,-0.034433,0.583099,0.391555,-0.103991,0.508658,0.690925,0.353005,0.726499,0.371749,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46059,1.768798,1.948990,0.210159,1.814156,1.819152,0.224959,-0.230876,0.269527,0.106843,0.051840,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46060,-1.218798,-1.226441,-1.077000,-1.220063,-1.225216,-1.175481,-0.733677,-0.732209,-0.738143,-0.733392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46061,-0.110256,0.446767,-1.035562,-0.123268,0.666241,-0.984630,-0.147076,-0.453949,-0.484647,-0.326234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Binarise the target: 0 for 'Unfall mit Sachschaden' (German: "accident with
# property damage"), 1 for every other severity category.
# The labels are re-derived from the untouched `y` through each split's index,
# which makes the cell idempotent. Comparing the already-binarised series with
# the string a second time would silently turn every label into 1.
# Assumes the index of `accidents` is unique (true for the default RangeIndex
# that read_csv produces).
property_damage_only = 'Unfall mit Sachschaden'
y_train = (y.loc[y_train.index] != property_damage_only).astype(int)
y_test = (y.loc[y_test.index] != property_damage_only).astype(int)


In [25]:
# Persist the processed splits. The index is not written, so the X_* and y_*
# files correspond to each other by row position.
output_dir = Path("../data/training")
# Create the target folder if it is missing so a fresh checkout does not fail
# on the very last cell.
output_dir.mkdir(parents=True, exist_ok=True)

X_test.to_csv(output_dir / "X_test.csv", index=False)
X_train.to_csv(output_dir / "X_train.csv", index=False)
y_test.to_csv(output_dir / "y_test.csv", index=False)
y_train.to_csv(output_dir / "y_train.csv", index=False)