In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load the accidents CSV and print a per-column summary (non-null counts and dtypes).
accidents = pd.read_csv(filepath_or_buffer='../data/clean/accidents_clean.csv')
accidents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61058 entries, 0 to 61057
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   AccidentSeverityCategory_de  61058 non-null  object
 1   AccidentInvolvingPedestrian  61058 non-null  bool  
 2   AccidentInvolvingBicycle     61058 non-null  bool  
 3   AccidentInvolvingMotorcycle  61058 non-null  bool  
 4   RoadType_de                  61058 non-null  object
 5   AccidentYear                 61058 non-null  int64 
 6   AccidentMonth_de             61058 non-null  object
 7   AccidentWeekDay_de           61058 non-null  object
 8   AccidentHour                 61058 non-null  int64 
dtypes: bool(3), int64(2), object(4)
memory usage: 3.0+ MB


In [5]:
# Separate the target label from the feature columns.
y = accidents['AccidentSeverityCategory_de']
X = accidents.drop(columns=['AccidentSeverityCategory_de'])

# Seed for the split, so that the partition is reproducible.
rs = 1

# Hold out 20% of the rows as test data (80% training); stratify on the label
# so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rs,
    stratify=y,
)

In [6]:
# One-hot encode the categorical features. Train and test are encoded
# separately, so a category that is absent from one split yields a different
# set of dummy columns; the test frame is therefore aligned to the training
# frame below.
dummy_variables = ["RoadType_de", "AccidentMonth_de", "AccidentWeekDay_de", "AccidentHour"]
X_train = pd.get_dummies(X_train, columns=dummy_variables)  # TODO: decide whether drop_first=True is wanted
X_test = pd.get_dummies(X_test, columns=dummy_variables)

train_columns = X_train.columns
test_columns = X_test.columns

# Dummy columns that exist in only one of the two splits (kept so they can
# still be inspected after the alignment).
missing_columns = set(train_columns) - set(test_columns)
extra_columns = set(test_columns) - set(train_columns)

# Align the test set with the training columns in one step: columns missing
# from the test set are added and filled with False, columns unknown to the
# training set are dropped, and the column order follows the training set.
# (The previous loop dropped the whole `extra_columns` set on every iteration
# and raised a KeyError as soon as there was more than one extra column.)
X_test = X_test.reindex(columns=train_columns, fill_value=False)

# DataFrame.info() prints by itself and returns None, so it is called directly
# instead of being wrapped in print(), which would emit a stray "None".
print("Trainingsdaten:")
X_train.info()
print("\nTestdaten:")
X_test.info()

Trainingsdaten:
<class 'pandas.core.frame.DataFrame'>
Index: 48846 entries, 42995 to 21725
Data columns (total 50 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   AccidentInvolvingPedestrian    48846 non-null  bool 
 1   AccidentInvolvingBicycle       48846 non-null  bool 
 2   AccidentInvolvingMotorcycle    48846 non-null  bool 
 3   AccidentYear                   48846 non-null  int64
 4   RoadType_de_Hauptstrasse       48846 non-null  bool 
 5   RoadType_de_Nebenstrasse       48846 non-null  bool 
 6   RoadType_de_andere             48846 non-null  bool 
 7   AccidentMonth_de_April         48846 non-null  bool 
 8   AccidentMonth_de_August        48846 non-null  bool 
 9   AccidentMonth_de_Dezember      48846 non-null  bool 
 10  AccidentMonth_de_Februar       48846 non-null  bool 
 11  AccidentMonth_de_Januar        48846 non-null  bool 
 12  AccidentMonth_de_Juli          48846 non-null  bool 
 13  A

In [7]:
# Show how many test rows fall into each severity category.
y_test.value_counts()

AccidentSeverityCategory_de
Unfall mit Sachschaden         9086
Unfall mit Leichtverletzten    2601
Unfall mit Schwerverletzten     508
Unfall mit Getöteten             17
Name: count, dtype: int64

In [8]:
# Collapse the severity categories into a binary target:
# 0 = 'Unfall mit Sachschaden' (property damage only), 1 = every other category
# (accidents with injured or killed persons).
# The labels are re-derived from the untouched `y` via the split indices, so
# re-running this cell yields the same result. Binarising y_train/y_test in
# place would, on a second run, compare the integers with the string and
# silently turn every label into 1.
y_train = (y.loc[y_train.index] != 'Unfall mit Sachschaden').astype(int)
y_test = (y.loc[y_test.index] != 'Unfall mit Sachschaden').astype(int)


In [9]:
# Write the four splits to CSV, omitting the index column from each file.
for dataset, destination in (
    (X_test, "../data/training/X_test.csv"),
    (X_train, "../data/training/X_train.csv"),
    (y_test, "../data/training/y_test.csv"),
    (y_train, "../data/training/y_train.csv"),
):
    dataset.to_csv(destination, index=False)

In [10]:
# Display the one-hot encoded training features.
X_train

Unnamed: 0,AccidentInvolvingPedestrian,AccidentInvolvingBicycle,AccidentInvolvingMotorcycle,AccidentYear,RoadType_de_Hauptstrasse,RoadType_de_Nebenstrasse,RoadType_de_andere,AccidentMonth_de_April,AccidentMonth_de_August,AccidentMonth_de_Dezember,...,AccidentHour_14,AccidentHour_15,AccidentHour_16,AccidentHour_17,AccidentHour_18,AccidentHour_19,AccidentHour_20,AccidentHour_21,AccidentHour_22,AccidentHour_23
42995,False,False,False,2023,True,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
57034,False,False,False,2023,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
22423,False,False,False,2021,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
34126,False,False,False,2019,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6628,False,False,False,2015,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16812,False,False,True,2020,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
8648,False,False,False,2016,True,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
24962,False,False,False,2023,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
34866,False,False,False,2022,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
