In [1]:
import pandas as pd
import matplotlib.pyplot as plt
#aprox. time 00:

accidents = pd.read_csv('data/edited_file.csv', index_col='ID')
accidents.dtypes

Unnamed: 0                 int64
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Temperature(F)           float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Weather_Condition         object
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way                    bool
Junction                    bool
No_Exit                     bool
Railway                     bool
Roundabout

In [2]:
# Start_Time and End_Time are objects, so we will convert them to DateTime-Objects

accidents['Start_Time'] = pd.to_datetime(accidents['Start_Time'])
accidents['End_Time'] = pd.to_datetime(accidents['End_Time'])

# after that we are able to extract hour, day, month and year of each date

accidents['Hour'] = accidents['Start_Time'].dt.hour
accidents['Day'] = accidents['Start_Time'].dt.day
accidents['Month'] = accidents['Start_Time'].dt.month
accidents['Year'] = accidents['Start_Time'].dt.year

# we will just clean the other datatypes here:
accidents['Weather_Timestamp'] = pd.to_datetime(accidents['Weather_Timestamp'])

# we will see the new added columns (extracted partss of the start date time)
accidents.head()

KeyError: 'Weather_Timestamp'

In [None]:
nan_count = accidents['Temperature(F)'].isna().sum()

print(f"Anzahl der NaN-Werte in der 'Temperature(F)'-Spalte: {nan_count}")

In [None]:
# Example for Binning

#dropping al NaN (Temperature) rows
accidents_cleaned = accidents.dropna(subset=['Temperature(F)'])

#setting edges by ourself, because otherwise all tempperatures will result in 'moderate' when we let it automatically decide
bin_edges = [float('-inf'), 32, 50, 65, 80, float('inf')]
bin_labels = ['Very Cold', 'Cold', 'Moderate', 'Warm', 'Hot']
# Setting bins to 5, could also set 3 (cold, moderate, hot)
accidents_cleaned['Temperature_Bin'] = pd.cut(accidents_cleaned['Temperature(F)'], bins=bin_edges, labels=bin_labels)

accidents_cleaned

In [None]:
#One-Hot-Encoding
#Encoding all Weather Conditions

accidents_cleaned = pd.get_dummies(accidents_cleaned, columns=['Weather_Condition'], drop_first=True, prefix='Weather')
accidents_cleaned

In [None]:
#Creating time series

# Konvertiere 'Start_Time' in ein DateTime-Objekt
accidents_cleaned['Start_Time'] = pd.to_datetime(accidents_cleaned['Start_Time'])

# Setze 'Start_Time' als Index für die Time Series
accidents_cleaned.set_index('Start_Time', inplace=True)

In [None]:
# Calculate daily accidents
daily_accidents = accidents_cleaned.resample('D').size()

# daily accidents graph
plt.figure(figsize=(12, 6))
daily_accidents.plot(title='Tägliche Unfallanzahl', xlabel='Datum', ylabel='Anzahl')
plt.show()