In [1]:
import pandas as pd
import os
import datetime

In [2]:
# Resolve the data locations in an OS-independent way.
# "__file__" is a plain string here (not the module attribute), so realpath()
# resolves it against the current working directory and the directory part of
# the result is that working directory.
dir_path, _ = os.path.split(os.path.realpath("__file__"))
path_to_datasets_folder = os.path.join(dir_path, "datasets")
# path_to_weather_file = os.path.join(path_to_datasets_folder, "messwerte_mythenquai_2007-2020.csv")
path_to_accidents_file = os.path.join(dir_path, "datasets", "RoadTrafficAccidentLocations.csv")

In [3]:
# Read the accident CSV into a DataFrame (pandas default parsing options).
df_accidents = pd.read_csv(filepath_or_buffer=path_to_accidents_file)

## Basic preprocessing

In [4]:
# Keep only the accidents whose year, month, weekday and hour are all present.
time_columns = ['AccidentYear', 'AccidentMonth', 'AccidentWeekDay', 'AccidentHour']

accidents_before = len(df_accidents)

# Drop every row with a missing value in any time column, then renumber the
# rows so the index is continuous again.
df_accidents = df_accidents.dropna(subset=time_columns).reset_index(drop=True)

accidents_after = len(df_accidents)
print("Removed {} accidents that did not provide time information".format(accidents_before - accidents_after))


Removed 4 accidents that did not provide time information


In [5]:
# Weekday codes used in the 'AccidentWeekDay' column, mapped to the weekday
# number used by the day arithmetic below. That arithmetic subtracts
# date.weekday() (Monday == 0), so it treats 'aw401' as Monday and 'aw407'
# as Sunday.
WEEKDAY_NUMBERS = {
    'aw401': 1,
    'aw402': 2,
    'aw403': 3,
    'aw404': 4,
    'aw405': 5,
    'aw406': 6,
    'aw407': 7,
}


def _infer_accident_dates(years, months, weekday_codes, hours):
    """Infer the day of the month and a timestamp for every accident.

    Only year, month, weekday and hour are read, so the day of the month is
    reconstructed from the order of the rows: whenever the weekday number
    drops from one row to the next, a new week is assumed to have started.
    This assumes the rows are in chronological order and that there is never
    a gap of a full week between two consecutive rows -- TODO confirm
    against the data source.

    Parameters:
        years, months, weekday_codes, hours: parallel iterables with one
            entry per accident, in row order. Years, months and hours must
            be whole numbers (int or float); weekday codes must be keys of
            WEEKDAY_NUMBERS.

    Returns:
        (days, dates): two lists with one entry per accident. ``days`` holds
        the inferred day of the month as int, ``dates`` the matching
        datetime.datetime with the minute set to 30.

    Raises:
        AssertionError: if an hour is not a whole number.
        ValueError: if a weekday code is unknown, or if the inferred day
            does not exist in the month (raised by datetime.datetime).
    """
    days = []
    dates = []

    day_multiplier = 0      # full weeks counted since the start of the current month
    prev_weekday = None     # weekday number of the previous row
    prev_year_month = None  # (year, month) of the previous row

    rows = zip(years, months, weekday_codes, hours)
    for position, (year, month, weekday_code, hour) in enumerate(rows):
        assert hour % 1 == 0, "hour format is not valid: {} is of type {} in row {}".format(hour, type(hour), position)
        year, month, hour = int(year), int(month), int(hour)

        if weekday_code not in WEEKDAY_NUMBERS:
            raise ValueError("unknown AccidentWeekDay code {!r} in row {}".format(weekday_code, position))
        weekday = WEEKDAY_NUMBERS[weekday_code]

        # Weekday of the first day of this month (Monday == 0). Subtracting it
        # shifts the weekday number so that the 1st of the month maps to day 1.
        reducer = datetime.date(year, month, 1).weekday()

        if prev_year_month != (year, month):
            # A new month (or the very first row) restarts the week count.
            day_multiplier = 0
        elif prev_weekday > weekday:
            # A "higher" weekday followed by a "lower" one, e.g. Saturday
            # followed by Thursday: the next week has started.
            day_multiplier += 1

        day = weekday + 7 * day_multiplier - reducer
        if day < 1:
            # The first accident seen in this month falls on a weekday that
            # comes before the weekday of the 1st, so its earliest possible
            # date lies in the following week. This can only happen while
            # day_multiplier is 0, so one step is always enough.
            day_multiplier += 1
            day += 7

        # NOTE(review): the minute is fixed at 30, presumably to place the
        # accident in the middle of its hour -- confirm.
        days.append(day)
        dates.append(datetime.datetime(year, month, day, hour, 30))

        prev_weekday = weekday
        prev_year_month = (year, month)

    return days, dates


accident_days, accident_dates = _infer_accident_dates(
    df_accidents['AccidentYear'],
    df_accidents['AccidentMonth'],
    df_accidents['AccidentWeekDay'],
    df_accidents['AccidentHour'],
)

# Assign both columns in one go instead of writing cell by cell while iterating.
# NOTE(review): 'day' is stored as float to match what the row-by-row .at
# assignment appears to have produced before -- switch to int once the
# consumers of the exported pickle/CSV are confirmed to accept it.
df_accidents['day'] = [float(day) for day in accident_days]
df_accidents['date'] = accident_dates

In [6]:
# Export the preprocessed accidents into the datasets folder.
# Both paths are built with os.path.join so they work on every OS; a
# hard-coded backslash separator only addresses the folder on Windows and
# becomes part of the file name elsewhere.

# for further processing with pandas
df_accidents.to_pickle(os.path.join(path_to_datasets_folder, "accidents.pkl"))

# for further processing with spark
# NOTE(review): the DataFrame index is written as an unnamed first column;
# left as is so the CSV layout stays the same for existing consumers.
df_accidents.to_csv(os.path.join(path_to_datasets_folder, "accidents.csv"))

In [7]:
# df_accidents.head()
# df_accidents[df_accidents["AccidentYear"]==2020]
# df_accidents[['AccidentUID', 'AccidentYear', 'AccidentMonth', 'AccidentWeekDay', 'AccidentWeekDay_de', 'AccidentHour_text', 'day', 'date', 'AccidentHour']]