# Preprocessing RoadTrafficAccidents

In [63]:
import pandas as pd
import os
import datetime
import math

Getting the full path to the dataset folder

In [64]:
dir_path = os.path.dirname(os.path.realpath("__file__"))
path_to_datasets_folder = os.path.join(dir_path, "datasets")
path_to_accidents_file = os.path.join(path_to_datasets_folder, "RoadTrafficAccidentLocations.csv")

Reading the TrafficAccident file and remove all language duplicates

In [65]:
df_accidents = pd.read_csv(path_to_accidents_file)
df_accidents = df_accidents.drop(columns=['AccidentType_de', 'AccidentType_fr', 'AccidentType_it'])
df_accidents = df_accidents.drop(columns=['AccidentSeverityCategory_de', 'AccidentSeverityCategory_fr', 'AccidentSeverityCategory_it'])
df_accidents = df_accidents.drop(columns=['RoadType_de', 'RoadType_fr', 'RoadType_it'])
df_accidents = df_accidents.drop(columns=['AccidentMonth_de', 'AccidentMonth_fr','AccidentMonth_it'])
df_accidents = df_accidents.drop(columns=['AccidentWeekDay_de', 'AccidentWeekDay_fr', 'AccidentWeekDay_it'])
df_accidents.head()

Unnamed: 0,AccidentUID,AccidentType,AccidentType_en,AccidentSeverityCategory,AccidentSeverityCategory_en,AccidentInvolvingPedestrian,AccidentInvolvingBicycle,AccidentInvolvingMotorcycle,RoadType,RoadType_en,...,AccidentLocation_CHLV95_N,CantonCode,MunicipalityCode,AccidentYear,AccidentMonth,AccidentMonth_en,AccidentWeekDay,AccidentWeekDay_en,AccidentHour,AccidentHour_text
0,A2D2677533867004E0430A865E337004,at0,Accident with skidding or self-accident,as4,Accident with property damage,False,False,False,rt433,Minor road,...,1245194,ZH,261,2011,1,January,aw406,Saturday,0.0,00h-01h
1,9FD6441F802C20A6E0430A865E3320A6,at0,Accident with skidding or self-accident,as3,Accident with light injuries,False,True,False,rt433,Minor road,...,1246980,ZH,261,2011,1,January,aw406,Saturday,1.0,01h-02h
2,9FDA0DC4856A6094E0430A865E336094,at0,Accident with skidding or self-accident,as4,Accident with property damage,False,False,False,rt439,Other,...,1247749,ZH,261,2011,1,January,aw406,Saturday,2.0,02h-03h
3,A3B66E42396E6000E0430A865E336000,at5,Accident when crossing the lane(s),as3,Accident with light injuries,False,False,False,rt433,Minor road,...,1247102,ZH,261,2011,1,January,aw406,Saturday,2.0,02h-03h
4,9FDA0DBE8CCE9096E0430A865E339096,at0,Accident with skidding or self-accident,as4,Accident with property damage,False,False,False,rt433,Minor road,...,1250690,ZH,261,2011,1,January,aw406,Saturday,3.0,03h-04h


create the correct datetime for each row and add 'nan' as value in the date row if the accident hour is missing (rows cases)

In [73]:
first_day = 0
day_multiplier = 0
start = 0
prev_days = ('aw402', 'aw403', 'aw404', 'aw405', 'aw406', 'aw407')

for i, row in df_accidents.iterrows():
    
    # Set the reducer variable according to which day the current month starts with
    # This is done so that in the if, elif loops below to get the correct day for
    # every month
    
    reducer = datetime.date(row['AccidentYear'], row['AccidentMonth'], 1).weekday()
    
    # increase the day multiplier if the a "higher" Weekday is followed by a "lower" weekday
    # example Saturday is followed by a thursday, in this case it is inferred that the next 
    # week has started
    
    if start > 0:
        if int(df_accidents.at[i-1, 'AccidentWeekDay'][-1]) > int(df_accidents.at[i, 'AccidentWeekDay'][-1]):
            day_multiplier += 1
            
        # reset the day_multiplier if a new month starts
        
        if df_accidents.at[i-1, 'AccidentMonth'] != df_accidents.at[i, 'AccidentMonth']:
            day_multiplier = 0
        
        
    
    if row['AccidentWeekDay'] == 'aw401':
        df_accidents.at[i, 'day'] = 1 + 7 * day_multiplier - reducer
    elif row['AccidentWeekDay'] == 'aw402':
        df_accidents.at[i, 'day'] = 2 + 7 * day_multiplier - reducer
    elif row['AccidentWeekDay'] == 'aw403':
        df_accidents.at[i, 'day'] = 3 + 7 * day_multiplier - reducer
    elif row['AccidentWeekDay'] == 'aw404':
        df_accidents.at[i, 'day'] = 4 + 7 * day_multiplier - reducer
    elif row['AccidentWeekDay'] == 'aw405':
        df_accidents.at[i, 'day'] = 5 + 7 * day_multiplier - reducer
    elif row['AccidentWeekDay'] == 'aw406':
        df_accidents.at[i, 'day'] = 6 + 7 * day_multiplier - reducer
    elif row['AccidentWeekDay'] == 'aw407':
        df_accidents.at[i, 'day'] = 7 + 7 * day_multiplier - reducer
    
    # safe null values in Accident hour as 'nan' in date column
    if math.isnan(df_accidents.at[i, 'AccidentHour']):
        accident_date = df_accidents.at[i, 'AccidentHour']
        print(df_accidents.at[i, 'AccidentHour'])
        
    # convert to datetime format all time with :30
    else:
        accident_date = datetime.datetime(row['AccidentYear'], row['AccidentMonth'], int(df_accidents.at[i, 'day']), int(df_accidents.at[i, 'AccidentHour']), 30)

    df_accidents.at[i, 'date'] = accident_date
    
    start = 1

nan
nan
nan
nan


In [76]:
df_accidents

Unnamed: 0,AccidentUID,AccidentType,AccidentType_en,AccidentSeverityCategory,AccidentSeverityCategory_en,AccidentInvolvingPedestrian,AccidentInvolvingBicycle,AccidentInvolvingMotorcycle,RoadType,RoadType_en,...,MunicipalityCode,AccidentYear,AccidentMonth,AccidentMonth_en,AccidentWeekDay,AccidentWeekDay_en,AccidentHour,AccidentHour_text,day,date
0,A2D2677533867004E0430A865E337004,at0,Accident with skidding or self-accident,as4,Accident with property damage,False,False,False,rt433,Minor road,...,261,2011,1,January,aw406,Saturday,0.0,00h-01h,1.0,2011-01-01 00:30:00
1,9FD6441F802C20A6E0430A865E3320A6,at0,Accident with skidding or self-accident,as3,Accident with light injuries,False,True,False,rt433,Minor road,...,261,2011,1,January,aw406,Saturday,1.0,01h-02h,1.0,2011-01-01 01:30:00
2,9FDA0DC4856A6094E0430A865E336094,at0,Accident with skidding or self-accident,as4,Accident with property damage,False,False,False,rt439,Other,...,261,2011,1,January,aw406,Saturday,2.0,02h-03h,1.0,2011-01-01 02:30:00
3,A3B66E42396E6000E0430A865E336000,at5,Accident when crossing the lane(s),as3,Accident with light injuries,False,False,False,rt433,Minor road,...,261,2011,1,January,aw406,Saturday,2.0,02h-03h,1.0,2011-01-01 02:30:00
4,9FDA0DBE8CCE9096E0430A865E339096,at0,Accident with skidding or self-accident,as4,Accident with property damage,False,False,False,rt433,Minor road,...,261,2011,1,January,aw406,Saturday,3.0,03h-04h,1.0,2011-01-01 03:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48016,B80D6976188D39D8E053DDB9D80AD71B,at2,Accident with rear-end collision,as4,Accident with property damage,False,True,False,rt433,Minor road,...,261,2020,12,December,aw404,Thursday,15.0,15h-16h,31.0,2020-12-31 15:30:00
48017,B80D6976180239D8E053DDB9D80AD71B,at1,Accident when overtaking or changing lanes,as4,Accident with property damage,False,False,False,rt433,Minor road,...,261,2020,12,December,aw404,Thursday,17.0,17h-18h,31.0,2020-12-31 17:30:00
48018,B849F1BB0E86D521E053DDB9D80A2BED,at7,Accident when parking,as4,Accident with property damage,False,False,False,rt433,Minor road,...,261,2020,12,December,aw404,Thursday,18.0,18h-19h,31.0,2020-12-31 18:30:00
48019,B89A7A16E9C675AEE053DDB9D80A0D5A,at0,Accident with skidding or self-accident,as4,Accident with property damage,False,False,False,rt430,Motorway,...,261,2020,12,December,aw404,Thursday,19.0,19h-20h,31.0,2020-12-31 19:30:00
