In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
import datetime
import math

ROOT_DIR = '../'
sys.path.insert(1, '../production_code/')
from constants import *

In [2]:
df = pd.read_csv(ROOT_DIR + FULLY_CLEANED_DATA_DIR)

# fixing date type
df.loc[:,'date'] = pd.to_datetime(df.loc[:,'date'])

# adding pure date stamp
df.loc[:,'date_stamp'] =  pd.to_datetime(df.loc[:,'date'].dt.date)


day_dict = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"}
hour_bin_dict = {0: 'Night', 1: 'Morning', 2:'Afternoon',3:'Evening'}

print(df.shape)
df.head(3)


(68903, 11)


Unnamed: 0.1,Unnamed: 0,ACCIDENT_NO,date,day,hour_bin,region,lga,sky,police_needed,ambulance_needed,date_stamp
0,0,T20150000056,2015-02-01 03:00:00,Sunday,Night,EASTERN REGION,BAW BAW,Clear,1,True,2015-02-01
1,1,T20150000060,2015-02-01 07:30:00,Sunday,Morning,WESTERN REGION,HEPBURN,Clear,1,True,2015-02-01
2,2,T20150000063,2015-02-01 08:30:00,Sunday,Morning,SOUTH WESTERN REGION,SOUTHERN GRAMPIANS,Clear,1,True,2015-02-01


# Main Issues

### initial pivot

after completing an intial pivot of the data, there are two main issues

1. there is no larger date grouping, so if we have 6 years of data, the calculated needed will be double that of 3 years
2. Rainy days dont apear as often as clear days, so currently, looks like rainy days are less dangerous
3. some less populated regions have very little need
4. 


In [18]:
# testing how it pivots
(df.pivot_table(
    index = ['day','hour_bin','region','sky'],
    values = ['police_needed', 'ambulance_needed'],
    aggfunc = 'sum'
))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ambulance_needed,police_needed
day,hour_bin,region,sky,Unnamed: 4_level_1,Unnamed: 5_level_1
Friday,Afternoon,EASTERN REGION,Clear,211,176
Friday,Afternoon,EASTERN REGION,Not clear,39,33
Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Clear,1296,1044
Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Not clear,155,126
Friday,Afternoon,METROPOLITAN SOUTH EAST REGION,Clear,1473,1201
...,...,...,...,...,...
Wednesday,Night,NORTHERN REGION,Not clear,6,6
Wednesday,Night,SOUTH WESTERN REGION,Clear,41,38
Wednesday,Night,SOUTH WESTERN REGION,Not clear,8,6
Wednesday,Night,WESTERN REGION,Clear,23,23


In [14]:
# exploring the values of crashes at one friday afternoon with clear skys in the south eastern metro region
df.query('hour_bin == "Afternoon" & day == "Friday" & sky == "Clear" & region == "METROPOLITAN SOUTH EAST REGION"').groupby('date_stamp').sum()

Unnamed: 0_level_0,Unnamed: 0,police_needed,ambulance_needed,scaled_police,scaled_ambulance
date_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-02,16453,3,4,3.356125,4.474834
2015-01-09,63162,6,7,6.712251,7.830959
2015-01-16,2579,4,5,4.474834,5.593542
2015-01-23,6068,5,6,5.593542,6.712251
2015-01-30,16435,6,7,6.712251,7.830959
...,...,...,...,...,...
2020-09-18,137636,2,2,2.237417,2.237417
2020-10-02,323773,5,5,5.593542,5.593542
2020-10-09,206109,3,3,3.356125,3.356125
2020-10-16,137727,2,2,2.237417,2.237417



### Issue 1. scaling by time span

The largest time scale is the 7 days of a week. Therefore if we had 1 year of data, there would be 52 potential weeks that would be combined into the one table. and for 3 years, 156

Therefore, we simply need to divide everything by the number of weeks in the data set

In [5]:
print(pd.to_datetime(df['date']).max())
print(pd.to_datetime(df['date']).min())

# small function to find num of weeks
def calc_num_weeks_between(min, max):
    return math.ceil(
        (max - min).days / 7
    )
    
num_of_weeks = calc_num_weeks_between(df['date'].min(), df['date'].max())

# then final result must be divided by this
num_of_weeks


2020-12-10 17:45:00
2015-01-02 00:04:00


310

### Issue 2: need to scale weather conditions

If we assume that the dataset is vast enough, for each date, there would be at least one crash, with the weather reported. Therefore, by looking at the number of times it is either clear or not, for each hourly bin (removing duplicates from the smallest time group to ignore multiple crashes). We can get a rough distribution of the percentage of time it is clear, grouped by hourly bin for some extra granuality

In [20]:
# calculating count of times its not clear per hourly bin
time_dist_rain = df[df['sky'] == 'Not clear']\
        .drop_duplicates(   # removing duplicates in smallest time group
            subset = ['date_stamp']
            )\
        .groupby(    # grouping by hourly_bin
            ['hour_bin']
            )['ACCIDENT_NO']\
        .count() 


# repeats the
#  same for clear weather
time_dist_clear = df[df['sky'] == 'Clear'].drop_duplicates(subset = ['date_stamp','hour_bin']).groupby(['hour_bin'])['ACCIDENT_NO'].count()

# calculating percentage clear
pr_clear = time_dist_clear / (time_dist_rain + time_dist_clear)
pr_clear

hour_bin
Afternoon    0.893888
Evening      0.922148
Morning      0.780909
Night        0.800304
Name: ACCIDENT_NO, dtype: float64

In [7]:
# need to use these weights to scale
def scale_need(row, clear_weights):
    
    # if its clear
    if row['sky'] == 'Clear':

        # scales based on clear weights
        return row[0] * 1/(pr_clear[row['hour_bin']])
    # else its raining
    else:

        # scales based on not clear weights
        return row[0] * 1/(1 - pr_clear[row['hour_bin']])

# uses above function to scale need
df.loc[:,'scaled_police'] = df.loc[:,['police_needed','sky','region','hour_bin']].apply(scale_need, axis = 1, clear_weights = pr_clear)
df.loc[:,'scaled_ambulance'] = df.loc[:,['ambulance_needed','sky','region','hour_bin']].apply(scale_need, axis = 1, clear_weights = pr_clear)

### Issue 3: low counts in regional areas

No scaling makes sense here, the counts are low because the number of crashes are low.

# final pivotted data

looking at the data now, it looks really good, the number needed for non clear days is slightly high than clear days, more ambulances are needed in metro areas. And no time is left without an ambulance. time to generate a training and tesing data set

In [8]:
# testing how it pivots
final = (df.pivot_table(
    index = ['day','hour_bin','region','sky'],
    values = ['scaled_police', 'scaled_ambulance'],
    aggfunc = 'sum'
) / (num_of_weeks))

final

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,scaled_ambulance,scaled_police
day,hour_bin,region,sky,Unnamed: 4_level_1,Unnamed: 5_level_1
Friday,Afternoon,EASTERN REGION,Clear,0.761443,0.635138
Friday,Afternoon,EASTERN REGION,Not clear,1.185600,1.003200
Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Clear,4.676923,3.767521
Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Not clear,4.712000,3.830400
Friday,Afternoon,METROPOLITAN SOUTH EAST REGION,Clear,5.315670,4.334093
...,...,...,...,...,...
Wednesday,Night,NORTHERN REGION,Not clear,0.096922,0.096922
Wednesday,Night,SOUTH WESTERN REGION,Clear,0.165260,0.153168
Wednesday,Night,SOUTH WESTERN REGION,Not clear,0.129229,0.096922
Wednesday,Night,WESTERN REGION,Clear,0.092707,0.092707


In [9]:
def display_final_df(df_final, query = 'sky == "Clear"'):
    df_temp = df_final.reset_index().query(query).pivot_table(
        index = 'day',
        columns = 'region',
        values = 'scaled_ambulance',
        aggfunc = 'mean',
    ).T


    # create a heatmap
    fig = px.imshow(
        df_temp, 
        x=list(day_dict.values()), 
        y=list(df_temp.index), 
        labels={
                "x": "Day of the Week",
                "y": "region",
                "color": "% chance of needing an ambulace"
            }, 
        title="Average Chance of an Ambulance, sky is clear"
        )

    # add annotations
    fig.update_traces(hoverinfo='text', text=df_temp.values, texttemplate='%{text:.2f}')
    # show the figure
    fig.show()
    return fig

fig = display_final_df(final)


# creating train test split

In [16]:

min_date = pd.to_datetime(TRAIN_SPLIT_MIN_DATE)
split_date = pd.to_datetime(TEST_TRAIN_SPLIT_DATE)
max_date = pd.to_datetime(TEST_SPLIT_MAX_DATE)


# testing how it pivots
df_train = df\
    .query('date >= @min_date & date < @split_date')\
    .pivot_table(
        index = ['day','hour_bin','region','sky'],
        values = ['scaled_police', 'scaled_ambulance'],
        aggfunc = 'sum'
    ) / calc_num_weeks_between(min_date, split_date)



# testing how it pivots
df_test = df\
    .query('date >= @split_date & date < @max_date')\
    .pivot_table(
        index = ['day','hour_bin','region','sky'],
        values = ['scaled_police', 'scaled_ambulance'],
        aggfunc = 'sum'
    ) / calc_num_weeks_between(split_date, max_date)


fig = display_final_df(df_train)
fig = display_final_df(df_test)

In [11]:
# exploring the values of crashes at one friday afternoon with rainy skys in the south eastern metro region
df.query('hour_bin == "Afternoon" & day == "Friday" & sky == "Not clear" & region == "METROPOLITAN SOUTH EAST REGION"').groupby('date_stamp').sum()

Unnamed: 0_level_0,Unnamed: 0,police_needed,ambulance_needed,scaled_police,scaled_ambulance
date_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-02-13,3122,1,2,9.424,18.848
2015-04-17,48478,9,11,84.816,103.664
2015-05-08,24117,3,3,28.272,28.272
2015-06-05,37612,7,8,65.968,75.392
2015-07-17,15142,1,2,9.424,18.848
...,...,...,...,...,...
2019-11-08,58300,1,1,9.424,9.424
2019-12-06,225819,4,4,37.696,37.696
2019-12-20,189572,2,3,18.848,28.272
2020-02-14,521668,5,8,47.120,75.392


In [12]:
# exploring the values of crashes at one friday afternoon with clear skys in the south eastern metro region
df.query('hour_bin == "Afternoon" & day == "Friday" & sky == "Clear" & region == "METROPOLITAN SOUTH EAST REGION"').groupby('date_stamp').sum()

Unnamed: 0_level_0,Unnamed: 0,police_needed,ambulance_needed,scaled_police,scaled_ambulance
date_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-02,16453,3,4,3.356125,4.474834
2015-01-09,63162,6,7,6.712251,7.830959
2015-01-16,2579,4,5,4.474834,5.593542
2015-01-23,6068,5,6,5.593542,6.712251
2015-01-30,16435,6,7,6.712251,7.830959
...,...,...,...,...,...
2020-09-18,137636,2,2,2.237417,2.237417
2020-10-02,323773,5,5,5.593542,5.593542
2020-10-09,206109,3,3,3.356125,3.356125
2020-10-16,137727,2,2,2.237417,2.237417


In [13]:
area_type = 'region' 
# area_type = 'lga' 




# testing how it pivots
df_train = df\
    .query('date >= @min_date & date < @split_date')\
    .pivot_table(
        index = ['day','hour_bin',area_type,'sky'],
        values = ['scaled_police', 'scaled_ambulance'],
        aggfunc = 'sum'
    ) / calc_num_weeks_between(min_date, split_date)



# testing how it pivots
df_test = df\
    .query('date >= @split_date & date < @max_date')\
    .pivot_table(
        index = ['day','hour_bin',area_type,'sky'],
        values = ['scaled_police', 'scaled_ambulance'],
        aggfunc = 'sum'
    ) / calc_num_weeks_between(split_date, max_date)


columns = {
    'hour_bin': 'Part of Day',
    'day': 'Day of the Week',
    'sky': 'Sky',
    'region': 'Region',
    'lga': 'LGA',
    'scaled_police':'Police',
    'scaled_ambulance': 'Ambulance',
    }

df_train = df_train.reset_index().rename(columns = columns)
df_test = df_test.reset_index().rename(columns = columns)

# saving data

df_train.to_csv(ROOT_DIR + ('_' + area_type +'.').join(TRAINING_DATA_DIR.split('.')), index= False)
df_test.to_csv(ROOT_DIR + ('_' + area_type +'.').join(TESTING_DATA_DIR.split('.')), index= False)

df_train.head(20)

Unnamed: 0,Day of the Week,Part of Day,Region,Sky,Ambulance,Police
0,Friday,Afternoon,EASTERN REGION,Clear,0.79806,0.655549
1,Friday,Afternoon,EASTERN REGION,Not clear,1.500637,1.20051
2,Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Clear,5.116132,4.08293
3,Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Not clear,6.662828,5.222217
4,Friday,Afternoon,METROPOLITAN SOUTH EAST REGION,Clear,5.978321,4.638721
5,Friday,Afternoon,METROPOLITAN SOUTH EAST REGION,Not clear,9.303949,7.683261
6,Friday,Afternoon,NORTH EASTERN REGION,Clear,1.118708,0.819436
7,Friday,Afternoon,NORTH EASTERN REGION,Not clear,0.780331,0.66028
8,Friday,Afternoon,NORTHERN REGION,Clear,0.954821,0.783808
9,Friday,Afternoon,NORTHERN REGION,Not clear,1.320561,1.020433
