In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
import datetime
import math

ROOT_DIR = '../'
sys.path.insert(1, '../production_code/')
from constants import *

# importing data

In [2]:
# importing data
df = pd.read_csv(ROOT_DIR + FULLY_CLEANED_DATA_DIR)

# fixing date type
df.loc[:,'date'] = pd.to_datetime(df.loc[:,'date'])

# adding pure date stamp
df.loc[:,'date_stamp'] =  pd.to_datetime(df.loc[:,'date'].dt.date)

day_dict = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"}
hour_bin_dict = {0: 'Night', 1: 'Morning', 2:'Afternoon',3:'Evening'}

# displaying data
print(df.shape)
df.head(3)


(70367, 11)


Unnamed: 0.1,Unnamed: 0,ACCIDENT_NO,date,day,hour_bin,region,lga,sky,police_needed,ambulance_needed,date_stamp
0,0,T20150000056,2015-02-01 03:00:00,Sunday,Night,EASTERN REGION,BAW BAW,Clear,1,True,2015-02-01
1,1,T20150000060,2015-02-01 07:30:00,Sunday,Morning,WESTERN REGION,HEPBURN,Clear,1,True,2015-02-01
2,2,T20150000063,2015-02-01 08:30:00,Sunday,Morning,SOUTH WESTERN REGION,SOUTHERN GRAMPIANS,Clear,1,True,2015-02-01


# Main Issues

### initial pivot

after completing an intial pivot of the data, there are two main issues

1. there is no larger date grouping, so if we have 6 years of data, the calculated needed will be double that of 3 years
2. Rainy days dont apear as often as clear days, so currently, looks like rainy days are less dangerous
3. some less populated regions have very little need


In [3]:
# testing how it pivots
(df.pivot_table(
    index = ['day','hour_bin','region','sky'],
    values = ['police_needed', 'ambulance_needed'],
    aggfunc = 'sum'
))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ambulance_needed,police_needed
day,hour_bin,region,sky,Unnamed: 4_level_1,Unnamed: 5_level_1
Friday,Afternoon,EASTERN REGION,Clear,218,178
Friday,Afternoon,EASTERN REGION,Not clear,39,33
Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Clear,1326,1050
Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Not clear,158,127
Friday,Afternoon,METROPOLITAN SOUTH EAST REGION,Clear,1501,1207
...,...,...,...,...,...
Wednesday,Night,NORTHERN REGION,Not clear,6,6
Wednesday,Night,SOUTH WESTERN REGION,Clear,42,38
Wednesday,Night,SOUTH WESTERN REGION,Not clear,8,6
Wednesday,Night,WESTERN REGION,Clear,23,23


In [4]:
# exploring the values of crashes at one friday afternoon with clear skys in the south eastern metro region
df.query('hour_bin == "Afternoon" & day == "Friday" & sky == "Clear" & region == "METROPOLITAN SOUTH EAST REGION"').groupby('date_stamp').sum()

Unnamed: 0_level_0,Unnamed: 0,police_needed,ambulance_needed
date_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-02,16727,3,4
2015-01-09,64278,6,7
2015-01-16,2621,4,5
2015-01-23,6144,5,6
2015-01-30,16711,6,7
...,...,...,...
2020-09-18,140561,2,2
2020-10-02,330458,5,5
2020-10-09,210492,3,3
2020-10-16,140655,2,2



### Issue 1. scaling by time span

The largest time scale is the 7 days of a week. Therefore if we had 1 year of data, there would be 52 potential weeks that would be combined into the one table. and for 3 years, 156

Therefore, we simply need to divide everything by the number of weeks in the data set

In [5]:
print(pd.to_datetime(df['date']).max())
print(pd.to_datetime(df['date']).min())

# small function to find num of weeks
def calc_num_weeks_between(min, max):
    return math.ceil(
        (max - min).days / 7
    )
    
num_of_weeks = calc_num_weeks_between(df['date'].min(), df['date'].max())

# then final result must be divided by this
num_of_weeks


2020-12-10 17:45:00
2015-01-02 00:04:00


310

### Issue 2: need to scale weather conditions

If we assume that the dataset is vast enough, for each date, there would be at least one crash, with the weather reported. Therefore, by looking at the number of times it is either clear or not, for each hourly bin (removing duplicates from the smallest time group to ignore multiple crashes). We can get a rough distribution of the percentage of time it is clear, grouped by hourly bin for some extra granuality

In [6]:
# calculating count of times its not clear per hourly bin
time_dist_rain = df[df['sky'] == 'Not clear']\
        .drop_duplicates(   # removing duplicates in smallest time group
            subset = ['date_stamp']
            )\
        .groupby(    # grouping by hourly_bin
            ['hour_bin']
            )['ACCIDENT_NO']\
        .count() 


# repeats the
#  same for clear weather
time_dist_clear = df[df['sky'] == 'Clear'].drop_duplicates(subset = ['date_stamp','hour_bin']).groupby(['hour_bin'])['ACCIDENT_NO'].count()

# calculating percentage clear
pr_clear = time_dist_clear / (time_dist_rain + time_dist_clear)
pr_clear

hour_bin
Afternoon    0.891617
Evening      0.921805
Morning      0.778569
Night        0.800400
Name: ACCIDENT_NO, dtype: float64

In [7]:
# need to use these weights to scale
def scale_need(row, clear_weights):
    
    # if its clear
    if row['sky'] == 'Clear':

        # scales based on clear weights
        return row[0] * 1/(pr_clear[row['hour_bin']])
    # else its raining
    else:

        # scales based on not clear weights
        return row[0] * 1/(1 - pr_clear[row['hour_bin']])

# uses above function to scale need
df.loc[:,'scaled_police'] = df.loc[:,['police_needed','sky','region','hour_bin']].apply(scale_need, axis = 1, clear_weights = pr_clear)
df.loc[:,'scaled_ambulance'] = df.loc[:,['ambulance_needed','sky','region','hour_bin']].apply(scale_need, axis = 1, clear_weights = pr_clear)

### Issue 3: low counts in regional areas

No scaling makes sense here, the counts are low because the number of crashes are low.

# final pivotted data

looking at the data now, it looks really good, the number needed for non clear days is slightly high than clear days, more ambulances are needed in metro areas. And no time is left without an ambulance. time to generate a training and tesing data set

In [8]:
# testing how it pivots
final = (df.pivot_table(
    index = ['day','hour_bin','region','sky'],
    values = ['scaled_police', 'scaled_ambulance'],
    aggfunc = 'sum'
) / (num_of_weeks))

final

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,scaled_ambulance,scaled_police
day,hour_bin,region,sky,Unnamed: 4_level_1,Unnamed: 5_level_1
Friday,Afternoon,EASTERN REGION,Clear,0.788708,0.643991
Friday,Afternoon,EASTERN REGION,Not clear,1.160761,0.982182
Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Clear,4.797372,3.798824
Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Not clear,4.702571,3.779914
Friday,Afternoon,METROPOLITAN SOUTH EAST REGION,Clear,5.430509,4.366838
...,...,...,...,...,...
Wednesday,Night,NORTHERN REGION,Not clear,0.096968,0.096968
Wednesday,Night,SOUTH WESTERN REGION,Clear,0.169270,0.153149
Wednesday,Night,SOUTH WESTERN REGION,Not clear,0.129291,0.096968
Wednesday,Night,WESTERN REGION,Clear,0.092696,0.092696


In [19]:
# function to display final data for comparisons
def display_final_df(df_final, query_ = 'Clear'):
    
    # pivotting data
    df_temp = df_final.reset_index().query('sky == @query_').pivot_table(
        index = 'hour_bin',
        columns = 'region',
        values = 'scaled_ambulance',
        aggfunc = 'mean',
    ).T

    # renaming columns
    df_temp.columns = [(x + " " + query_) for x in df_temp.columns] 

    # reordienr data
    df_temp = df_temp.loc[list(df_temp.index),[(x + " " + query_) for x in list(hour_bin_dict.values())]]


    # create a heatmap
    fig = px.imshow(
        df_temp, 
        x=[(x + " " + query_) for x in list(hour_bin_dict.values())], 
        y=list(df_temp.index), 
        labels={
                "x": "Part of Day",
                "y": "Region",
                "color": "Average Number of Ambulances"
            }, 
        title="Ambulances, Sky is " + query_,
        color_continuous_scale= POWERPOINT_COLOUR_SCALE
        )

    # add annotations
    fig.update_traces(hoverinfo='text', text=df_temp.values, texttemplate='%{text:.2f}')
    
    # shows and returns the graph
    fig.show()
    return fig

# for clear and not clear data
for query_ in ["Clear", "Not clear"]:

    # creates plot
    fig = display_final_df(final, query_ = query_)

    # saves graph
    fig.update_layout(font=dict(size=40))
    fig.write_image(ROOT_DIR + DATA_FURTHER_ANALYSIS_DIR + f'final_data_plot_{query_}.png', width=2500, height=1500)



# creating train test split

In [17]:
# train and test split
min_date = pd.to_datetime(TRAIN_SPLIT_MIN_DATE)
split_date = pd.to_datetime(TEST_TRAIN_SPLIT_DATE)
max_date = pd.to_datetime(TEST_SPLIT_MAX_DATE)


# testing how it pivots
df_train = df\
    .query('date >= @min_date & date < @split_date')\
    .pivot_table(
        index = ['day','hour_bin','region','sky'],
        values = ['scaled_police', 'scaled_ambulance'],
        aggfunc = 'sum'
    ) / calc_num_weeks_between(min_date, split_date)



# testing how it pivots
df_test = df\
    .query('date >= @split_date & date < @max_date')\
    .pivot_table(
        index = ['day','hour_bin','region','sky'],
        values = ['scaled_police', 'scaled_ambulance'],
        aggfunc = 'sum'
    ) / calc_num_weeks_between(split_date, max_date)

# displaying data
fig = display_final_df(df_train)
fig = display_final_df(df_test)

In [11]:
# exploring the values of crashes at one friday afternoon with rainy skys in the south eastern metro region
df.query('hour_bin == "Afternoon" & day == "Friday" & sky == "Not clear" & region == "METROPOLITAN SOUTH EAST REGION"').groupby('date_stamp').sum()

Unnamed: 0_level_0,Unnamed: 0,police_needed,ambulance_needed,scaled_police,scaled_ambulance
date_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-02-13,3162,1,2,9.226563,18.453125
2015-04-17,49298,9,11,83.039063,101.492188
2015-05-08,24537,3,3,27.679688,27.679688
2015-06-05,38249,7,8,64.585938,73.812500
2015-07-17,15408,1,2,9.226563,18.453125
...,...,...,...,...,...
2019-11-08,59425,1,1,9.226563,9.226563
2019-12-06,230134,4,4,36.906250,36.906250
2019-12-20,193421,2,3,18.453125,27.679688
2020-02-14,532508,5,8,46.132813,73.812500


In [12]:
# exploring the values of crashes at one friday afternoon with clear skys in the south eastern metro region
df.query('hour_bin == "Afternoon" & day == "Friday" & sky == "Clear" & region == "METROPOLITAN SOUTH EAST REGION"').groupby('date_stamp').sum()

Unnamed: 0_level_0,Unnamed: 0,police_needed,ambulance_needed,scaled_police,scaled_ambulance
date_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-02,16727,3,4,3.364672,4.486230
2015-01-09,64278,6,7,6.729345,7.850902
2015-01-16,2621,4,5,4.486230,5.607787
2015-01-23,6144,5,6,5.607787,6.729345
2015-01-30,16711,6,7,6.729345,7.850902
...,...,...,...,...,...
2020-09-18,140561,2,2,2.243115,2.243115
2020-10-02,330458,5,5,5.607787,5.607787
2020-10-09,210492,3,3,3.364672,3.364672
2020-10-16,140655,2,2,2.243115,2.243115


In [13]:
# saving pre split data for comparisons
df.to_csv(ROOT_DIR + PREPIVOT_TRAIN_TEST_DATA_DIR)

In [14]:
area_type = 'region' 
# area_type = 'lga'   # testing region vs lga split

# pivoting training data
df_train = df\
    .query('date >= @min_date & date < @split_date')\
    .pivot_table(
        index = ['day','hour_bin',area_type,'sky'],
        values = ['scaled_police', 'scaled_ambulance'],
        aggfunc = 'sum'
    ) / calc_num_weeks_between(min_date, split_date)


# pivoting test data
df_test = df\
    .query('date >= @split_date & date < @max_date')\
    .pivot_table(
        index = ['day','hour_bin',area_type,'sky'],
        values = ['scaled_police', 'scaled_ambulance'],
        aggfunc = 'sum'
    ) / calc_num_weeks_between(split_date, max_date)

# columns to rename
columns = {
    'hour_bin': 'Part of Day',
    'day': 'Day of the Week',
    'sky': 'Sky',
    'region': 'Region',
    'lga': 'LGA',
    'scaled_police':'Police',
    'scaled_ambulance': 'Ambulance',
    }

# renaming columns
df_train = df_train.reset_index().rename(columns = columns)
df_test = df_test.reset_index().rename(columns = columns)

# saving data
df_train.to_csv(ROOT_DIR + ('_' + area_type +'.').join(TRAINING_DATA_DIR.split('.')), index= False)
df_test.to_csv(ROOT_DIR + ('_' + area_type +'.').join(TESTING_DATA_DIR.split('.')), index= False)

# printing some data
df_train.head(20)

Unnamed: 0,Day of the Week,Part of Day,Region,Sky,Ambulance,Police
0,Friday,Afternoon,EASTERN REGION,Clear,0.880074,0.724451
1,Friday,Afternoon,EASTERN REGION,Not clear,1.368533,1.103656
2,Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Clear,5.199948,4.164252
3,Friday,Afternoon,METROPOLITAN NORTH WEST REGION,Not clear,5.783156,4.591208
4,Friday,Afternoon,METROPOLITAN SOUTH EAST REGION,Clear,6.01026,4.743812
5,Friday,Afternoon,METROPOLITAN SOUTH EAST REGION,Not clear,7.813883,6.44535
6,Friday,Afternoon,NORTH EASTERN REGION,Clear,1.073261,0.826411
7,Friday,Afternoon,NORTH EASTERN REGION,Not clear,0.794632,0.70634
8,Friday,Afternoon,NORTHERN REGION,Clear,0.896173,0.724451
9,Friday,Afternoon,NORTHERN REGION,Not clear,1.103656,0.882925
