In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
import datetime
import math

ROOT_DIR = '../'
sys.path.insert(1, '../production_code/')
from constants import *

In [None]:
# Load the data set stored at ROOT_DIR + FULLY_CLEANED_DATA_DIR.
df = pd.read_csv(ROOT_DIR + FULLY_CLEANED_DATA_DIR)

# Parse the date column. Plain column assignment replaces the column together
# with its dtype; `df.loc[:, 'date'] = ...` writes into the existing
# object-dtype column on pandas >= 2.0, leaving the dtype as object and
# breaking the `.dt` accessor used on the next line.
df['date'] = pd.to_datetime(df['date'])

# Add a time-free date stamp (the calendar date as a datetime) for per-day grouping.
df['date_stamp'] = pd.to_datetime(df['date'].dt.date)

# Lookup tables from integer codes to display labels.
# NOTE(review): presumably these match the encoding of the 'day' and
# 'hour_bin' columns - confirm against the cleaning code.
day_dict = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"}
hour_bin_dict = {0: 'Night', 1: 'Morning', 2:'Afternoon',3:'Evening'}

print(df.shape)
df.head(3)


# Main Issues

### initial pivot

After completing an initial pivot of the data, there are three main issues:

1. There is no larger date grouping, so if we have 6 years of data, the calculated need will be double that of 3 years.
2. Rainy days don't appear as often as clear days, so currently it looks like rainy days are less dangerous.
3. Some less populated regions have very little need.


In [None]:
# Trial pivot: summed police / ambulance need for every
# (day, hour_bin, region, sky) combination, before any scaling.
pd.pivot_table(
    df,
    index=['day', 'hour_bin', 'region', 'sky'],
    values=['police_needed', 'ambulance_needed'],
    aggfunc='sum',
)


### Issue 1. scaling by time span

The largest time scale is the 7 days of a week. Therefore, if we had 1 year of data, there would be 52 potential weeks combined into the one table, and for 3 years, 156.

Therefore, we simply need to divide everything by the number of weeks in the data set.

In [None]:
# Show the date range covered by the data: latest date first, then earliest.
print(
    pd.to_datetime(df['date']).max(),
    pd.to_datetime(df['date']).min(),
    sep='\n',
)

# helper: number of weeks covered by a date range
def calc_num_weeks_between(min, max):
    """Return the number of weeks between ``min`` and ``max``, rounded up.

    Parameters
    ----------
    min, max
        Start and end of the range. Their difference must expose a ``.days``
        attribute (e.g. ``datetime``/``date`` objects or pandas Timestamps).

    Returns
    -------
    int
        ``ceil(days / 7)`` where ``days`` is the whole-day part of
        ``max - min``; any partial week counts as a full one.
    """
    elapsed_days = (max - min).days
    weeks_exact = elapsed_days / 7
    return math.ceil(weeks_exact)
    
# Span of the whole data set in weeks, from its earliest to its latest date.
earliest_date, latest_date = df['date'].min(), df['date'].max()
num_of_weeks = calc_num_weeks_between(earliest_date, latest_date)

# the pivoted totals must be divided by this value
num_of_weeks


### Issue 2: need to scale weather conditions

If we assume that the dataset is vast enough, then for each date there will be at least one crash with the weather reported. Therefore, by counting the number of times it is either clear or not clear in each hourly bin (removing duplicates within the smallest time group so that multiple crashes are ignored), we can get a rough distribution of the percentage of time it is clear, grouped by hourly bin for some extra granularity.

In [None]:
# Estimate, per hourly bin, how often the sky is clear versus not clear.
# Each (date_stamp, hour_bin) slot is counted at most once per sky condition,
# so several crashes in the same slot do not inflate the count.

# number of distinct time slots with a 'Not clear' crash, per hourly bin
time_dist_rain = (
    df[df['sky'] == 'Not clear']
    # De-duplicate on the same (date, hour bin) slot as the clear count below.
    # De-duplicating on the date alone kept a single row (one hour bin) per
    # rainy date, which under-counted rain and inflated the clear weights.
    .drop_duplicates(subset=['date_stamp', 'hour_bin'])
    .groupby(['hour_bin'])['ACCIDENT_NO']
    .count()
)

# the same count for 'Clear' crashes
time_dist_clear = (
    df[df['sky'] == 'Clear']
    .drop_duplicates(subset=['date_stamp', 'hour_bin'])
    .groupby(['hour_bin'])['ACCIDENT_NO']
    .count()
)

# share of the observed slots that were clear, per hourly bin
clear_weights = time_dist_clear / (time_dist_rain + time_dist_clear)
clear_weights

In [None]:
# need to use these weights to scale
def scale_need(row, clear_weights):
    """Scale one row's need value by the weather weight of its hourly bin.

    Parameters
    ----------
    row : pandas.Series
        A row whose FIRST element is the need value to scale and which also
        carries the labels ``'sky'`` and ``'hour_bin'``.
    clear_weights
        Mapping (e.g. a Series) from ``hour_bin`` value to the fraction of
        time the sky is clear in that bin.

    Returns
    -------
    The need value multiplied by ``1 - clear_weights[hour_bin]`` when
    ``row['sky'] == 'Clear'``, and by ``clear_weights[hour_bin]`` otherwise.
    """
    # The need value is taken by position. `.iloc[0]` is explicit positional
    # access; `row[0]` on a label-indexed Series is deprecated and is slated
    # to be treated as a label lookup, which would fail here.
    need = row.iloc[0]
    clear_share = clear_weights[row['hour_bin']]

    # clear rows are scaled by the not-clear share of the bin
    if row['sky'] == 'Clear':
        return need * (1 - clear_share)

    # every other sky value is scaled by the clear share of the bin
    return need * clear_share

# Apply scale_need row by row to build the weather-scaled need columns.
for raw_col, scaled_col in (
    ('police_needed', 'scaled_police'),
    ('ambulance_needed', 'scaled_ambulance'),
):
    # the raw need column must come first: scale_need reads it by position
    scale_inputs = df.loc[:, [raw_col, 'sky', 'day', 'hour_bin']]
    df.loc[:, scaled_col] = scale_inputs.apply(
        scale_need, axis=1, clear_weights=clear_weights
    )

### Issue 3: low counts in regional areas

No scaling makes sense here: the counts are low because the number of crashes is low.

# final pivoted data

Looking at the data now, it looks really good: the number needed for non-clear days is slightly higher than for clear days, and more ambulances are needed in metro areas. No time is left without an ambulance. Time to generate a training and testing data set.

In [None]:
# Final table: summed scaled need per (day, hour_bin, region, sky),
# multiplied by 100 and divided by the number of weeks in the data set.
scaled_totals = df.pivot_table(
    index=['day', 'hour_bin', 'region', 'sky'],
    values=['scaled_police', 'scaled_ambulance'],
    aggfunc='sum',
)
final = scaled_totals * 100 / num_of_weeks

final

In [None]:
def display_final_df(df_final, query = 'sky == "Clear"', title = None):
    """Show a region-by-day heatmap of the mean scaled ambulance need.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Pivoted need table; after ``reset_index()`` it must expose the columns
        ``'day'``, ``'region'`` and ``'scaled_ambulance'`` plus whatever
        columns ``query`` refers to.
    query : str
        ``DataFrame.query`` expression selecting the rows to plot.
    title : str, optional
        Figure title. When omitted, the original "sky is clear" title is used
        for the default query and a title naming the query is used otherwise,
        so the title no longer mislabels non-default queries.

    Returns
    -------
    The plotly figure, which is also displayed via ``fig.show()``.
    """
    default_query = 'sky == "Clear"'

    # rows = regions, columns = days, values = mean scaled ambulance need
    df_temp = df_final.reset_index().query(query).pivot_table(
        index = 'day',
        columns = 'region',
        values = 'scaled_ambulance',
        aggfunc = 'mean',
    ).T

    # Label each column from its own day value, so the labels stay aligned
    # with the data even when a day is absent from the filtered rows. Day
    # values without an entry in day_dict are shown unchanged.
    day_labels = [day_dict.get(day, day) for day in df_temp.columns]

    if title is None:
        if query == default_query:
            title = "Average Chance of an Ambulance, sky is clear"
        else:
            title = f"Average Chance of an Ambulance, {query}"

    # create a heatmap
    fig = px.imshow(
        df_temp,
        x=day_labels,
        y=list(df_temp.index),
        labels={
                "x": "Day of the Week",
                "y": "region",
                "color": "% chance of needing an ambulace"
            },
        title=title
        )

    # add annotations: print each cell's value with two decimals
    fig.update_traces(hoverinfo='text', text=df_temp.values, texttemplate='%{text:.2f}')
    # show the figure
    fig.show()
    return fig

fig = display_final_df(final)


# creating train test split

In [None]:

# Date boundaries of the split, taken from the imported constants.
min_date, split_date, max_date = (
    pd.to_datetime(boundary)
    for boundary in (TRAIN_SPLIT_MIN_DATE, TEST_TRAIN_SPLIT_DATE, TEST_SPLIT_MAX_DATE)
)

region_index = ['day', 'hour_bin', 'region', 'sky']
scaled_columns = ['scaled_police', 'scaled_ambulance']

# training window: min_date <= date < split_date, averaged per week
train_rows = df.query('date >= @min_date & date < @split_date')
df_train = (
    train_rows.pivot_table(index=region_index, values=scaled_columns, aggfunc='sum')
    / calc_num_weeks_between(min_date, split_date)
)

# testing window: split_date <= date < max_date, averaged per week
test_rows = df.query('date >= @split_date & date < @max_date')
df_test = (
    test_rows.pivot_table(index=region_index, values=scaled_columns, aggfunc='sum')
    / calc_num_weeks_between(split_date, max_date)
)

# heatmaps of both splits (training first, then testing)
fig = display_final_df(df_train)
fig = display_final_df(df_test)

In [None]:

# Rebuild the train / test tables, this time grouped by 'lga' instead of 'region'.
lga_index = ['day', 'hour_bin', 'lga', 'sky']
lga_values = ['scaled_police', 'scaled_ambulance']

# training window: min_date <= date < split_date, averaged per week
train_weeks = calc_num_weeks_between(min_date, split_date)
df_train = (
    df.query('date >= @min_date & date < @split_date')
    .pivot_table(index=lga_index, values=lga_values, aggfunc='sum')
    / train_weeks
)

# testing window: split_date <= date < max_date, averaged per week
test_weeks = calc_num_weeks_between(split_date, max_date)
df_test = (
    df.query('date >= @split_date & date < @max_date')
    .pivot_table(index=lga_index, values=lga_values, aggfunc='sum')
    / test_weeks
)

# write both tables to the CSV paths given by the imported constants
df_train.to_csv(ROOT_DIR + TRAINING_DATA_DIR)
df_test.to_csv(ROOT_DIR + TESTING_DATA_DIR)

df_train.head(20)