In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
import datetime
import math

# Prefix prepended to the data paths that are read and written further down.
ROOT_DIR = '../'
# Put the production code directory on the module search path so that
# `constants` can be imported below.
sys.path.insert(1, '../production_code/')
# NOTE(review): the star import hides where names come from. The names
# ROUGHLY_CLEANED_DATA_DIR and FULLY_CLEANED_DATA_DIR used later are presumably
# defined in `constants` -- confirm, and consider importing them explicitly.
from constants import *

In [None]:
# Read the roughly cleaned dataset; its location is ROOT_DIR joined with the
# path constant.
roughly_cleaned_path = ROOT_DIR + ROUGHLY_CLEANED_DATA_DIR
df = pd.read_csv(roughly_cleaned_path)

# Quick sanity check: dimensions first, then the first three rows.
print(df.shape)
df.head(3)

In [None]:
# Parse the timestamp column.
# Plain column assignment (`df[col] = ...`) is used instead of
# `df.loc[:, col] = ...`: in pandas 2.x the `.loc[:, col]` form writes into the
# existing column in place and keeps its old dtype, so `date` would stay
# `object` and the `.dt` accessor below would fail. The int -> string label
# mappings would likewise trigger incompatible-dtype warnings.
df['date'] = pd.to_datetime(df['date'])

# adding pure date stamp (calendar date only, stored again as datetime)
df['date_stamp'] = pd.to_datetime(df['date'].dt.date)

# turns day of week into a string
# NOTE: `.map` yields NaN for any value that is not a key of the dict, so
# re-running this cell on already converted labels blanks the column; reload
# `df` before re-running.
day_dict = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"}
df['day'] = df['day'].map(day_dict)

# same for hour bin
hour_bin_dict = {0: 'Night', 1: 'Morning', 2: 'Afternoon', 3: 'Evening'}
df['hour_bin'] = df['hour_bin'].map(hour_bin_dict)

# Display a small sample of the converted columns.
df[['day', 'hour_bin']].head(3)

# Checking correlations between the weather variables

In [None]:

# Absolute Pearson correlation between every pair of weather indicator columns
weather_flags = ["dry", "clear", "fog", "raining", "strong winds"]
corr = df[weather_flags].corr(method='pearson').abs()

# Heatmap of the matrix; the column names label both axes
fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title='Correlation matrix of weather variables',
    labels={'color': 'Correlation coefficient'},
)
# Write each coefficient (two decimals) onto its cell and use it as hover text
fig.update_traces(text=corr.values, texttemplate='%{text:.2f}', hoverinfo='text')
fig.show()

In [None]:
# converting clear to categorical


In [None]:
# Histogram of how many rows have each weather flag set (value == 1).
# The flags are melted to long form so that each set flag becomes one row.
weather_long = df.melt(
    id_vars='ACCIDENT_NO',
    value_vars=['dry', 'clear', 'fog', 'raining', 'strong winds'],
)
active_flags = weather_long.loc[weather_long['value'] == 1, 'variable']

fig = px.histogram(active_flags)
fig.show()

In [None]:

# Crash counts per part of day; bars follow the order defined in hour_bin_dict
part_of_day_order = list(hour_bin_dict.values())

fig = px.histogram(
    df['hour_bin'],
    title="Crash Distribution by Part of Day",
    labels={"variable": "Count of accidents", "value": "Part of Day"},
    category_orders={"value": part_of_day_order},
)
# Single layout update: y-axis caption and no legend
fig.update_layout(yaxis_title='Count of Accidents', showlegend=False)
fig.show()

In [None]:
# Crash counts per weekday; bars follow the order defined in day_dict
weekday_order = list(day_dict.values())

fig = px.histogram(
    df['day'],
    title="Crash Distribution by Day",
    labels={"day": "Count of accidents", "value": "Day of week"},
    category_orders={"value": weekday_order},
)
# Single layout update: y-axis caption and no legend
fig.update_layout(yaxis_title='Count of Accidents', showlegend=False)
fig.show()

In [None]:
# NOTE(review): only the last expression of a cell is rendered, so the result of
# `corr.max()` is computed and then discarded. Wrap it in display()/print() if
# the per-column maxima are meant to be shown.
corr.max()
# Show whatever is currently bound to `corr`.
corr 

In [None]:
# Relative crash frequency for every (part of day, weekday) combination.
# NOTE(review): `corr` is reused here although this is a count table, not a
# correlation matrix; the name is kept so other cells see the same variable.
day_order = list(day_dict.values())
hour_bin_order = list(hour_bin_dict.values())

# Count rows per (day, hour_bin). pivot_table sorts both axes alphabetically,
# while the heatmap's x/y labels below are applied purely by position, so the
# table is reindexed to the same order as those labels. Without this step the
# axis labels would not match the data they sit next to.
corr = (
    df
    .loc[:, ['day', 'hour_bin', 'police_needed']]
    .pivot_table(index='day', columns=['hour_bin'], values='police_needed', aggfunc='count')
    .reindex(index=day_order, columns=hour_bin_order)
)

# Scale each part of day by its busiest weekday, then transpose so that
# rows are parts of day and columns are weekdays.
corr = (corr / corr.max()).T

# create a heatmap
fig = px.imshow(
    corr,
    x=day_order,
    y=hour_bin_order,
    labels={
        "x": "Day of the Week",
        "y": "Time of Day",
        "color": "Relative Crashes per Time of Day"
    },
    title="Crash Distribution by Time"
)

# add annotations: each cell shows its value with two decimals
fig.update_traces(hoverinfo='text', text=corr.values, texttemplate='%{text:.2f}')
# show the figure
fig.show()

In [None]:
# adding sky condition: a truthy `clear` value becomes 'Clear', anything else 'Not clear'
df['sky'] = df['clear'].apply(lambda clear: 'Clear' if clear else 'Not clear')

# Crash counts per sky condition. The title and axis label previously carried
# text copied from the weekday chart ("Crash Distribution by Day", a "day"
# label key) and the typo "skys"; they now describe what is plotted.
fig = px.histogram(
    df['sky'],
    labels={
        "value": "Sky Condition"
    },
    title="Crash Distribution by Sky Condition",
    category_orders={
        "value": ['Clear', 'Not clear']
    }
)
fig.update_layout(yaxis_title='Count of Accidents')
fig.update_layout(showlegend=False)
fig.show()

In [None]:


# Weekly emergency-service demand.
# Step 1: per calendar date, sum the police/ambulance flags and count accidents.
#   The original selected the two flag columns with a bare tuple
#   (`groupby(...)['a', 'b']`), which pandas 2.0 no longer accepts; a single
#   `agg` with a column -> function mapping produces the same three columns.
# Step 2: resample the daily totals into weekly sums ('W' bins). This needs a
#   datetime index, which `date_stamp` provides after the groupby.
weekly_demand = (
    df
    .groupby('date_stamp')
    .agg({
        'police_needed': 'sum',
        'ambulance_needed': 'sum',
        'ACCIDENT_NO': 'count',
    })
    .resample('W')
    .sum()
    .reset_index()
    .rename(
        columns={
            'police_needed': 'Police',
            'ambulance_needed': 'Ambulance'
        }
    )
)

fig = px.line(
    weekly_demand,
    x="date_stamp",
    # ACCIDENT_NO (weekly accident count) is aggregated above but currently not plotted.
    y=["Police", "Ambulance"],
    color_discrete_map={
        "Police": "blue",
        "Ambulance": "red",
        "ACCIDENT_NO": "green"
    },
    labels={
        "value": "Count of Accidents in Need",
        "date_stamp": "Time",
        "variable": "Type of service"
    },
    title="Emergency Service Demand vs time, week by week",
)
fig.show()

In [None]:

# Write the selected columns to the fully cleaned CSV location.
# to_csv is called with its defaults, so the row index is written as well.
export_columns = [
    'ACCIDENT_NO', 'date', 'day', 'hour_bin', 'region', 'lga',
    'sky', 'police_needed', 'ambulance_needed',
]
fully_cleaned_path = ROOT_DIR + FULLY_CLEANED_DATA_DIR
df[export_columns].to_csv(fully_cleaned_path)