In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
import datetime
import math

# Prefix prepended to the data paths that are read and written further down.
ROOT_DIR = '../'
# Put ../production_code/ on the import path so the `constants` module resolves.
sys.path.insert(1, '../production_code/')
# NOTE(review): presumably this supplies ROUGHLY_CLEANED_DATA_DIR and
# FULLY_CLEANED_DATA_DIR, which are used below but never defined in this
# notebook - confirm, and consider importing them by name instead of `*`.
from constants import *

In [2]:
# Read the roughly cleaned accident table and take a quick look at it.
roughly_cleaned_path = ROOT_DIR + ROUGHLY_CLEANED_DATA_DIR
df = pd.read_csv(roughly_cleaned_path)

# (rows, columns) followed by the first three records
print(df.shape)
df.head(3)

(68903, 22)


Unnamed: 0.1,Unnamed: 0,ACCIDENT_NO,date,date_stamp,day,hour,hour_bin,season,month,day_light,...,lga,region,lat,long,dry,clear,fog,raining,strong winds,ambulance_needed
0,0,T20150000056,2015-02-01 03:00:00,2015-02-01,6,3,0,1,2,0,...,BAW BAW,EASTERN REGION,-38.22038,145.798298,True,1.0,0.0,0.0,0.0,True
1,1,T20150000060,2015-02-01 07:30:00,2015-02-01,6,7,1,1,2,2,...,HEPBURN,WESTERN REGION,-37.341041,144.155961,True,1.0,0.0,0.0,0.0,True
2,2,T20150000063,2015-02-01 08:30:00,2015-02-01,6,8,1,1,2,2,...,SOUTHERN GRAMPIANS,SOUTH WESTERN REGION,-37.731233,142.018874,True,1.0,0.0,0.0,0.0,True


In [3]:
# Parse the timestamp column.
# Plain column assignment is used instead of `df.loc[:, col] = ...`: since
# pandas 2.0 the `.loc` form writes into the existing (object) column in place,
# so the column would not become datetime64 and `.dt` below would fail.
df['date'] = pd.to_datetime(df['date'])

# adding pure date stamp (same day at midnight, still datetime64)
df['date_stamp'] = df['date'].dt.normalize()

# turns day of week into a string
day_dict = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"}
# same for hour bin
hour_bin_dict = {0: 'Night', 1: 'Morning', 2: 'Afternoon', 3: 'Evening'}

# Map only while a column still holds its numeric codes. Mapping the string
# labels a second time would turn every value into NaN, so this guard keeps
# the cell safe to re-run.
for column, code_to_label in (('day', day_dict), ('hour_bin', hour_bin_dict)):
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(code_to_label)

df[['day', 'hour_bin']].head(3)

Unnamed: 0,day,hour_bin
0,Sunday,Night
1,Sunday,Morning
2,Sunday,Morning


# Checking correlations between the weather variables

In [4]:

# absolute Pearson correlation between the weather indicator columns
weather_cols = ["dry", "clear", "fog", "raining", "strong winds"]
corr = df[weather_cols].corr(method='pearson').abs()

# draw the matrix as a heatmap, using the column names on both axes
fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title='Correlation matrix of weather variables',
    labels={'color': 'Correlation coefficient'},
)
# write each coefficient (two decimals) inside its cell
fig.update_traces(hoverinfo='text', text=corr.values, texttemplate='%{text:.2f}')
fig.show()

In [5]:
# converting clear to categorical


In [6]:
# histogram of the weather flags: one bar per weather column, counting the
# accidents for which that flag is set

# long format: one row per (accident, weather column) pair
weather_long = df.melt(
    id_vars='ACCIDENT_NO',
    value_vars=['dry', 'clear', 'fog', 'raining', 'strong winds'],
)
# keep only the pairs whose flag equals 1, then count them by column name
active_conditions = weather_long.query('value == 1')['variable']

fig = px.histogram(active_conditions)
fig.show()

In [7]:

# accident counts per part of day, bars ordered as listed in hour_bin_dict
part_of_day = df['hour_bin']
part_of_day_order = list(hour_bin_dict.values())

fig = px.histogram(
    part_of_day,
    title="Crash Distribution by Part of Day",
    labels={"variable": "Count of accidents", "value": "Part of Day"},
    category_orders={"value": part_of_day_order},
)
# label the count axis and hide the single-entry legend
fig.update_layout(yaxis_title='Count of Accidents', showlegend=False)
fig.show()

In [8]:
# accident counts per weekday, bars ordered as listed in day_dict
weekday = df['day']
weekday_order = list(day_dict.values())

fig = px.histogram(
    weekday,
    title="Crash Distribution by Day",
    labels={"day": "Count of accidents", "value": "Day of week"},
    category_orders={"value": weekday_order},
)
# label the count axis and hide the single-entry legend
fig.update_layout(yaxis_title='Count of Accidents', showlegend=False)
fig.show()

In [9]:
# Display the absolute weather correlation matrix as a table.
# (A bare `corr.max()` used to precede this line; only the last expression of a
# cell is rendered, so its result was discarded and the call has been removed.)
corr

Unnamed: 0,dry,clear,fog,raining,strong winds
dry,1.0,0.768822,0.074902,0.789654,0.098264
clear,0.768822,1.0,0.292661,0.945251,0.150992
fog,0.074902,0.292661,1.0,0.022332,0.036313
raining,0.789654,0.945251,0.022332,1.0,0.111546
strong winds,0.098264,0.150992,0.036313,0.111546,1.0


In [10]:
# Axis orders for the heatmap: calendar order for days, chronological order
# for parts of day.
day_order = list(day_dict.values())
hour_bin_order = list(hour_bin_dict.values())

# Number of records (non-null `police_needed` entries) for every
# (day, part of day) combination.
crash_counts = df.pivot_table(
    index='day',
    columns='hour_bin',
    values='police_needed',
    aggfunc='count',
)

# Scale every part of day by its busiest weekday, then put parts of day on rows.
# pivot_table sorts the string labels alphabetically, while px.imshow treats
# x/y purely as tick labels and never reorders the data, so the table has to be
# reindexed into the same order as the labels passed below.
crash_share = (
    (crash_counts / crash_counts.max())
    .T
    .reindex(index=hour_bin_order, columns=day_order)
)

# create a heatmap
fig = px.imshow(
    crash_share,
    x=day_order,
    y=hour_bin_order,
    labels={
        "x": "Day of the Week",
        "y": "Time of Day",
        "color": "Relative Crashes per Time of Day",
    },
    title="Crash Distribution by Time",
)

# add annotations (two-decimal value inside every cell)
fig.update_traces(hoverinfo='text', text=crash_share.values, texttemplate='%{text:.2f}')
# show the figure
fig.show()

In [11]:
# adding sky condition: a truthy `clear` flag becomes 'Clear', anything else
# 'Not clear' (vectorised instead of a row-wise apply).
# NOTE(review): NaN is truthy under astype(bool), so a missing `clear` value is
# labelled 'Clear' - confirm that is intended.
df['sky'] = np.where(df['clear'].astype(bool), 'Clear', 'Not clear')

# accident counts per sky condition
fig = px.histogram(
    df['sky'],
    labels={"value": "Sky condition"},
    # the title previously read "Crash Distribution by Day" (copied from the
    # weekday chart)
    title="Crash Distribution by Sky Condition",
    category_orders={"value": ['Clear', 'Not clear']},
)
# label the count axis and hide the single-entry legend
fig.update_layout(yaxis_title='Count of Accidents', showlegend=False)
fig.show()

In [12]:


# Daily totals of accidents needing police / ambulance, plus the daily number
# of accidents, rolled up into weekly sums.
# Named aggregation replaces `groupby(...)['a', 'b']`: selecting several columns
# with a bare tuple is deprecated and raises on pandas >= 2.0.
weekly_demand = (
    df
    .groupby('date_stamp')
    .agg(
        Police=('police_needed', 'sum'),
        Ambulance=('ambulance_needed', 'sum'),
        ACCIDENT_NO=('ACCIDENT_NO', 'count'),
    )
    .resample('W')  # relies on `date_stamp` being datetime64 (set when the dates were parsed)
    .sum()
    .reset_index()
)

fig = px.line(
    weekly_demand,
    x="date_stamp",
    y=["Police", "Ambulance"],  # add "ACCIDENT_NO" to also plot the total accident count
    color_discrete_map={
        "Police": "blue",
        "Ambulance": "red",
        "ACCIDENT_NO": "green",
    },
    labels={
        "value": "Count of Accidents in Need",
        "date_stamp": "Time",
        "variable": "Type of service",
    },
    title="Emergency Service Demand vs time, week by week",
)
fig.show()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [15]:

# columns kept in the fully cleaned export
export_columns = [
    'ACCIDENT_NO', 'day', 'hour_bin', 'region', 'lga', 'sky',
    'police_needed', 'ambulance_needed',
]
# write the selection (row index included) to the fully cleaned data path
df.loc[:, export_columns].to_csv(ROOT_DIR + FULLY_CLEANED_DATA_DIR)

Unnamed: 0,ACCIDENT_NO,day,hour_bin,region,lga,sky,police_needed,ambulance_needed
0,T20150000056,Sunday,Night,EASTERN REGION,BAW BAW,Clear,1,True
1,T20150000060,Sunday,Morning,WESTERN REGION,HEPBURN,Clear,1,True
2,T20150000063,Sunday,Morning,SOUTH WESTERN REGION,SOUTHERN GRAMPIANS,Clear,1,True
3,T20150000075,Sunday,Afternoon,METROPOLITAN SOUTH EAST REGION,BAYSIDE,Clear,1,True
4,T20150000076,Sunday,Morning,EASTERN REGION,SOUTH GIPPSLAND,Clear,1,True
...,...,...,...,...,...,...,...,...
68898,T20200019031,Tuesday,Morning,METROPOLITAN NORTH WEST REGION,MARIBYRNONG,Clear,1,True
68899,T20200019195,Saturday,Afternoon,METROPOLITAN NORTH WEST REGION,YARRA,Clear,1,True
68900,T20200019239,Saturday,Afternoon,METROPOLITAN NORTH WEST REGION,MELBOURNE,Clear,1,True
68901,T20200019247,Saturday,Afternoon,SOUTH WESTERN REGION,GEELONG,Clear,1,True
