# Initial analysis

After doing some basic cleaning, we should look at what we have. 

This notebook creates some basic graphs from the roughly cleaned data.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
import datetime
import math

# relative path to the parent directory; it is prefixed to every data / output path used below
ROOT_DIR = '../'
# put the production_code folder on the import path so `constants` can be imported
sys.path.insert(1, '../production_code/')
# NOTE(review): star import - presumably supplies the *_DIR paths and POWERPOINT_* colours
# referenced later in this notebook; confirm against constants.py
from constants import *

In [None]:
# load the roughly cleaned CSV, report its dimensions and preview the first rows
roughly_cleaned_path = ROOT_DIR + ROUGHLY_CLEANED_DATA_DIR
df = pd.read_csv(roughly_cleaned_path)
print(df.shape)
df.head(3)

# adding some needed columns

In [None]:
# Parse the timestamp column.
# Plain column assignment is used on purpose instead of `df.loc[:, col] = ...`:
# from pandas 2.0 the `.loc[:, col]` form first tries to write into the existing
# column in place, so a string (object) 'date' column would stay object dtype
# and the `.dt` accessor below would fail.
df['date'] = pd.to_datetime(df['date'])

# pure date stamp: the calendar date only, converted back to a datetime column
df['date_stamp'] = pd.to_datetime(df['date'].dt.date)

# turn the numeric day of week into its name
# (column replacement also avoids relying on an implicit dtype upcast, which
# newer pandas versions warn about when strings are written via `.loc`)
day_dict = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday",
}
df['day'] = df['day'].map(day_dict)

# same for the hour bin: numeric code -> part-of-day label
hour_bin_dict = {0: 'Night', 1: 'Morning', 2: 'Afternoon', 3: 'Evening'}
df['hour_bin'] = df['hour_bin'].map(hour_bin_dict)

# preview the two relabelled columns
df[['day', 'hour_bin']].head(3)

# Accidents over time

In [None]:

# Emergency call-outs over time.
#
# Per date_stamp: the summed police_needed / ambulance_needed values together
# with the number of ACCIDENT_NO entries (used as the total).
# The two columns are selected with a list (`[[...]]`): the tuple form
# `groupby(...)['a', 'b']` was deprecated in pandas 1.0 and raises on pandas 2.0.
daily_counts = pd.concat(
    [
        df.groupby('date_stamp')[['police_needed', 'ambulance_needed']].sum(),
        df.groupby('date_stamp')['ACCIDENT_NO'].count(),
    ],
    axis=1,
)

# aggregate the daily figures into two-week bins and give the series their display names
demand_over_time = (
    daily_counts
    .resample('2W')
    .sum()
    .reset_index()
    .rename(
        columns={
            'police_needed': 'Police',
            'ambulance_needed': 'Ambulance',
            "ACCIDENT_NO": 'Total',
        }
    )
)

# NOTE(review): the title says "week by week" but the data is resampled with
# '2W' (two-week bins) - confirm which one is intended.
fig = px.line(
    demand_over_time,
    x="date_stamp",
    y=["Total", "Police", "Ambulance"],
    color_discrete_map={
        "Police": POWERPOINT_BLUE,
        "Ambulance": POWERPOINT_RED,
        "Total": "lime",
    },
    labels={
        "value": "Count of Accidents in Need",
        "date_stamp": "Time",
        "variable": "Type of service",
    },
    title="Emergency Service Demand vs time, week by week",
)
fig.show()

# saving to file (the font is enlarged only for the exported image, after the figure was shown)
fig.update_layout(font=dict(size=40))
fig.write_image(ROOT_DIR + DATA_INITIAL_ANALYSIS_DIR + 'DemandOverTime.png', width=3500, height=1500)

# checking correlations

In [None]:
# absolute Pearson correlation between the atmospheric / road condition columns
weather_columns = ["dry", "clear", "fog", "raining", "strong winds"]
corr = df[weather_columns].corr(method='pearson').abs()

# draw the matrix as a heatmap
fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    title='Correlation matrix of weather variables',
    labels=dict(color='Absolute correlation coefficient'),
    color_continuous_scale=POWERPOINT_COLOUR_SCALE,
)

# print each coefficient (two decimals) inside its cell
fig.update_traces(
    hoverinfo='text',
    text=corr.values,
    texttemplate='%{text:.2f}',
)

# display in the notebook
fig.show()

# export with a larger font
fig.update_layout(font=dict(size=40))
fig.write_image(
    ROOT_DIR + DATA_INITIAL_ANALYSIS_DIR + 'WeatherCorrelation.png',
    width=2000,
    height=1500,
)

# histogram of road and sky conditions

In [None]:
# reshape to one row per (accident, condition) and keep only the conditions
# whose value is 1 for that accident
condition_columns = ['dry', 'clear', 'fog', 'raining', 'strong winds']
long_conditions = df.melt(id_vars='ACCIDENT_NO', value_vars=condition_columns)
conditions_present = long_conditions.query('value == 1')['variable']

# percentage histogram over the condition names
fig = px.histogram(
    conditions_present,
    labels={
        "variable": "Count of accidents",
        "value": "Condition",
    },
    color_discrete_sequence=[POWERPOINT_RED],
    histnorm='percent',
)
fig.update_layout(yaxis_title='Percent of Accidents', showlegend=False)
fig.update_yaxes(ticksuffix='%')
fig.show()

# export with a larger font
fig.update_layout(font=dict(size=40))
fig.write_image(
    ROOT_DIR + DATA_INITIAL_ANALYSIS_DIR + 'CrashByAtmos.png',
    width=2000,
    height=1500,
)

# histogram of accidents per part of day

In [None]:
# percentage histogram of the hour_bin labels, with the bars ordered as in hour_bin_dict
part_of_day_order = list(hour_bin_dict.values())
fig = px.histogram(
    df['hour_bin'],
    labels={
        "variable": "Count of accidents",
        "value": "Part of Day",
    },
    title="Crash Distribution by Part of Day",
    category_orders={"value": part_of_day_order},
    color_discrete_sequence=[POWERPOINT_RED],
    histnorm='percent',
)
fig.update_layout(yaxis_title='Percent of Accidents', showlegend=False)
fig.update_yaxes(ticksuffix='%')

# display in the notebook
fig.show()

# export with a larger font
fig.update_layout(font=dict(size=40))
fig.write_image(
    ROOT_DIR + DATA_INITIAL_ANALYSIS_DIR + 'CrashByTime.png',
    width=2000,
    height=1500,
)

# histogram of accidents per day

In [None]:
# percentage histogram of the day-of-week labels, with the bars ordered as in day_dict
weekday_order = list(day_dict.values())
fig = px.histogram(
    df['day'],
    labels={
        "day": "Percent of accidents",
        "value": "Day of week",
    },
    title="Crashes by Day",
    category_orders={"value": weekday_order},
    color_discrete_sequence=[POWERPOINT_BLUE],
    histnorm='percent',
)
fig.update_layout(yaxis_title='Percent of Accidents', showlegend=False)
fig.update_yaxes(ticksuffix='%')
fig.show()

# export with a larger font
fig.update_layout(font=dict(size=40))
fig.write_image(
    ROOT_DIR + DATA_INITIAL_ANALYSIS_DIR + 'CrashByDay.png',
    width=3000,
    height=1500,
)

# distribution of accidents by day of week and part of day

In [None]:
# number of non-null police_needed entries for every (day, hour_bin) pair:
# rows are days, columns are parts of day
counts_by_day = df[['day', 'hour_bin', 'police_needed']].pivot_table(
    index='day',
    columns=['hour_bin'],
    values='police_needed',
    aggfunc='count',
)

# scale each part-of-day column by its own maximum over the days (largest day = 100),
# then transpose so that parts of day become the rows
corr = (counts_by_day / counts_by_day.max()).T * 100



In [None]:
# axis orderings taken from the label dictionaries
weekday_order = list(day_dict.values())
part_of_day_order = list(hour_bin_dict.values())

# number of non-null police_needed entries for every (day, hour_bin) pair
counts_by_day = df[['day', 'hour_bin', 'police_needed']].pivot_table(
    index='day',
    columns=['hour_bin'],
    values='police_needed',
    aggfunc='count',
)

# scale each part-of-day column by its own maximum over the days (largest day = 100),
# then transpose so that parts of day become the rows
corr = (counts_by_day / counts_by_day.max()).T * 100

# put rows and columns into their natural order
corr = corr.loc[part_of_day_order, weekday_order]

# draw the table as a heatmap
fig = px.imshow(
    corr,
    x=weekday_order,
    y=part_of_day_order,
    labels={
        "x": "Day of the Week",
        "y": "Time of Day",
        "color": "Relative Crashes per Time of Day",
    },
    title="Crash Distribution by Time",
    color_continuous_scale=POWERPOINT_COLOUR_SCALE,
)

# print each value as a whole-number percentage inside its cell
fig.update_traces(
    hoverinfo='text',
    text=corr.values,
    texttemplate='%{text:.0f}%',
)
fig.update_layout(coloraxis_colorbar={'ticksuffix': '%'})

# display in the notebook
fig.show()

# export with a larger font
fig.update_layout(font=dict(size=40))
fig.write_image(
    ROOT_DIR + DATA_INITIAL_ANALYSIS_DIR + 'CrashCorrTime.png',
    width=2500,
    height=1500,
)

# distribution by just the sky conditions

In [None]:
# derive a two-level sky label from the 'clear' column (truthy -> 'Clear')
df['sky'] = df['clear'].map(lambda is_clear: 'Clear' if is_clear else 'Not clear')

# percentage histogram of the new sky label
fig = px.histogram(
    df['sky'],
    labels={
        "day": "Percent of accidents",
        "value": "Sky conditions",
    },
    title="Crashes by Sky Conditions",
    category_orders={"value": ['Clear', 'Not clear']},
    color_discrete_sequence=[POWERPOINT_BLUE],
    histnorm='percent',
)
fig.update_layout(yaxis_title='Percent of Accidents', showlegend=False)
fig.update_yaxes(ticksuffix='%')
fig.show()

# export with a larger font
fig.update_layout(font=dict(size=40))
fig.write_image(
    ROOT_DIR + DATA_INITIAL_ANALYSIS_DIR + 'CrashBySkyCond.png',
    width=2000,
    height=1500,
)

# outputting data

In [None]:
# write the selected columns to the fully cleaned CSV
# (the row index is written as well, since to_csv's index option is left at its default)
output_columns = [
    'ACCIDENT_NO',
    'date',
    'day',
    'hour_bin',
    'region',
    'lga',
    'sky',
    'police_needed',
    'ambulance_needed',
]
df[output_columns].to_csv(ROOT_DIR + FULLY_CLEANED_DATA_DIR)