In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys

# Path prefix (relative to the notebook's working directory) that is prepended
# to the data-file paths read with pd.read_csv in this notebook.
ROOT_DIR = '../'
# Put the production code directory on the import path so `constants` can be imported.
sys.path.insert(1, '../production_code/')
# NOTE(review): the star import hides which names come from `constants`. The
# ACCIDENT_DATA_*_DIR paths and EARLIEST_DATE used later are not defined in this
# notebook and presumably come from here - consider importing them explicitly.
from constants import *

In [None]:
# Read the five raw accident tables. Each *_DIR constant is a path that is
# appended to ROOT_DIR; the tables are read in the order they are listed.
accidents, node, person, atmospheric, road_cond = (
    pd.read_csv(ROOT_DIR + relative_path)
    for relative_path in (
        ACCIDENT_DATA_GENERAL_DIR,
        ACCIDENT_DATA_NODE_DIR,
        ACCIDENT_DATA_PERSON_DIR,
        ACCIDENT_DATA_ATMOSPHERIC_DIR,
        ACCIDENT_DATA_ROAD_COND_DIR,
    )
)



# data model

features

| group name | column name | data type | description | original dataset |
| ---- | ----- | ---- | ---- | ---- |
| time | date | pd.datetime |  | accident
|  | day of week | pd.datetime |  | accident
|  | time of day (hour) | pd.datetime |  | accident
| | light level | int | dark (any) = 0, dawn/dusk = 1, day = 2 | LIGHT_COND, accident
| | | | | 
| location | node_id | int | | node |
| | lga | string | local area | node |
| | region | string | | node |
| | long | float | longitude | node |
| | lat | float | latitude | node |
| |  | | |
| atmospheric | 1: clear         | bool |            | atmospheric
|             | 2: raining       | bool |            | atmospheric
|             | 3: snowing       | bool |            | atmospheric
|             | 4: fog           | bool |            | atmospheric
|             | 5: smoke         | bool |            | atmospheric
|             | 6: dust          | bool |            | atmospheric
|             | 7: strong winds  | bool |            | atmospheric
|             | 9: unknown       | bool | remove unknown | atmospheric
| | | | |
| road_cond   | 1: dry           | bool |           | road condition
|             | 2: wet           | bool |           | road condition
|             | 3: muddy         | bool |           | road condition
|             | 4: snowy         | bool |           | road condition
|             | 5: icy           | bool |           | road condition
|             | 9: unknown       | bool | remove unknown  | road condition
| | | | |

labels

| column name | data type | description | original dataset |
| ----- | ---- | ---- | ---- |
| police_needed | int | number of police needed for that collision bin | POLICE_ATTENDED, accident |
| ambulance_needed | int | number of ambulances needed for that collision bin | to investigate, mix of TAKEN_HOSPITAL in PERSON and whether injuries are serious in ACCIDENT | 


 


### Ambulance needed

Assume an ambulance is called if either of the following holds:

- any person in the crash was injured (in the code below: `injury_level < 4`)
- any person was taken to the hospital

A third rule - more than 5 people involved in a crash - was considered but is not included.

Only one ambulance is counted per crash, no matter how many people are involved, to keep it in line with the number of police.



In [None]:
# Sanity checks: how many rows does each table hold per ACCIDENT_NO?
# NOTE: a notebook cell only renders its last expression, so only the final
# road_cond lookup is displayed; the earlier expressions are evaluated and discarded.

# accidents: no duplicate accident numbers, they are unique
accidents.groupby('ACCIDENT_NO')['ACCIDENTDATE'].count().sort_values()

# node: duplicated nodes per accident are due to postcode double-ups,
# not to accidents spanning multiple locations
node.groupby('ACCIDENT_NO')['NODE_ID'].count().sort_values()
node[node['ACCIDENT_NO'] == "T20170021373"]

# person: ACCIDENT_NO repeats when several people are involved in one collision
person.groupby('ACCIDENT_NO')['PERSON_ID'].count().sort_values()
person[person['ACCIDENT_NO'] == "T20130018492"]

# atmospheric: several conditions (e.g. winds and rain) can apply to one accident,
# so they need to be pivoted into separate columns
atmospheric.groupby('ACCIDENT_NO')['ATMOSPH_COND'].count().sort_values()
atmospheric[atmospheric['ACCIDENT_NO'] == "T20190001830"]

# road_cond: same as atmospheric, several conditions can be met at once
road_cond.groupby('ACCIDENT_NO')['SURFACE_COND'].count().sort_values()
road_cond[road_cond['ACCIDENT_NO'] == "T20070019368"]

# filtering and transforming 

### accidents

In [None]:
# date

# Drop rows that are missing either the accident date or the accident time.
accidents = accidents.dropna(subset=['ACCIDENTDATE', 'ACCIDENTTIME'])

# Keep only accidents dated strictly after EARLIEST_DATE (the original author's
# rationale: newer data is expected to be more accurate).
# NOTE(review): pd.to_datetime infers the date format here. If ACCIDENTDATE is
# stored day-first (dd/mm/yyyy), pass dayfirst=True or an explicit format= -
# confirm against the raw CSV.
is_recent = pd.to_datetime(accidents['ACCIDENTDATE']) > pd.to_datetime(EARLIEST_DATE)
accidents = accidents[is_recent].reset_index(drop=True)

# Build one timestamp column from the date and time strings. The concatenation
# is done column-wise (no row-wise apply with positional x[0]/x[1] lookups), and
# .assign returns a new frame instead of writing into the filtered one.
accidents = accidents.assign(
    date=pd.to_datetime(accidents['ACCIDENTDATE'] + " " + accidents['ACCIDENTTIME'])
)

# Preview as the cell's last expression so that it is actually rendered.
accidents.head(3)

In [None]:
# light conditions

# Drop rows with LIGHT_CONDITION code 9 (presumably "unknown", as for the other
# lookup tables - confirm against the data dictionary).
accidents = accidents.query("LIGHT_CONDITION != 9")

# Convert the code to a 0-2 daylight scale: 1 -> 2, 2 -> 1, 3 and above -> 0
# (per the data model: day = 2, dawn/dusk = 1, any kind of dark = 0).
# .assign builds a new frame, so nothing is written into the query() result
# (which would raise SettingWithCopyWarning), and clip() replaces the
# element-wise min(light_level, 3).
accidents = accidents.assign(
    day_light=3 - accidents['LIGHT_CONDITION'].clip(upper=3)
)

# Check: each LIGHT_CONDITION code should map to exactly one day_light value.
accidents.groupby('LIGHT_CONDITION')['day_light'].unique()

In [None]:
# Remove accidents where it is unsure whether police attended or not (code 9).
accidents = accidents.query("POLICE_ATTEND != 9")

# police_needed = 2 - POLICE_ATTEND, i.e. code 1 -> 1 and code 2 -> 0
# (presumably 1 = attended, 2 = did not attend - confirm against the data dictionary).
# Computed column-wise and attached with .assign so nothing is written into the
# query() result (which would raise SettingWithCopyWarning).
accidents = accidents.assign(police_needed=2 - accidents['POLICE_ATTEND'])

# Check: each POLICE_ATTEND code should map to exactly one police_needed value.
accidents.groupby('POLICE_ATTEND')['police_needed'].unique()


In [None]:
# Preview the accident-level columns that feed the combined output.
accident_output_cols = ['ACCIDENT_NO', 'date', 'police_needed']
accidents[accident_output_cols].head(3)

### person

In [None]:
# Only keep people whose accident is still present in `accidents`.
person = person[person['ACCIDENT_NO'].isin(accidents['ACCIDENT_NO'])]

# Convert the injury level to a number; values that cannot be parsed become NaN
# and those rows are dropped. pd.to_numeric is applied to the whole column at
# once, and .assign returns a new frame rather than writing into the filtered
# slice (which would raise SettingWithCopyWarning).
person = (
    person
    .assign(injury_level=pd.to_numeric(person['INJ_LEVEL'], errors='coerce'))
    .dropna(subset=['injury_level'])
)

# A person needs an ambulance if they are not a non-injury (injury_level < 4)
# or if they were taken to hospital.
person = person.assign(
    ambulance_needed=(person['injury_level'] < 4) | (person['TAKEN_HOSPITAL'] == 'Y')
)

# person.groupby(['INJ_LEVEL'])['ambulance_needed'].unique()   # data looks good

# Per accident: True if at least one involved person needed an ambulance.
person_grouped = person.groupby('ACCIDENT_NO')['ambulance_needed'].any()

# One row per accident with its ambulance flag. This is the same result the
# previous pivot_table(aggfunc="max") produced, without the deprecated applymap.
person_pivotted = person_grouped.reset_index()

person_pivotted[['ACCIDENT_NO','ambulance_needed']].head(3)

### node

In [None]:
# Only keep nodes whose accident is still present in `accidents`.
node = node[node['ACCIDENT_NO'].isin(accidents['ACCIDENT_NO'])]

# Add numeric lat/long columns (unparseable values become NaN) and give the
# location columns their output names. REGION_NAME is renamed to `region_name`
# because that is the name the preview below, the merge and the final summary
# all select; renaming it to `region` made those lookups raise KeyError.
node = (
    node
    .assign(
        lat=pd.to_numeric(node['Lat'], errors='coerce'),
        long=pd.to_numeric(node['Long'], errors='coerce'),
    )
    .rename(columns={'REGION_NAME': 'region_name', 'LGA_NAME': 'lga', 'NODE_ID': 'node_id'})
)

node[['ACCIDENT_NO','node_id','lga','region_name','lat','long']].drop_duplicates().head(3)

### road_cond

In [None]:
# Only keep road-condition rows whose accident is still present in `accidents`.
road_cond = road_cond[road_cond['ACCIDENT_NO'].isin(accidents['ACCIDENT_NO'])]

# Remove unknown (code 9) and missing surface conditions.
road_cond = road_cond.query("SURFACE_COND != 9").dropna(subset=['SURFACE_COND'])

# Lowercase the descriptions; they become the indicator column names.
road_cond = road_cond.assign(
    **{'Surface Cond Desc': road_cond['Surface Cond Desc'].str.lower()}
)

# One row per accident, one 0/1 indicator column per surface condition.
# crosstab counts occurrences, so a repeated (accident, condition) pair no
# longer raises as DataFrame.pivot did; clip caps the counts at 1.
# The indicators are integers (the previous pivot/applymap produced 0.0/1.0).
surface_flags = pd.crosstab(
    road_cond['ACCIDENT_NO'], road_cond['Surface Cond Desc']
).clip(upper=1)

# Make sure every indicator used downstream exists, even when a condition never
# occurs in the filtered data; any other columns are kept as they are.
expected_surface_cols = ['dry', 'icy', 'muddy', 'snowy', 'wet']
missing_surface_cols = [
    col for col in expected_surface_cols if col not in surface_flags.columns
]
road_cond_pivotted = surface_flags.reindex(
    columns=[*surface_flags.columns, *missing_surface_cols], fill_value=0
).reset_index()


road_cond_pivotted[['ACCIDENT_NO', 'dry', 'icy', 'muddy', 'snowy', 'wet']].drop_duplicates().head(3)

### atmospheric

In [None]:
# Only keep atmospheric rows whose accident is still present in `accidents`.
atmospheric = atmospheric[atmospheric['ACCIDENT_NO'].isin(accidents['ACCIDENT_NO'])]

# Remove unknown (code 9) and missing atmospheric conditions.
atmospheric = atmospheric.query("ATMOSPH_COND != 9").dropna(subset=['ATMOSPH_COND'])

# Lowercase the descriptions; they become the indicator column names.
atmospheric = atmospheric.assign(
    **{'Atmosph Cond Desc': atmospheric['Atmosph Cond Desc'].str.lower()}
)

# One row per accident, one 0/1 indicator column per atmospheric condition.
# crosstab counts occurrences, so a repeated (accident, condition) pair no
# longer raises as DataFrame.pivot did; clip caps the counts at 1.
# The indicators are integers (the previous pivot/applymap produced 0.0/1.0).
atmospheric_flags = pd.crosstab(
    atmospheric['ACCIDENT_NO'], atmospheric['Atmosph Cond Desc']
).clip(upper=1)

# Make sure every indicator used downstream exists, even when a condition never
# occurs in the filtered data; any other columns are kept as they are.
expected_atmospheric_cols = ['clear', 'dust', 'fog', 'raining', 'smoke', 'snowing', 'strong winds']
missing_atmospheric_cols = [
    col for col in expected_atmospheric_cols if col not in atmospheric_flags.columns
]
atmospheric_pivotted = atmospheric_flags.reindex(
    columns=[*atmospheric_flags.columns, *missing_atmospheric_cols], fill_value=0
).reset_index()

atmospheric_pivotted[['ACCIDENT_NO', 'clear', 'dust', 'fog', 'raining', 'smoke', 'snowing', 'strong winds']].drop_duplicates().head(3)



# combining data

In [None]:
# Join the accident-level labels with the location, road-surface, atmospheric
# and ambulance tables. Every join is an inner join on ACCIDENT_NO, so an
# accident missing from any one of the tables is dropped from `output`.
# The key is named explicitly so that a column name shared by two tables in the
# future cannot silently become part of the join key.
# NOTE(review): `day_light` is computed on `accidents` but is not carried into
# `output` - confirm whether the light-level feature should be included here.
output = (
    accidents[['ACCIDENT_NO','date','police_needed']].drop_duplicates()
    .merge(
        node[['ACCIDENT_NO','node_id','lga','region_name','lat','long']].drop_duplicates(),
        on='ACCIDENT_NO',
        how='inner')
    .merge(
        road_cond_pivotted[['ACCIDENT_NO', 'dry', 'icy', 'muddy', 'snowy', 'wet']].drop_duplicates(),
        on='ACCIDENT_NO',
        how='inner')
    .merge(
        atmospheric_pivotted[['ACCIDENT_NO', 'clear', 'dust', 'fog', 'raining', 'smoke', 'snowing', 'strong winds']].drop_duplicates(),
        on='ACCIDENT_NO',
        how='inner')
    .merge(
        person_pivotted[['ACCIDENT_NO','ambulance_needed']].drop_duplicates(),
        on='ACCIDENT_NO',
        how='inner')
)

In [None]:
# Total police and ambulance demand per combination of location, road-surface
# flags and atmospheric flags.
location_cols = ['lga', 'region_name']
surface_cols = ['dry', 'icy', 'muddy', 'snowy', 'wet']
weather_cols = ['clear', 'dust', 'fog', 'raining', 'smoke', 'snowing', 'strong winds']

output.pivot_table(
    index=location_cols + surface_cols + weather_cols,
    values=['police_needed', 'ambulance_needed'],
    aggfunc='sum',
)