In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys

ROOT_DIR = '../'
sys.path.insert(1, '../production_code/')
from constants import *

In [None]:

# Read the two raw inputs: the general accident table and the node table.
# Both paths are project constants resolved relative to ROOT_DIR.
accidents = pd.read_csv(f"{ROOT_DIR}{ACCIDENT_DATA_GENERAL_DIR}")
node = pd.read_csv(f"{ROOT_DIR}{ACCIDENT_DATA_NODE_DIR}")

In [None]:
# Show the first row of the node table to inspect its columns and sample values.
node.iloc[0]

In [None]:
# Show the first row of the accident table to inspect its columns and sample values.
accidents.iloc[0]

In [None]:
# Parse the day/month/year strings into a real datetime64 column.
# The column is assigned directly instead of through `.loc[:, col] = ...`:
# on pandas >= 2.0 a `.loc[:, col]` write is performed in place and keeps the
# existing object dtype, which would make the `.dt` accessor below fail.
accidents['ACCIDENTDATE'] = pd.to_datetime(accidents['ACCIDENTDATE'], format="%d/%m/%Y")

# Integer day of the week derived from the parsed date (Monday=0 ... Sunday=6).
accidents['DAY_OF_WEEK'] = accidents['ACCIDENTDATE'].dt.day_of_week

# Comparing all the data stages

By grouping and pivoting the data, any stage can be broken down into the total number of calls made each day, averaged over all days in a certain date range, then grouped by region.

This way we can make sure the average number of emergency calls made in each region per day stays constant across all dates, as it should.

# Comparing Original

In [None]:
# Date window shared by every comparison cell below: [date_min, date_max).
date_min = pd.to_datetime(TRAIN_SPLIT_MIN_DATE)
date_max = pd.to_datetime(TEST_TRAIN_SPLIT_DATE)  # author's noted alternative: TEST_SPLIT_MAX_DATE

# Recode POLICE_ATTEND as (2 - code), i.e. 1 -> 1 and 2 -> 0, so that summing the
# column counts the accidents police attended.
# The untouched codes are kept in POLICE_ATTEND_RAW and the recode is always derived
# from that copy, so re-running this cell no longer flips the values back.
if 'POLICE_ATTEND_RAW' not in accidents.columns:
    accidents['POLICE_ATTEND_RAW'] = accidents['POLICE_ATTEND']
accidents['POLICE_ATTEND'] = 2 - accidents['POLICE_ATTEND_RAW']

# Average, per region, of the daily POLICE_ATTEND total inside the date window.
(
    accidents
    # attach a region to each accident via the columns shared with the node table
    .merge(node[['ACCIDENT_NO', 'REGION_NAME']].drop_duplicates(), how='inner')
    # keep a single row per accident
    .drop_duplicates(subset=['ACCIDENT_NO'])
    .query('ACCIDENTDATE >= @date_min & ACCIDENTDATE < @date_max')
    # total POLICE_ATTEND for each (day, region) pair
    .pivot_table(
        index=['ACCIDENTDATE', 'REGION_NAME'],
        values=['POLICE_ATTEND'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['REGION_NAME'])['POLICE_ATTEND'].mean()
)

# Original with basic filtering

In [None]:
# Filters that are currently disabled:
# accidents = accidents.dropna(subset=['ACCIDENTDATE'])
# accidents = accidents.dropna(subset=['ACCIDENTTIME'])
# accidents = accidents.query("LIGHT_CONDITION != 9")

# Remove accidents where it is unknown whether police attended (raw code 9).
# POLICE_ATTEND has already been recoded as (2 - code) by an earlier cell, so the
# unknown code is stored as 2 - 9 = -7 by the time this cell runs; comparing against
# 9 alone would remove nothing. Both spellings are excluded so the filter is correct
# regardless of whether the recode has been applied.
accidents = accidents[~accidents['POLICE_ATTEND'].isin([9, 2 - 9])]

# Keep only accidents strictly after EARLIEST_DATE.
accidents = accidents[
    pd.to_datetime(accidents['ACCIDENTDATE']) > pd.to_datetime(EARLIEST_DATE)
].reset_index(drop=True)

# Average, per region, of the daily POLICE_ATTEND total inside the date window.
(
    accidents
    # NOTE(review): this cell joins on the columns shared with NODE_ID/REGION_NAME,
    # whereas the other cells join through ACCIDENT_NO -- confirm this is intended.
    .merge(node[['NODE_ID', 'REGION_NAME']].drop_duplicates(), how='inner')
    # keep a single row per accident
    .drop_duplicates(subset=['ACCIDENT_NO'])
    .query('ACCIDENTDATE >= @date_min & ACCIDENTDATE < @date_max')
    # total POLICE_ATTEND for each (day, region) pair
    .pivot_table(
        index=['ACCIDENTDATE', 'REGION_NAME'],
        values=['POLICE_ATTEND'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['REGION_NAME'])['POLICE_ATTEND'].mean()
)


# Roughly cleaned pre-merge data

In [None]:
# Load the roughly cleaned, pre-merge accident data.
first_clean = pd.read_csv(ROOT_DIR + ROUGHLY_CLEANED_PRE_MERGE_DATA_DIR)

# Recode POLICE_ATTEND as (2 - code), i.e. 1 -> 1 and 2 -> 0, so sums count attendances.
first_clean['POLICE_ATTEND'] = 2 - first_clean['POLICE_ATTEND']

# Keep `date` as a midnight-normalised datetime64 column rather than Python `date`
# objects: the query below compares it with pd.Timestamp bounds, and pandas >= 2.0
# raises TypeError when ordering a Timestamp against a datetime.date.
first_clean['date'] = pd.to_datetime(first_clean['date']).dt.normalize()

# Average, per region, of the daily POLICE_ATTEND total inside the date window.
(
    first_clean
    # attach a region to each accident via the columns shared with the node table
    .merge(node[['ACCIDENT_NO', 'REGION_NAME']].drop_duplicates(), how='inner')
    .query('date >= @date_min & date < @date_max')
    # total POLICE_ATTEND for each (day, region) pair
    .pivot_table(
        index=['date', 'REGION_NAME'],
        values=['POLICE_ATTEND'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['REGION_NAME'])['POLICE_ATTEND'].mean()
)

In [None]:

# Rebuild the accident/node merge by hand from freshly loaded inputs.
node = pd.read_csv(ROOT_DIR + ACCIDENT_DATA_NODE_DIR)
first_clean = pd.read_csv(ROOT_DIR + ROUGHLY_CLEANED_PRE_MERGE_DATA_DIR)

# Keep `date` as a midnight-normalised datetime64 column rather than Python `date`
# objects: the query below compares it with pd.Timestamp bounds, and pandas >= 2.0
# raises TypeError when ordering a Timestamp against a datetime.date.
first_clean['date'] = pd.to_datetime(first_clean['date']).dt.normalize()

# only keeping instances with ids that exist
print("initial shape: " + str(node.shape))
node = (
    node[node['ACCIDENT_NO'].isin(accidents['ACCIDENT_NO'])]
    .rename(columns={
        'REGION_NAME': 'region',
        'LGA_NAME': 'lga',
        'NODE_ID': 'node_id',
        'Lat': 'lat',
        'Long': 'long',
    })
    # Built as one chain so every step works on a fresh frame instead of writing
    # into a filtered slice; numeric conversion is done on the whole column at once.
    .assign(
        region=lambda frame: frame['region'].astype(str),
        lga=lambda frame: frame['lga'].astype(str),
        lat=lambda frame: pd.to_numeric(frame['lat']),
        long=lambda frame: pd.to_numeric(frame['long']),
    )
    # removing blank regions
    .query('region != " "')
)

# Column groups used to select what goes into the merge.
accidents_of_interest = ['date']
accidents_label_columns = ['police_needed']
id_columns = ['ACCIDENT_NO']
node_of_interest = ['node_id', 'lga', 'region', 'lat', 'long']

# Label: POLICE_ATTEND recoded as (2 - code), i.e. 1 -> 1 and 2 -> 0.
first_clean['police_needed'] = 2 - first_clean['POLICE_ATTEND']

# Average, per region, of the daily police_needed total inside the date window.
(
    first_clean[id_columns + accidents_of_interest + accidents_label_columns]
    .drop_duplicates()
    .merge(node[id_columns + node_of_interest].drop_duplicates(), how='inner')
    .query('date >= @date_min & date < @date_max')
    # total police_needed for each (day, region) pair
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['region'])['police_needed'].mean()
)
        

# merge stages

In [None]:
# Load the output of merge stage 1.
stage_1 = pd.read_csv(ROOT_DIR + ROUGHLY_CLEANED_MERGE_1_DATA_DIR)

# Keep `date` as a midnight-normalised datetime64 column rather than Python `date`
# objects: the query below compares it with pd.Timestamp bounds, and pandas >= 2.0
# raises TypeError when ordering a Timestamp against a datetime.date.
stage_1['date'] = pd.to_datetime(stage_1['date']).dt.normalize()

# Average, per region, of the daily police_needed total inside the date window.
(
    stage_1
    .query('date >= @date_min & date < @date_max')
    # total police_needed for each (day, region) pair
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['region'])['police_needed'].mean()
)

In [None]:
# Load the output of merge stage 2.
stage_2 = pd.read_csv(ROOT_DIR + ROUGHLY_CLEANED_MERGE_2_DATA_DIR)

# Keep `date` as a midnight-normalised datetime64 column rather than Python `date`
# objects: the query below compares it with pd.Timestamp bounds, and pandas >= 2.0
# raises TypeError when ordering a Timestamp against a datetime.date.
stage_2['date'] = pd.to_datetime(stage_2['date']).dt.normalize()

# Average, per region, of the daily police_needed total inside the date window.
(
    stage_2
    .query('date >= @date_min & date < @date_max')
    # total police_needed for each (day, region) pair
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['region'])['police_needed'].mean()
)

In [None]:
# NOTE(review): this loads ROUGHLY_CLEANED_MERGE_2_DATA_DIR, the same file as the
# stage_2 cell, so the result below can only repeat stage 2. This looks like a
# copy-paste slip -- confirm the constant for the stage-3 output and use it here.
stage_3 = pd.read_csv(ROOT_DIR + ROUGHLY_CLEANED_MERGE_2_DATA_DIR)

# Keep `date` as a midnight-normalised datetime64 column rather than Python `date`
# objects: the query below compares it with pd.Timestamp bounds, and pandas >= 2.0
# raises TypeError when ordering a Timestamp against a datetime.date.
stage_3['date'] = pd.to_datetime(stage_3['date']).dt.normalize()

# Average, per region, of the daily police_needed total inside the date window.
(
    stage_3
    .query('date >= @date_min & date < @date_max')
    # total police_needed for each (day, region) pair
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['region'])['police_needed'].mean()
)

# comparing to initial clean

In [None]:
# Load the roughly cleaned data set (this rebinds `first_clean` from the earlier cells).
first_clean = pd.read_csv(ROOT_DIR + ROUGHLY_CLEANED_DATA_DIR)

# Keep `date` as a midnight-normalised datetime64 column rather than Python `date`
# objects: the query below compares it with pd.Timestamp bounds, and pandas >= 2.0
# raises TypeError when ordering a Timestamp against a datetime.date.
first_clean['date'] = pd.to_datetime(first_clean['date']).dt.normalize()

# Average, per region, of the daily police_needed total inside the date window.
(
    first_clean
    .query('date >= @date_min & date < @date_max')
    # total police_needed for each (day, region) pair
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['region'])['police_needed'].mean()
)

# comparing to further clean

In [None]:
# Load the fully cleaned data set.
second_clean = pd.read_csv(ROOT_DIR + FULLY_CLEANED_DATA_DIR)

# Parse this frame's OWN date column. The previous version built it from
# `first_clean['date']`, which silently aligned another data set's dates onto
# these rows by index.
# The column is kept as midnight-normalised datetime64 rather than Python `date`
# objects: the query below compares it with pd.Timestamp bounds, and pandas >= 2.0
# raises TypeError when ordering a Timestamp against a datetime.date.
second_clean['date'] = pd.to_datetime(second_clean['date']).dt.normalize()

# Average, per region, of the daily police_needed total inside the date window.
(
    second_clean
    .query('date >= @date_min & date < @date_max')
    # total police_needed for each (day, region) pair
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['region'])['police_needed'].mean()
)

# Comparing to pre-pivoted

In [None]:
# Load the pre-pivot train/test data set.
pre_pivot = pd.read_csv(ROOT_DIR + PREPIVOT_TRAIN_TEST_DATA_DIR)

# Keep `date` as a midnight-normalised datetime64 column rather than Python `date`
# objects: the query below compares it with pd.Timestamp bounds, and pandas >= 2.0
# raises TypeError when ordering a Timestamp against a datetime.date.
pre_pivot['date'] = pd.to_datetime(pre_pivot['date']).dt.normalize()

# Average, per region, of the daily police_needed total inside the date window.
(
    pre_pivot
    .query('date >= @date_min & date < @date_max')
    # total police_needed for each (day, region) pair
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # mean of those daily totals for each region
    .groupby(['region'])['police_needed'].mean()
)

## Pre-pivoted with scaled police

In [None]:
# Same comparison on the `scaled_police` column of the pre-pivot data.
(
    pre_pivot
    # Coerce `date` to datetime64 inside the chain so the comparison with the
    # pd.Timestamp bounds works whatever dtype the column currently has
    # (pandas >= 2.0 raises TypeError when ordering a Timestamp against datetime.date).
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .query('date >= @date_min & date < @date_max')
    # total scaled_police for each (region, sky, day) combination
    .pivot_table(
        index=['region', 'sky', 'date'],
        values=['scaled_police'],
        aggfunc='sum',
    )
    .reset_index()
    # average those totals over the `sky` values for each (region, day)
    .pivot_table(
        index=['region', 'date'],
        values=['scaled_police'],
        aggfunc='mean',
    )
    .reset_index()
    # mean of the daily figures for each region
    .groupby(['region'])['scaled_police'].mean()
)

# comparing to final data

In [None]:
# Choose which final data set to check. The training split is the one in effect;
# swap in REGION_TESTING_DATA_DIR to inspect the testing split instead.
# (Previously both constants were assigned in turn, so the first was a dead store.)
file = REGION_TRAINING_DATA_DIR

# NOTE: the name `test_data` is kept as-is even though it holds whichever split
# `file` points at.
test_data = pd.read_csv(ROOT_DIR + file)

(
    test_data
    # mean Police for each (Region, Part of Day, Day of the Week)
    # (original note: averaging raining or not)
    .pivot_table(
        index=['Region', 'Part of Day', 'Day of the Week'],
        values=['Police'],
        aggfunc='mean',
    )
    .reset_index()
    # add the parts of day together for each (Region, Day of the Week)
    .pivot_table(
        index=['Region', 'Day of the Week'],
        values=['Police'],
        aggfunc='sum',
    )
    .reset_index()
    # average over the days of the week for each Region
    .groupby(['Region'])['Police'].mean()
)

# test_data