# checking data

Along the way of pivoting and cleaning, we should make sure our data hasn't been 'destroyed'.

This file goes through all stages of the data and pivots each one the same way, to check at every stage whether the data is still in the same shape or whether we have lost a lot of information.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys

# Relative path prefix that is prepended to every data path read below.
ROOT_DIR = '../'
# Put the production_code directory on the import path so `constants` can be imported.
sys.path.insert(1, '../production_code/')
# NOTE(review): the star import presumably supplies the *_DIR and *_DATE names
# used throughout this notebook — confirm against constants.py.
from constants import *

In [None]:
# Load the raw accident table and the node table; the node table provides the
# ACCIDENT_NO -> REGION_NAME lookup used by the comparisons further down.
accidents = pd.read_csv(f"{ROOT_DIR}{ACCIDENT_DATA_GENERAL_DIR}")
node = pd.read_csv(f"{ROOT_DIR}{ACCIDENT_DATA_NODE_DIR}")

In [None]:
# Parse the accident date (day/month/year strings) and derive the day of the week.
#
# The columns are replaced by plain column assignment rather than
# `accidents.loc[:, col] = ...`: since pandas 2.0 the `.loc[:, col]` form writes
# into the existing column in place and can leave it as object dtype, in which
# case the `.dt` accessor on the next line raises an AttributeError.
accidents['ACCIDENTDATE'] = pd.to_datetime(accidents['ACCIDENTDATE'], format="%d/%m/%Y")
# day_of_week runs from 0 (Monday) to 6 (Sunday)
accidents['DAY_OF_WEEK'] = accidents['ACCIDENTDATE'].dt.day_of_week

# Comparing all the data stages

By grouping and pivoting the data, any stage can be broken down into:
- the total number of calls made each day, averaged over all days in a certain date range,
- then grouped by region.

So the values you see are the average total number of daily callouts for each region.

This way we can make sure that the average number of emergency calls made in each region per day stays constant across all dates, as it should.

In [None]:
# Date window shared by every comparison below; the queries treat it as
# date_min <= date < date_max.
# The original author noted TEST_SPLIT_MAX_DATE as an alternative upper bound.
date_min, date_max = (
    pd.to_datetime(bound)
    for bound in (TRAIN_SPLIT_MIN_DATE, TEST_TRAIN_SPLIT_DATE)
)


# Comparing Original

comparing with the very original

In [None]:
# Recode POLICE_ATTEND as 2 - x, so a raw 1 becomes 1 and a raw 2 becomes 0.
# NOTE(review): this overwrites the column on `accidents`, so re-running the cell
# flips the values back; a raw 9 (the "unsure" code filtered in a later cell)
# becomes -7 and is still included in the sums below — confirm this is intended.
accidents.loc[:, 'POLICE_ATTEND'] = 2 - accidents['POLICE_ATTEND']

# One region per accident number, taken from the node table.
region_lookup = node[['ACCIDENT_NO', 'REGION_NAME']].drop_duplicates()

# Attach the region, keep a single row per accident, and crop to the date window.
accidents_in_window = (
    accidents
    .merge(region_lookup, how='inner')
    .drop_duplicates(subset=['ACCIDENT_NO'])
    .query('ACCIDENTDATE >= @date_min & ACCIDENTDATE < @date_max')
)

# Total police attendance for each (day, region) pair.
daily_region_totals = accidents_in_window.pivot_table(
    index=['ACCIDENTDATE', 'REGION_NAME'],
    values=['POLICE_ATTEND'],
    aggfunc='sum',
).reset_index()

# Average of the daily totals over all days, per region.
original_pivot = (
    daily_region_totals
    .groupby(['REGION_NAME'])['POLICE_ATTEND']
    .mean()
    .sort_index()
)

# displays data
original_pivot

# Original with basic filtering

same original data but with some of the basic filtering lines added

In [None]:
# Basic filtering of the raw accident data, followed by the same
# per-region average of daily police attendance as the other stages.

# Rows without a date or a time cannot be used.
accidents = accidents.dropna(subset=['ACCIDENTDATE', 'ACCIDENTTIME'])
# accidents = accidents.query("LIGHT_CONDITION != 9")

# Remove rows where it is unsure whether police attended or not (raw code 9).
# The earlier cell recodes POLICE_ATTEND as 2 - x, which turns that code into -7,
# so testing only for 9 at this point would remove nothing. Both spellings are
# dropped so the filter works whether or not the recode has already run.
unknown_police_codes = [9, 2 - 9]
accidents = accidents[~accidents['POLICE_ATTEND'].isin(unknown_police_codes)]

# Keep only accidents strictly after EARLIEST_DATE.
accidents = accidents[
    pd.to_datetime(accidents['ACCIDENTDATE']) > pd.to_datetime(EARLIEST_DATE)
].reset_index(drop=True)

# calculates and prints data
(
    accidents
    # first merges with location data (one region per accident number)
    .merge(
        node[['ACCIDENT_NO', 'REGION_NAME']].drop_duplicates(),
        how='inner',
    )
    # and drops duplicated accidents
    .drop_duplicates(subset=['ACCIDENT_NO'])
    .query('ACCIDENTDATE >= @date_min & ACCIDENTDATE < @date_max')
    # sums the total number of police attended for each day, splitting by region
    .pivot_table(
        index=['ACCIDENTDATE', 'REGION_NAME'],
        values=['POLICE_ATTEND'],
        aggfunc='sum',
    )
    .reset_index()
    # averages police attended over all days for each region
    .groupby(['REGION_NAME'])['POLICE_ATTEND']
    .mean()
    .sort_index()
)


# roughly cleaned pre merge data

This is data still from the initial transforming, before it was merged with other rows.

In [None]:
# reads the data and applies some transformations
first_clean = pd.read_csv(ROOT_DIR + ROUGHLY_CLEANED_PRE_MERGE_DATA_DIR)
# Recode POLICE_ATTEND as 2 - x (a raw 1 becomes 1, a raw 2 becomes 0).
first_clean['POLICE_ATTEND'] = 2 - first_clean['POLICE_ATTEND']
# Truncate to whole days but keep a datetime64 column: `.dt.date` would produce
# datetime.date objects, and comparing those with the Timestamp bounds in the
# query below raises a TypeError on pandas >= 2.0.
first_clean['date'] = pd.to_datetime(first_clean['date']).dt.normalize()

# calculates and prints data
(
    first_clean
    # first merges with location data (one region per accident number)
    .merge(
        node[['ACCIDENT_NO', 'REGION_NAME']].drop_duplicates(),
        how='inner',
    )
    .query('date >= @date_min & date < @date_max')
    # sums the total number of police attended for each day, splitting by region
    .pivot_table(
        index=['date', 'REGION_NAME'],
        values=['POLICE_ATTEND'],
        aggfunc='sum',
    )
    .reset_index()
    # averages police attended over all days for each region
    .groupby(['REGION_NAME'])['POLICE_ATTEND']
    .mean()
    .sort_index()
)

# comparing to initial clean

This data is after the full initial clean, still have similar numbers as the original

In [None]:
# reads file and adds some data changes
first_clean = pd.read_csv(ROOT_DIR + ROUGHLY_CLEANED_DATA_DIR)
# Truncate to whole days but keep a datetime64 column: `.dt.date` would produce
# datetime.date objects, and comparing those with the Timestamp bounds in the
# query below raises a TypeError on pandas >= 2.0.
first_clean['date'] = pd.to_datetime(first_clean['date']).dt.normalize()

# calculates and prints data
(
    first_clean
    # first crops the data to the date window
    .query('date >= @date_min & date < @date_max')
    # sums the total number of police needed for each day, splitting by region
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # averages the daily totals over all days for each region
    .groupby(['region'])['police_needed']
    .mean()
    .sort_index()
)

# comparing to further clean

Fully cleaned data after all the visualizations and the second cleaning stage.

In [None]:
# reads file and adds some data changes
second_clean = pd.read_csv(ROOT_DIR + FULLY_CLEANED_DATA_DIR)
# Parse this frame's OWN date column. The previous version read the dates from
# `first_clean`, which pairs each row with the date of whichever row of the
# other file shares its index label, so the date filter below was applied to
# the wrong dates.
# `.dt.normalize()` truncates to whole days while keeping datetime64, so the
# comparison with the Timestamp bounds in the query works on pandas >= 2.0
# (`.dt.date` objects raise a TypeError there).
second_clean['date'] = pd.to_datetime(second_clean['date']).dt.normalize()

# calculates and prints data
(
    second_clean
    # first crops the data to the date window
    .query('date >= @date_min & date < @date_max')
    # sums the total number of police needed for each day, splitting by region
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # averages the daily totals over all days for each region
    .groupby(['region'])['police_needed']
    .mean()
    .sort_index()
)

# comparing to pre-pivoted

this is the data extracted just before completing the final pivot

In [None]:
# reads file and adds some data changes
pre_pivot = pd.read_csv(ROOT_DIR + PREPIVOT_TRAIN_TEST_DATA_DIR)
# Truncate to whole days but keep a datetime64 column: `.dt.date` would produce
# datetime.date objects, and comparing those with the Timestamp bounds in the
# query below raises a TypeError on pandas >= 2.0.
pre_pivot['date'] = pd.to_datetime(pre_pivot['date']).dt.normalize()

# calculates and prints data
(
    pre_pivot
    # first crops the data to the date window
    .query('date >= @date_min & date < @date_max')
    # sums the total number of police needed for each day, splitting by region
    .pivot_table(
        index=['date', 'region'],
        values=['police_needed'],
        aggfunc='sum',
    )
    .reset_index()
    # averages the daily totals over all days for each region
    .groupby(['region'])['police_needed']
    .mean()
    .sort_index()
)

## pre-pivoted with scaled police

This data is the same as above, but looking at the scaled_police term instead of the police_needed term to make sure the scaling was correct

In [None]:

# Same comparison as for police_needed, but on the scaled_police column.

# Crop to the date window, then total scaled_police for each (day, region) pair.
scaled_daily_totals = (
    pre_pivot
    .query('date >= @date_min & date < @date_max')
    .pivot_table(
        index=['date', 'region'],
        values=['scaled_police'],
        aggfunc='sum',
    )
    .reset_index()
)

# Average the daily totals per region.
scaled_region_mean = (
    scaled_daily_totals
    .groupby(['region'])['scaled_police']
    .mean()
    .sort_index()
)

# Divide by two because the police count was scaled to account for twice as many instances.
scaled_region_mean / 2

# comparing to final data

Final pivoted data, very close to the original even with the massive change in format.

In [None]:
# Choose which final data set to check: training or testing.
# file = REGION_TESTING_DATA_DIR
file = REGION_TRAINING_DATA_DIR
data = pd.read_csv(ROOT_DIR + file)

# Step 1: mean Police per (Region, Part of Day, Day of the Week); per the
# original author's note this averages over the raining / not-raining split.
part_of_day_mean = data.pivot_table(
    index=['Region', 'Part of Day', 'Day of the Week'],
    values=['Police'],
    aggfunc='mean',
).reset_index()

# Step 2: add the parts of the day together to get one total per (Region, Day of the Week).
day_of_week_total = part_of_day_mean.pivot_table(
    index=['Region', 'Day of the Week'],
    values=['Police'],
    aggfunc='sum',
).reset_index()

# Step 3: average those totals per region.
final_pivot = (
    day_of_week_total
    .groupby(['Region'])['Police']
    .mean()
    .sort_index()
)

# prints the data
final_pivot

# comparing with original

This shows the final data as a percentage of the initial data.
Everything being 100% would be ideal.
Most values are around that; overall there is clearly a bit of lost data, but nothing major.

In [None]:
# Final per-region average expressed as a percentage of the original per-region
# average; the two series are matched on their region index labels.
final_pivot.mul(100).div(original_pivot)