# Fix pathing

In [1]:
import sys


# Add the parent directory to the module search path; presumably this is what
# lets the project-local `constants` and `datahandler` imports below resolve
# when the notebook runs from a subdirectory — verify against the repo layout.
sys.path.append("..")


In [2]:
import constants

import os


# Re-point the project root two directory levels above the value that
# `constants` computed on import (two nested dirname calls).
# NOTE(review): this mutates shared module state, so every module that reads
# constants.PROJECT_DIRECTORY_PATH afterwards sees the overridden value.
constants.PROJECT_DIRECTORY_PATH = os.path.dirname(os.path.dirname(constants.PROJECT_DIRECTORY_PATH))


# Imports

In [3]:
import datahandler

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates


# Load data

In [4]:
# Run the preprocessor first; its progress output reports cleaning, processing
# and enhancing stages.
data_preprocessor = datahandler.DataPreprocessorOUS_V2()
data_preprocessor.execute()

# NOTE(review): DataLoader receives the preprocessor *class*, not the instance
# created above — confirm this is the intended API.
data_loader = datahandler.DataLoader(datahandler.DataPreprocessorOUS_V2)
# Presumably skips the cleaned dataset and loads only the processed and
# enhanced ones — verify against DataLoader.execute.
data_loader.execute(clean=False, processed=True, enhanced=True)


Cleaning dataset: 100%|██████████| 2/2 [00:00<00:00, 1002.46it/s]
Processing dataset: 100%|██████████| 2/2 [00:00<00:00, 1006.79it/s]
Enhancing dataset: 100%|██████████| 2/2 [00:00<00:00, 1002.94it/s]
Loading dataset:   0%|          | 0/4 [00:00<?, ?it/s]

Loading dataset: 100%|██████████| 4/4 [00:27<00:00,  6.89s/it]


# Methods

In [5]:
def count_wrong_timestamps(dataframe: pd.DataFrame):
    """Print how many rows violate the expected order of consecutive timestamps.

    The timestamp columns are listed in their expected chronological order.
    For each adjacent pair, one line is printed with the number of rows whose
    earlier timestamp is strictly greater than the later one. Rows where the
    comparison is not true (including missing values) are not counted.

    Args:
        dataframe: Frame containing all eight ``time_*`` columns listed below.

    Returns:
        None. The counts are only printed.
    """
    ordered_columns = (
        "time_call_received",
        "time_call_processed",
        "time_ambulance_notified",
        "time_dispatch",
        "time_arrival_scene",
        "time_departure_scene",
        "time_arrival_hospital",
        "time_available",
    )

    # Walk over adjacent (earlier, later) column pairs.
    for earlier, later in zip(ordered_columns, ordered_columns[1:]):
        out_of_order = dataframe[earlier] > dataframe[later]
        violations = int(out_of_order.sum())
        print(f"{earlier} to {later}: {violations}")


In [6]:
def fix_timestamps(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Repair rows where ``time_call_received`` is after ``time_call_processed``.

    For every such row, ``time_call_received`` is replaced by
    ``time_call_processed`` minus the mean received-to-processed delay of the
    valid rows (received <= processed) on the same calendar day, where the day
    is taken from the original ``time_call_received``. If that day has no valid
    rows, the mean delay over all valid rows is used instead. If there are no
    valid rows at all, the offending rows are left unchanged.

    The computation is fully vectorized (no row-wise ``apply``) and works on a
    copy, so the caller's frame is never modified.

    Args:
        dataframe: Frame with ``time_call_received`` and ``time_call_processed``
            columns holding datetimes or values ``pd.to_datetime`` can parse.

    Returns:
        A new frame with the same columns, where both timestamp columns are
        datetimes and the repairable rows have been corrected.
    """
    fixed = dataframe.copy()
    received = pd.to_datetime(fixed['time_call_received'])
    processed = pd.to_datetime(fixed['time_call_processed'])

    # Delay for valid rows only; invalid or missing rows become NaT so they do
    # not contribute to any mean.
    delay = processed - received
    valid_delay = delay.where(received <= processed)

    # Per-row estimate: mean valid delay of the row's calendar day.
    estimated_delay = valid_delay.groupby(received.dt.normalize()).transform('mean')

    # Days without a single valid row fall back to the overall mean delay.
    overall_delay = valid_delay.mean()
    if pd.notna(overall_delay):
        estimated_delay = estimated_delay.fillna(overall_delay)

    # Only touch rows that are wrong and for which an estimate exists.
    needs_fix = (received > processed) & estimated_delay.notna()
    fixed['time_call_received'] = received.mask(needs_fix, processed - estimated_delay)
    fixed['time_call_processed'] = processed

    return fixed


In [None]:
def find_first_problematic_row(dataframe: pd.DataFrame):
    """Return the index label of the first row with received > processed.

    Rows are scanned in the frame's current order. The timestamp columns are
    parsed into local Series for the comparison, so the caller's frame is not
    modified by this lookup.

    Args:
        dataframe: Frame with ``time_call_received`` and ``time_call_processed``
            columns holding datetimes or values ``pd.to_datetime`` can parse.

    Returns:
        The index label of the first offending row, or ``None`` if every row
        has ``time_call_received`` not after ``time_call_processed``.
    """
    received = pd.to_datetime(dataframe['time_call_received'])
    processed = pd.to_datetime(dataframe['time_call_processed'])

    out_of_order = received > processed
    if not out_of_order.any():
        print("No problematic rows found.")
        return None

    # Keep only the True entries; the first label is the first offender.
    first_problematic_index = out_of_order[out_of_order].index[0]
    print(f"First problematic row index: {first_problematic_index}")
    return first_problematic_index


# Main

In [7]:
# Baseline: ordering violations in the processed incidents before any filtering.
count_wrong_timestamps(data_loader.processed_incidents_df)


time_call_received to time_call_processed: 69721
time_call_processed to time_ambulance_notified: 7688
time_ambulance_notified to time_dispatch: 13
time_dispatch to time_arrival_scene: 34
time_arrival_scene to time_departure_scene: 2
time_departure_scene to time_arrival_hospital: 2
time_arrival_hospital to time_available: 34


In [8]:
# Re-apply the preprocessor's steps one at a time so the effect of each stage
# on the timestamp violations can be inspected in the cells below.
# NOTE(review): these are private (underscore-prefixed) methods of the
# preprocessor; their exact semantics and required order are not visible here.
dataframe = data_preprocessor._remove_duplicates(data_loader.processed_incidents_df)
dataframe = data_preprocessor._remove_incomplete_years(dataframe)
dataframe = data_preprocessor._remove_outside_region(dataframe)
dataframe = data_preprocessor._remove_other_resource_types(dataframe)
dataframe = data_preprocessor._count_resources_sent(dataframe)
dataframe = data_preprocessor._remove_extra_resources(dataframe)

# Order the remaining rows chronologically by the time the call was received.
dataframe = dataframe.sort_values(by="time_call_received")


In [9]:
# Violations remaining after the filtering steps above.
count_wrong_timestamps(dataframe)


time_call_received to time_call_processed: 29433
time_call_processed to time_ambulance_notified: 2496
time_ambulance_notified to time_dispatch: 5
time_dispatch to time_arrival_scene: 17
time_arrival_scene to time_departure_scene: 0
time_departure_scene to time_arrival_hospital: 0
time_arrival_hospital to time_available: 6


In [10]:
# Distribution of triage codes before removing the unwanted categories.
dataframe["triage_impression_during_call"].value_counts()


triage_impression_during_call
H     145959
A     137909
V1     41852
V2     32787
V         14
Name: count, dtype: int64

In [11]:
# Drop rows with triage codes the preprocessor does not keep (private helper;
# which codes are kept is defined in the preprocessor, not in this notebook).
dataframe = data_preprocessor._remove_other_triage_impressions(dataframe)


In [12]:
# Re-check the triage distribution after the triage filter.
dataframe["triage_impression_during_call"].value_counts()


triage_impression_during_call
H     145959
A     137909
V1     41852
Name: count, dtype: int64

In [13]:
# Violations remaining after the triage filter.
count_wrong_timestamps(dataframe)


time_call_received to time_call_processed: 2372
time_call_processed to time_ambulance_notified: 2483
time_ambulance_notified to time_dispatch: 4
time_dispatch to time_arrival_scene: 14
time_arrival_scene to time_departure_scene: 0
time_departure_scene to time_arrival_hospital: 0
time_arrival_hospital to time_available: 4


In [14]:
# Apply the preprocessor's own timestamp filter (private helper).
# NOTE(review): judging by the counts printed afterwards, it does not address
# the time_call_received / time_call_processed pair — confirm in the preprocessor.
dataframe = data_preprocessor._remove_wrong_timestamps(dataframe)


In [15]:
# Triage distribution after the timestamp filter, to see how many rows each category lost.
dataframe["triage_impression_during_call"].value_counts()


triage_impression_during_call
H     145320
A     136151
V1     41744
Name: count, dtype: int64

In [16]:
# Violations remaining after the preprocessor's timestamp filter.
count_wrong_timestamps(dataframe)


time_call_received to time_call_processed: 2372
time_call_processed to time_ambulance_notified: 0
time_ambulance_notified to time_dispatch: 0
time_dispatch to time_arrival_scene: 0
time_arrival_scene to time_departure_scene: 0
time_departure_scene to time_arrival_hospital: 0
time_arrival_hospital to time_available: 0


In [17]:
# Repair the rows where time_call_received is still after time_call_processed.
dataframe = fix_timestamps(dataframe)


KeyboardInterrupt: 

In [None]:
# Final check: the first pair should no longer report violations after the repair.
count_wrong_timestamps(dataframe)
