# Fix pathing

In [1]:
import sys


# Add the parent of the current working directory to the module search path.
# NOTE(review): presumably this is what makes the project-local `constants`
# and `utils` modules importable in the cells below — confirm the layout.
sys.path.append("..")


In [2]:
import constants

import os


# Re-point the project root at its grandparent directory (two `dirname` calls).
# The private flag on the `constants` module makes this cell idempotent:
# without it, every re-run of the cell would strip two further directory
# levels from the already-adjusted path.
if not getattr(constants, "_PROJECT_DIRECTORY_PATH_ADJUSTED", False):
    constants.PROJECT_DIRECTORY_PATH = os.path.dirname(os.path.dirname(constants.PROJECT_DIRECTORY_PATH))
    constants._PROJECT_DIRECTORY_PATH_ADJUSTED = True


# Imports

In [3]:
import utils

import regex
import pandas as pd
import numpy as np
import shapely.geometry


# Constants

In [4]:
# Identifier of the dataset this notebook works on; passed to the path helpers below.
DATASET_ID = "oslo"
# Raw input file: read by `fix_csv_errors` via `clean_incident_dataset`.
# NOTE(review): the actual locations are decided inside `utils` — not visible here.
RAW_INCIDENTS_PATH = utils.get_raw_incidents_path(DATASET_ID)
# Cleaned file: written by `clean_incident_dataset`, read by `load_clean_incident_dataframe`.
CLEAN_INCIDENTS_PATH = utils.get_clean_incidents_path(DATASET_ID)
# Processed file: written by `process_incident_dataset`, read by `load_processed_incident_dataframe`.
PROCESSED_INCIDENTS_PATH = utils.get_processed_incidents_path(DATASET_ID)


# Methods

In [5]:
def fix_csv_errors(source_filepath: str, target_filepath: str):
    """Copy a raw CSV to a new file, repairing its header and stray commas.

    The source is decoded as windows-1252 and the target is written as UTF-8.
    In the header line every `""` is replaced by `"id"`. In each data line
    that contains a parenthesised group with a comma, a comma that directly
    follows an opening parenthesis and a run of characters other than commas
    and parentheses is prefixed with a backslash.

    Args:
        source_filepath (str): The path to the source CSV file.
        target_filepath (str): The path where the corrected CSV file will be saved.
    """
    with open(source_filepath, "r", encoding="windows-1252") as raw_file:
        with open(target_filepath, "w", encoding="utf-8") as fixed_file:
            # the header row: give the unnamed column a name
            first_line = raw_file.readline()
            fixed_file.write(first_line.replace('""', '"id"'))

            # the data rows: escape commas inside parentheses where present
            for row in raw_file:
                has_comma_in_parentheses = regex.match(r".*\(.*,.*\).*", row) is not None
                if has_comma_in_parentheses:
                    # \K keeps the text before the comma out of the replaced span
                    row = regex.sub(r"\([^,()]+\K,", "\\,", row)
                fixed_file.write(row)


In [6]:
def split_geometry(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Splits the 'geometry' column of the DataFrame into two separate columns.

    Every occurrence of "c(" and ")" is removed from the 'geometry' values and
    the remainder is split on ", ". The two parts are stored as 'geometry_x'
    and 'geometry_y'; they are not cast here and remain strings.

    The passed DataFrame is modified in place (columns added, 'geometry'
    dropped) and the same object is returned.

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the 'geometry' column.

    Returns:
        pd.DataFrame: The DataFrame with the 'geometry' column split into 'geometry_x' and 'geometry_y'.
    """
    # Raw string literal: in a normal literal "\(" and "\)" are invalid escape
    # sequences, which newer Python versions warn about.
    coordinates = (
        dataframe["geometry"]
        .str.replace(r"c\(|\)", "", regex=True)
        .str.split(", ", expand=True)
    )
    dataframe[["geometry_x", "geometry_y"]] = coordinates

    # the combined column is no longer needed once it has been split
    dataframe.drop(["geometry"], axis=1, inplace=True)

    return dataframe


In [7]:
def drop_unnecessary_raw_columns(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Drops a fixed set of raw columns from the DataFrame.

    The DataFrame is modified in place and the same object is returned.
    A KeyError is raised by pandas if any of the listed columns is missing.

    Args:
        dataframe (pd.DataFrame): The raw incidents DataFrame.

    Returns:
        pd.DataFrame: The same DataFrame without the listed columns.
    """
    unused_columns = (
        "utrykningstid", "responstid",
        "gml_id", "lokalId", "navnerom", "versjonId",
        "oppdateringsdato", "datauttaksdato", "opphav",
        "rsize", "col", "row",
        "statistikkÅr",
    )
    dataframe.drop(columns=list(unused_columns), inplace=True)
    return dataframe


In [8]:
def fix_raw_types(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Casts the raw incident columns to their target dtypes.

    Identifier, coordinate and population-count columns become int64,
    'popAve' becomes float64 and the remaining listed columns become object.
    The eight timestamp columns are then parsed with the format
    "%d.%m.%Y %H:%M:%S " (note the trailing space); values that cannot be
    parsed become NaT.

    Args:
        dataframe (pd.DataFrame): DataFrame containing all listed columns.

    Returns:
        pd.DataFrame: A new DataFrame with converted columns; the input is not modified.
    """
    timestamp_columns = (
        "tidspunkt", "tiltak_opprettet", "varslet", "rykker_ut",
        "ank_hentested", "avg_hentested", "ank_levsted", "ledig",
    )
    text_columns = ("hastegrad", "ressurs_id", "tiltak_type", *timestamp_columns)
    integer_columns = (
        "id", "ssbid1000M", "xcoor", "ycoor",
        "popTot", "popFem", "popMal",
        "geometry_x", "geometry_y",
    )

    target_dtypes = {
        **dict.fromkeys(integer_columns, "int64"),
        **dict.fromkeys(text_columns, "object"),
        "popAve": "float64",
    }
    typed = dataframe.astype(target_dtypes)

    # timestamps are cast to object first, then parsed from the object values
    parsed_timestamps = {
        name: pd.to_datetime(typed[name], format="%d.%m.%Y %H:%M:%S ", errors="coerce")
        for name in timestamp_columns
    }
    return typed.assign(**parsed_timestamps)


In [9]:
def save_dataframe(dataframe: pd.DataFrame, filepath: str):
    """Writes the DataFrame as CSV to the given location, without the index column.

    Args:
        dataframe (pd.DataFrame): The DataFrame to write.
        filepath (str): Destination handed to `DataFrame.to_csv`.
    """
    dataframe.to_csv(path_or_buf=filepath, index=False)


In [10]:
def load_clean_incident_dataframe() -> pd.DataFrame:
    """Loads the cleaned incidents CSV from CLEAN_INCIDENTS_PATH.

    Explicit dtypes are applied to the non-timestamp columns, and the columns
    at positions 2, 3 and 6-11 are parsed as dates. Empty strings are read
    as missing values.

    Returns:
        pd.DataFrame: The cleaned incidents with typed columns.
    """
    integer_columns = (
        "id", "ssbid1000M", "xcoor", "ycoor",
        "popTot", "popFem", "popMal",
        "geometry_x", "geometry_y",
    )
    text_columns = ("hastegrad", "ressurs_id", "tiltak_type")
    dtypes = {
        **dict.fromkeys(integer_columns, "int64"),
        **dict.fromkeys(text_columns, "object"),
        "popAve": "float64",
    }

    # positional indices of the timestamp columns in the saved file
    date_positions = [2, 3, *range(6, 12)]

    return pd.read_csv(
        CLEAN_INCIDENTS_PATH,
        dtype=dtypes,
        na_values=[""],
        parse_dates=date_positions,
    )


In [11]:
def clean_incident_dataset():
    """Builds the cleaned incidents file from the raw incidents file.

    The raw CSV is first repaired into CLEAN_INCIDENTS_PATH. That file is then
    read back (backslash as escape character), its geometry column is split,
    a fixed set of raw columns is dropped, dtypes are fixed, and the result
    overwrites CLEAN_INCIDENTS_PATH.
    """
    fix_csv_errors(RAW_INCIDENTS_PATH, CLEAN_INCIDENTS_PATH)

    cleaned = (
        pd.read_csv(CLEAN_INCIDENTS_PATH, escapechar="\\", low_memory=False)
        .pipe(split_geometry)
        .pipe(drop_unnecessary_raw_columns)
        .pipe(fix_raw_types)
    )

    save_dataframe(cleaned, CLEAN_INCIDENTS_PATH)


In [12]:
def add_geo_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Fills 'longitude', 'latitude', 'region' and 'urban_settlement' for every row.

    For each distinct 'grid_id' the corners returned by
    `utils.get_cell_corners(x, y)` are tested against the Akershus, Oslo and
    urban-settlement boundaries loaded from the project's `data` directory:

    * 'region' is decided by a vote over the corners: None when no corner lies
      in either region, otherwise "Oslo" when it has at least as many corners
      as "Akershus" (ties go to Oslo), else "Akershus". A corner lying in both
      boundaries is counted as Akershus because that check runs first.
    * 'urban_settlement' is True when at least one corner lies inside an
      urban-settlement boundary.
    * 'longitude' / 'latitude' are the coordinates of the last corner yielded
      by `utils.get_cell_corners`.
      NOTE(review): confirm that the last corner (rather than e.g. the cell
      centre) is the intended reference point, and that the helper never
      returns an empty sequence — in that case these names would be unset
      or stale from a previous row.

    Results are cached per grid id and per corner point, so each is evaluated
    only once. The DataFrame is modified in place and also returned.

    Args:
        dataframe (pd.DataFrame): DataFrame with 'grid_id', 'x' and 'y' columns.

    Returns:
        pd.DataFrame: The same DataFrame with the four geo columns filled in.
    """
    data_directory = os.path.join(constants.PROJECT_DIRECTORY_PATH, "data")
    # NOTE(review): presumably geopandas objects — only `.contains(point).any()` is used on them.
    gdf_oslo_bounds = utils.get_bounds(file_paths=[os.path.join(data_directory, "ssb_2019_oslo_polygon_epsg4326.geojson")])
    gdf_akershus_bounds = utils.get_bounds(file_paths=[os.path.join(data_directory, "ssb_2019_akershus_polygon_epsg4326.geojson")])
    gdf_urban_settlement_bounds = utils.get_bounds(file_paths=[os.path.join(data_directory, "ssb_2021_urban_settlements_polygon_epsg4326.geojson")])

    # (longitude, latitude) -> (region, urban_settlement) of a single corner
    cached_geo_data_point = {}
    # grid_id -> (longitude, latitude, region, urban_settlement) of a whole cell
    cached_geo_data_grid_id = {}

    # Iterate over the index labels directly: only the label is needed, and
    # `iterrows()` would build a throwaway Series for every single row.
    for index in dataframe.index:
        grid_id = dataframe.at[index, "grid_id"]

        if grid_id in cached_geo_data_grid_id:
            longitude, latitude, region, urban_settlement = cached_geo_data_grid_id[grid_id]
        else:
            x = dataframe.at[index, "x"]
            y = dataframe.at[index, "y"]

            region_akershus_count = 0
            region_oslo_count = 0
            urban_settlement_count = 0

            for longitude, latitude in utils.get_cell_corners(x, y):
                if (longitude, latitude) in cached_geo_data_point:
                    region, urban_settlement = cached_geo_data_point[(longitude, latitude)]
                else:
                    point = shapely.geometry.Point(longitude, latitude)

                    # Akershus is checked before Oslo
                    region = None
                    if gdf_akershus_bounds.contains(point).any():
                        region = "Akershus"
                    elif gdf_oslo_bounds.contains(point).any():
                        region = "Oslo"

                    urban_settlement = gdf_urban_settlement_bounds.contains(point).any()

                    cached_geo_data_point[(longitude, latitude)] = (region, urban_settlement)

                if region == "Akershus":
                    region_akershus_count += 1
                elif region == "Oslo":
                    region_oslo_count += 1

                # the boolean is added as 0 or 1
                urban_settlement_count += urban_settlement

            # vote over the corners; ties are resolved in favour of Oslo
            if (region_akershus_count + region_oslo_count) == 0:
                region = None
            elif region_oslo_count >= region_akershus_count:
                region = "Oslo"
            else:
                region = "Akershus"

            urban_settlement = urban_settlement_count != 0

            # longitude / latitude still hold the last corner of the loop above
            cached_geo_data_grid_id[grid_id] = (longitude, latitude, region, urban_settlement)

        dataframe.at[index, "longitude"] = longitude
        dataframe.at[index, "latitude"] = latitude
        dataframe.at[index, "region"] = region
        dataframe.at[index, "urban_settlement"] = urban_settlement

    return dataframe


In [13]:
def load_processed_incident_dataframe() -> pd.DataFrame:
    """Loads the processed incidents CSV from PROCESSED_INCIDENTS_PATH.

    Explicit dtypes are applied to the non-timestamp columns, and the columns
    at positions 2-9 are parsed as dates. Empty strings are read as missing
    values.

    Returns:
        pd.DataFrame: The processed incidents with typed columns.
    """
    integer_columns = ("id", "grid_id", "x", "y", "x_accurate", "y_accurate")
    dtypes = {
        **dict.fromkeys(integer_columns, "int64"),
        **dict.fromkeys(("longitude", "latitude"), "float64"),
        **dict.fromkeys(("triage_impression_during_call", "region"), "object"),
        "urban_settlement": "bool",
    }

    # positional indices of the timestamp columns in the saved file
    date_positions = list(range(2, 10))

    return pd.read_csv(
        PROCESSED_INCIDENTS_PATH,
        dtype=dtypes,
        na_values=[""],
        parse_dates=date_positions,
    )


In [14]:
def Initialize_processed_incidents_dataframe() -> pd.DataFrame:
    """Creates the processed incidents DataFrame from the cleaned incidents file.

    The cleaned columns are copied under new English names, in a fixed order.
    Four placeholder columns are appended: 'longitude' and 'latitude' (NaN),
    'region' (None) and 'urban_settlement' (False).

    Returns:
        pd.DataFrame: The renamed columns followed by the placeholder columns.
    """
    clean = load_clean_incident_dataframe()

    # source column in the cleaned file -> column name in the processed file
    column_names = {
        "id": "id",
        "hastegrad": "triage_impression_during_call",
        "tidspunkt": "time_call_received",
        "tiltak_opprettet": "time_call_answered",
        "varslet": "time_ambulance_notified",
        "rykker_ut": "time_dispatch",
        "ank_hentested": "time_arrival_scene",
        "avg_hentested": "time_departure_scene",
        "ank_levsted": "time_arrival_hospital",
        "ledig": "time_available",
        "ssbid1000M": "grid_id",
        "xcoor": "x",
        "ycoor": "y",
        "geometry_x": "x_accurate",
        "geometry_y": "y_accurate",
    }

    processed = pd.DataFrame()
    for source_name, target_name in column_names.items():
        processed[target_name] = clean[source_name]

    # placeholder values for the geo columns
    return processed.assign(
        longitude=np.nan,
        latitude=np.nan,
        region=None,
        urban_settlement=False,
    )


In [15]:
def process_incident_dataset():
    """Builds the processed incidents file.

    Initializes the processed DataFrame from the cleaned incidents, fills in
    its geo columns and writes the result to PROCESSED_INCIDENTS_PATH.
    """
    processed = add_geo_data(Initialize_processed_incidents_dataframe())
    save_dataframe(processed, PROCESSED_INCIDENTS_PATH)


# Main

In [16]:
# Stage 1: repair the raw CSV and write the cleaned incidents file.
clean_incident_dataset()
# Stage 2: build the processed incidents file. It reads the cleaned file
# written by stage 1, so the order of these two calls matters.
process_incident_dataset()


In [17]:
# Reload the cleaned file from disk to inspect what was actually saved.
dataframe_clean = load_clean_incident_dataframe()

# Column dtypes and non-null counts, followed by the number of missing values per column.
dataframe_clean.info()
print(dataframe_clean.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754811 entries, 0 to 754810
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                754811 non-null  int64         
 1   hastegrad         754802 non-null  object        
 2   tidspunkt         754811 non-null  datetime64[ns]
 3   tiltak_opprettet  754811 non-null  datetime64[ns]
 4   ressurs_id        754811 non-null  object        
 5   tiltak_type       754811 non-null  object        
 6   varslet           754333 non-null  datetime64[ns]
 7   rykker_ut         731707 non-null  datetime64[ns]
 8   ank_hentested     665649 non-null  datetime64[ns]
 9   avg_hentested     560737 non-null  datetime64[ns]
 10  ank_levsted       557995 non-null  datetime64[ns]
 11  ledig             754747 non-null  datetime64[ns]
 12  ssbid1000M        754811 non-null  int64         
 13  xcoor             754811 non-null  int64         
 14  ycoo

In [18]:
# Reload the processed file from disk to inspect what was actually saved.
dataframe_processed = load_processed_incident_dataframe()

# Column dtypes and non-null counts, followed by the number of missing values per column.
dataframe_processed.info()
print(dataframe_processed.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754811 entries, 0 to 754810
Data columns (total 19 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   id                             754811 non-null  int64         
 1   triage_impression_during_call  754802 non-null  object        
 2   time_call_received             754811 non-null  datetime64[ns]
 3   time_call_answered            754811 non-null  datetime64[ns]
 4   time_ambulance_notified        754333 non-null  datetime64[ns]
 5   time_dispatch                  731707 non-null  datetime64[ns]
 6   time_arrival_scene             665649 non-null  datetime64[ns]
 7   time_departure_scene           560737 non-null  datetime64[ns]
 8   time_arrival_hospital          557995 non-null  datetime64[ns]
 9   time_available                 754747 non-null  datetime64[ns]
 10  grid_id                        754811 non-null  int64         
 11  x