In [450]:
import pandas as pd
import os

directory = os.getcwd() + "/data/"

interventions_bxl = pd.read_parquet(directory + "interventions_bxl.parquet.gzip")
interventions_bxl2 = pd.read_parquet(directory + "interventions_bxl2.parquet.gzip")
interventions1 = pd.read_parquet(directory + "interventions1.parquet.gzip")
interventions2 = pd.read_parquet(directory + "interventions2.parquet.gzip")
interventions3 = pd.read_parquet(directory + "interventions3.parquet.gzip")
cad9 = pd.read_parquet(directory + "cad9.parquet.gzip")
aed_locations = pd.read_parquet(directory + "aed_locations.parquet.gzip")
ambulance_locations = pd.read_parquet(directory + "ambulance_locations.parquet.gzip")
mug_locations = pd.read_parquet(directory + "mug_locations.parquet.gzip")
pit_locations = pd.read_parquet(directory + "pit_locations.parquet.gzip")

In [451]:
print(len(aed_locations))

15227


In [453]:
# 2 instances with missing id's exist. Upon inspection these instances seem valid.
missing_id = aed_locations[aed_locations['id'].isnull()]
missing_id

Unnamed: 0,id,type,address,number,postal_code,municipality,province,location,public,available,hours
13019,,Appareil fixe-Vast apparaat,Antwerpsesteenweg,37.0,2450.0,Vosselaar,Antwerpen,Inkomhal guesthouse Janssen Pharmaceutica,Non-Nee,Oui-Ja,
13515,,Appareil fixe-Vast apparaat,Kapiteinstraat,55.0,9000.0,Gent,Oost-Vlaanderen,naast de poort aan de straatzijde,Oui-Ja,Oui-Ja,


In [454]:
na_replacement_values = iter([99998, 99999])

# replace NaN values with 99999, 99998 respectively
aed_locations['id'] = aed_locations['id'].apply(lambda x: next(na_replacement_values) if pd.isna(x) else x)

#aed_locations['id'] = aed_locations['id'].astype(int)

In [455]:
#2 duplicate instances existed in the dataset
aed_locations_no_dup = aed_locations.drop_duplicates()
aed_locations_no_dup

Unnamed: 0,id,type,address,number,postal_code,municipality,province,location,public,available,hours
0,13.0,,Blvd. Fr. Roosevelt,24.0,7060.0,SOIGNIES,Hainaut,,Y,,
1,70.0,,Ch. De Wégimont,76.0,4630.0,Ayeneux,Liège,,,,
2,71.0,,Place Saint - Lambert,,4020.0,Liège,Liège,,,,
3,72.0,,Rue du Doyard,,4990.0,Lierneux,Liège,,,,
4,73.0,,Fond Saint Servais,,4000.0,Liège,Liège,,,,
...,...,...,...,...,...,...,...,...,...,...,...
15222,16662.0,Appareil fixe-Vast apparaat,Nekkerspoel-Borcht,19.0,2800.0,Mechelen,Antwerpen,reddersgebouw aan zwemvijver,Non-Nee,Non-Nee,
15223,16664.0,Appareil fixe-Vast apparaat,Nieuwe Dreef,17.0,9160.0,Lokeren,Oost-Vlaanderen,Locatie : ingang poort hoofdgebouw,Oui-Ja,Oui-Ja,
15224,16665.0,Appareil fixe-Vast apparaat,Panterschipstraat,207.0,9000.0,Gent,Oost-Vlaanderen,,Oui-Ja,Non-Nee,
15225,16666.0,Appareil fixe-Vast apparaat,Albert Leemansplein,20.0,1050.0,Bruxelles,Bruxelles-Brussel,,Oui-Ja,Non-Nee,


In [456]:
#there are non-unique id's (i.e. some id's got inputted more than once). These instances are firstly inspected before taking any further steps
duplicate_ids = aed_locations[aed_locations['id'].duplicated(keep=False)]
duplicate_ids

#it seems that the duplicate id's are actually fine, but that the id value just got inputted more than once accidentally. One could ask the stakeholder
#what happened. We will simply give the duplicate id's other values, so that each AED has a unique id.

Unnamed: 0,id,type,address,number,postal_code,municipality,province,location,public,available,hours
331,846.0,Appareil fixe-Vast apparaat,Rue Mondron,40.0,6042.0,Lodelinsart,Hainaut,Salle de contrôle du coater,Non-Nee,Oui-Ja,
332,846.0,,Rue Pige-au-Croly,157.0,6042.0,Charleroi,Hainaut,,N,,
333,847.0,Appareil fixe-Vast apparaat,Rue Mondron,40.0,6042.0,Lodelinsart,Hainaut,Entrée du hall 2 : VIG,Non-Nee,Oui-Ja,
334,847.0,,Rue Pige-au-Croly,157.0,6042.0,CHARLEROI,Hainaut,,N,,
1074,1814.0,,Oude Dyck,1.0,9130.0,Beveren-Waas,Oost-Vlaanderen,"glvl, lokaal rechts receptie",,,
...,...,...,...,...,...,...,...,...,...,...,...
14823,16279.0,Appareil fixe-Vast apparaat,boombekelaan,12.0,2660.0,HOBOKEN,Antwerpen,Inkom werkplaats,Oui-Ja,,
14827,16278.0,Appareil fixe-Vast apparaat,Rue de Niederpallen,1.0,8506.0,Redange,Luxembourg,,Oui-Ja,Non-Nee,
14828,16279.0,Appareil fixe-Vast apparaat,Haneboesch,2.0,4563.0,Differdange,Luxembourg,,Oui-Ja,Non-Nee,
15015,16461.0,Appareil fixe-Vast apparaat,Gouvernementstraat,,9000.0,GENT,Oost-Vlaanderen,/,Oui-Ja,Oui-Ja,


In [457]:
#counter to create the new different id values
counter = {}

for index, row in duplicate_ids.iterrows():
    original_id = row['id']
    
    if original_id not in counter:
        counter[original_id] = 0
    else:
        counter[original_id] += 1
    
    # modify the id by adding 90000, 900000, 9000000, etc., according to how many times it occurs in the dataset
    aed_locations_no_dup.at[index, 'id'] = original_id + 90000 * 10 ** counter[original_id]

#test to see if it actually works
print(aed_locations_no_dup[aed_locations_no_dup['id'] >= 90000])

              id                         type             address  number  \
331      90846.0  Appareil fixe-Vast apparaat         Rue Mondron    40.0   
332     900846.0                         None   Rue Pige-au-Croly   157.0   
333      90847.0  Appareil fixe-Vast apparaat        Rue Mondron     40.0   
334     900847.0                         None   Rue Pige-au-Croly   157.0   
1074     91814.0                         None           Oude Dyck     1.0   
...          ...                          ...                 ...     ...   
14828   916279.0  Appareil fixe-Vast apparaat          Haneboesch     2.0   
15015   106461.0  Appareil fixe-Vast apparaat  Gouvernementstraat     NaN   
15017   916461.0  Appareil fixe-Vast apparaat         Diepvenneke    45.0   
13134  9010899.0                          NaN                 NaN     NaN   
13137  9010900.0                          NaN                 NaN     NaN   

       postal_code  municipality         province  \
331         6042.0   L

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aed_locations_no_dup.at[index, 'id'] = original_id + 90000 * 10 ** counter[original_id]


In [458]:
# we want to store all AED's considered poorly performing and the reason for why they are considered so. During the data cleaning, 
# some AED's will already get filtered, and are considered poorly performing.

# creating a dataframe to store these AED's. Only the ID, the reason for removal, province and municipality are stored.
removed_AED = pd.DataFrame(columns = ['id','reason', 'province', 'municipality'])

# function to filter and store the AED's
def filter_and_log_removed(df, filter_condition, reason):
    # apply the filter condition
    removed = df[filter_condition]
    
    # append removed devices to the removed_AED DataFrame
    global removed_AED
    removed_AED = pd.concat([removed_AED, pd.DataFrame({'id': removed['id'], 'reason': reason, 'province': removed['province'],
                                                        'municipality': removed['municipality']})])
    
    # remove the filtered devices from the original DataFrame
    df = df.drop(removed.index)
    
    return df

In [459]:
#count the number of missing addresses in aed_locations
aed_missing_location_count = aed_locations_no_dup['address'].isna().sum()
print(aed_missing_location_count)

#filter out instances where address is missing. Those instances are invalid, since we cannot know where the AED was placed
#first we inspect these instances, perhaps address was present under the wrong variable
aed_locations_with_na = aed_locations_no_dup[aed_locations['address'].isna()]
print(aed_locations_with_na)

#apply our function in order to filter out and store the filtered instances
aed_locations_cleaned = filter_and_log_removed(aed_locations_no_dup, aed_locations_no_dup['address'].isna(), 'address_missing')

4
          id  type address  number  postal_code  municipality    province  \
6092  7323.0  None    None     NaN       3910.0      Neerpelt     Limburg   
6212  7467.0  None    None     0.0       6870.0  Saint-Hubert  Luxembourg   

     location public available hours  
6092     None   None      None  None  
6212     None   None      None  None  


  aed_locations_with_na = aed_locations_no_dup[aed_locations['address'].isna()]
  removed_AED = pd.concat([removed_AED, pd.DataFrame({'id': removed['id'], 'reason': reason, 'province': removed['province'],


In [460]:
len(aed_locations_cleaned) #check

15223

In [461]:
aed_missing_number_count = aed_locations['number'].isna().sum()
print(aed_missing_number_count)

# proportion missing:
print(aed_missing_number_count/aed_locations_cleaned.shape[0])

# around 14% of the cleaned instances has a missing streetnumber. This is a significant amount, which means that simply removing them would have too
#much of an impact on the results. Therefore in the geocoding below, we simply input them without a street number.
#in a real world scenario, one might ask the stakeholder what happened with this data, perhaps the data is available somewhere else.

2142
0.14070813900019707


In [462]:
aed_type_counts = aed_locations['type'].value_counts()
print(aed_type_counts)

aed_missing_type_count = aed_locations['type'].isna().sum()
print(aed_missing_type_count)
print(len(aed_locations))
# the majority of the known AED types in the cleaned table is fixed apparatus (5166, or 33.93%) , with mobile apparatus only a very small part 
#(17, or 0.11%) of the group, and only one of M5066A type. The majority is of unknown type (10060, or 66.07%).
#in a real world scenario it would again be possible to ask the stakeholder about these missing values.
#we assume that all types of devices are useable.

type
Appareil fixe-Vast apparaat         5048
Appareil fixe                         66
Appareil Fixe                         19
Appareil Mobile- Mobiel apparaat      17
Vast apparaat                         16
M5066A                                 1
Name: count, dtype: int64
10060
15227


In [463]:
#we can remove the type column, we don't need it for further analysis
aed_locations_cleaned = aed_locations_cleaned.drop(columns = ['type'])

In [464]:
aed_locations.head(10)

Unnamed: 0,id,type,address,number,postal_code,municipality,province,location,public,available,hours
0,13.0,,Blvd. Fr. Roosevelt,24.0,7060.0,SOIGNIES,Hainaut,,Y,,
1,70.0,,Ch. De Wégimont,76.0,4630.0,Ayeneux,Liège,,,,
2,71.0,,Place Saint - Lambert,,4020.0,Liège,Liège,,,,
3,72.0,,Rue du Doyard,,4990.0,Lierneux,Liège,,,,
4,73.0,,Fond Saint Servais,,4000.0,Liège,Liège,,,,
5,74.0,,Rue des Prémontrès,12.0,4020.0,Liège,Liège,,,,
6,75.0,,Route de Bastogne,1.0,4920.0,Harzé,Liège,,,,
7,76.0,,Rue du Parc,1.0,4540.0,Jehay,Liège,,,,
8,77.0,,Blvd. De la Constitution,19.0,4020.0,Liège,Liège,,,,
9,78.0,,Place de la République française,,4000.0,Liège,Liège,,,,


In [465]:
aed_location_counts = aed_locations['location'].value_counts()
print(aed_location_counts)
#the location variable contains a description of where the AED is located exactly. This is not useable for the analysis, so the variable is simply
#removed later on
#aed_locations_cleaned.drop(columns = ['location'])

location
/                                              314
.                                              289
Accueil                                        196
                                                89
INKOM                                           81
                                              ... 
Gelijkvloers buiten tegen loods                  1
Hoofdgebouw, 1ste verdieping, thv cafetaria      1
reddersgebouw aan zwemvijver                     1
Locatie : ingang poort hoofdgebouw               1
 inkomhal, glvl                                  1
Name: count, Length: 5732, dtype: int64


In [466]:
aed_public_counts = aed_locations['public'].value_counts()
print(aed_public_counts)
print(aed_locations['public'].isna().sum())
#1220 devices are not publicly available. It would be useful to have more information on these, to decide whether or not they could still be used in
#case of a cardiac arrest.
aed_nonpublic_aed = aed_locations[(aed_locations['public'] == 'Non-Nee') | (aed_locations['public'] == 'N')]
aed_nonpublic_aed_with_available = aed_nonpublic_aed.dropna(subset=['available'])#only check for the ones that have a value for available, then we can
#decide for those if they are useable
print(aed_nonpublic_aed_with_available['available'].value_counts())
#there seem to be contradictions with the data, where it says for the public variable that it is not available, but for the available variable that it is
#available. Only a very small amount actually gives more details.

#we are optimistic, and asume that if there is one yes in either the public variable or the available variable, we assume that it is a useable AED.
#there are 14 other AED's which give the hours at which they are available. These are available most of the time, so they are considered to be 
#available.

public
Oui-Ja     3937
Y          2775
Non-Nee    1109
y           129
N           111
Oui          11
Ja            6
J             1
Name: count, dtype: int64
7148
available
Non-Nee                                                          748
Oui-Ja                                                           230
Pendant heures d ouverture du site                                 1
Nee                                                                1
8:00 - 17:00                                                       1
Heure de bureau en semaine                                         1
Heures de bureau                                                   1
Heures de bureau                                                   1
selon heures d ouverture d Euro-Délices                            1
enkel tijdens de kantooruren (8 - 19u)                             1
tijdens werkuren                                                   1
Accessible par toute personne présente dans l inrfastructure.    

In [467]:
#removing the AED's for which both the available and the public variables contain a negative
# the conditions
filter_condition = ((aed_locations['public'] == 'Non-Nee') | (aed_locations['public'] == 'N')) & ((aed_locations['available'] == 'Non-Nee') |
                                                                                                 (aed_locations['available'] == 'Nee'))
#there was a single entry for which the value of the variable available was simply "nee" instead of "Non-Nee"
aed_locations_cleaned = filter_and_log_removed(aed_locations_cleaned, filter_condition, 'not_available')

#we can now remove the available and the public variables, since we do not need them in the further analysis
aed_locations_cleaned = aed_locations_cleaned.drop(columns = ['available', 'public', 'location'])

  removed = df[filter_condition]


In [468]:
#checking the value of hours for the remaining entries
aed_hours_counts = aed_locations['hours'].value_counts()
print(aed_hours_counts)
print(aed_locations['hours'].isna().sum())
print(aed_locations_cleaned['hours'].isna().sum())
print(1 - aed_locations_cleaned['hours'].isna().sum()/len(aed_locations_cleaned))
#most values for the hours seem to overlap with office hours. only 5.42% of the instances has a value for hours. We consider the AED's available at
#office hours to be available when they are needed.

hours
8h-17h                                                                                                                            42
8h-16h                                                                                                                            20
24/24                                                                                                                             19
HEURE DE BUREAU                                                                                                                   13
Tijdens openingsuren recyclagepark                                                                                                10
                                                                                                                                  ..
ma-vrij 8u-17u                                                                                                                     1
En fonction des locations                                      

In [469]:
#we can remove the hours variable now
aed_locations_cleaned = aed_locations_cleaned.drop(columns = ['hours'])
aed_locations_cleaned

Unnamed: 0,id,address,number,postal_code,municipality,province
0,13.0,Blvd. Fr. Roosevelt,24.0,7060.0,SOIGNIES,Hainaut
1,70.0,Ch. De Wégimont,76.0,4630.0,Ayeneux,Liège
2,71.0,Place Saint - Lambert,,4020.0,Liège,Liège
3,72.0,Rue du Doyard,,4990.0,Lierneux,Liège
4,73.0,Fond Saint Servais,,4000.0,Liège,Liège
...,...,...,...,...,...,...
15220,16660.0,Chaussée de Marche,799.0,5100.0,Wierde,Namur
15221,16661.0,Nekkerspoel-Borcht,19.0,2800.0,Mechelen,Antwerpen
15223,16664.0,Nieuwe Dreef,17.0,9160.0,Lokeren,Oost-Vlaanderen
15224,16665.0,Panterschipstraat,207.0,9000.0,Gent,Oost-Vlaanderen


In [470]:
#751 AED's were removed from the analysis. 2 because their address was missing, 749 because they were not available. The ones 
#where address is missing do not necessarily need to be reallocated, the ones that are not available could be allocated. Perhaps the AED's not available
#to the public are placed in large facilities, e.g. sports facilities, where they actually do serve a purpose. This would need to be verified for each 
#of these AEDs individually.
removed_AED

Unnamed: 0,id,reason,province,municipality
6092,7323.0,address_missing,Limburg,Neerpelt
6212,7467.0,address_missing,Luxembourg,Saint-Hubert
13134,9010899.0,address_missing,,
13137,9010900.0,address_missing,,
750,1412.0,not_available,Hainaut,La Hestre
...,...,...,...,...
15211,16650.0,not_available,Limburg,Velm
15212,13572.0,not_available,Namur,anseremme
15213,16651.0,not_available,Bruxelles-Brussel,Neder-over-Heembeek
15222,16662.0,not_available,Antwerpen,Mechelen


In [471]:
#we now add the exact coordinates (longitude and latitude) of the AED's. This is needed later on in the analysis, to calculate
#distances and to plot them on a map.

#convert the number variable to int, geocoding doesn't work if the number is a float
#missing values for street numbers were replaced by a 0. you know how it goes, in a real world scenario we would ask what happened
aed_locations_cleaned['number'] = aed_locations_cleaned['number'].fillna(0).astype(int)

In [472]:
#this took 4 hours (regardless of how powerful your pc is). Don't run
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import numpy as np

# initialize the Nominatim geocoder
geolocator = Nominatim(user_agent="aed_locator")

# add a rate limiter to avoid exceeding the service's rate limits
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# function to get the latitude and longitude
def get_lat_long(row):
    try:
        location = geocode(f"{row['address']}, {row['number']}, {row['municipality']}")
        if location:
            return location.latitude, location.longitude
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"Error: {e}")
    return None, None

# function to process in batches and save progress (so I could easily run it overnight without having to monitor it)
def process_batches(df, batch_size=100, output_file='geocoded_data.csv'):
    
    results = pd.DataFrame(columns=['id', 'address', 'number', 'municipality', 'latitude', 'longitude']) #HIER ID VAR AAN TOEGEVOEGD, want die was
    #ervoor gewoon weggelaten, waardoor de id var erna terug toegevoegd moest worden, wat wellicht het probleem was.
    
    for start in range(0, len(df), batch_size):
        end = min(start + batch_size, len(df))
        batch = df.iloc[start:end].copy()  # work on a copy of the batch
        
        # apply geocoding function
        lat_long = batch.apply(get_lat_long, axis=1, result_type='expand')
        batch[['latitude', 'longitude']] = lat_long
        
        results = pd.concat([results, batch], ignore_index=True)
        
        # save results to file
        results.to_csv(output_file, index=False)
        print(f"Processed batch {start // batch_size + 1} and saved to {output_file}")
    
    return results


geocoded_results = process_batches(aed_locations_cleaned, batch_size=100, output_file='geocoded_data_idfix.csv')

  results = pd.concat([results, batch], ignore_index=True)


Processed batch 1 and saved to geocoded_data_idfix.csv
Processed batch 2 and saved to geocoded_data_idfix.csv
Processed batch 3 and saved to geocoded_data_idfix.csv
Processed batch 4 and saved to geocoded_data_idfix.csv
Processed batch 5 and saved to geocoded_data_idfix.csv
Processed batch 6 and saved to geocoded_data_idfix.csv
Processed batch 7 and saved to geocoded_data_idfix.csv
Processed batch 8 and saved to geocoded_data_idfix.csv
Processed batch 9 and saved to geocoded_data_idfix.csv
Processed batch 10 and saved to geocoded_data_idfix.csv
Processed batch 11 and saved to geocoded_data_idfix.csv
Processed batch 12 and saved to geocoded_data_idfix.csv
Processed batch 13 and saved to geocoded_data_idfix.csv
Processed batch 14 and saved to geocoded_data_idfix.csv
Processed batch 15 and saved to geocoded_data_idfix.csv
Processed batch 16 and saved to geocoded_data_idfix.csv
Processed batch 17 and saved to geocoded_data_idfix.csv
Processed batch 18 and saved to geocoded_data_idfix.csv
P

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Rue de la Chapelle Saint-Jean s/n, 0, Jodoigne',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\U

Processed batch 21 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Rue Place N-D de la Paix, 5, Erpent',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\eloua\

Processed batch 22 and saved to geocoded_data_idfix.csv
Processed batch 23 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Rue de la Station, 142, Le',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\eloua\anaconda3

Processed batch 24 and saved to geocoded_data_idfix.csv
Processed batch 25 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Av. De la Croix Rouge 1 (rez,au fond du couloir), 1, Huy',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  

Processed batch 26 and saved to geocoded_data_idfix.csv
Processed batch 27 and saved to geocoded_data_idfix.csv
Processed batch 28 and saved to geocoded_data_idfix.csv
Processed batch 29 and saved to geocoded_data_idfix.csv
Processed batch 30 and saved to geocoded_data_idfix.csv
Processed batch 31 and saved to geocoded_data_idfix.csv
Processed batch 32 and saved to geocoded_data_idfix.csv
Processed batch 33 and saved to geocoded_data_idfix.csv
Processed batch 34 and saved to geocoded_data_idfix.csv
Processed batch 35 and saved to geocoded_data_idfix.csv
Processed batch 36 and saved to geocoded_data_idfix.csv
Processed batch 37 and saved to geocoded_data_idfix.csv
Processed batch 38 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Rue de l Eglise St Martin , 10, Philippeville',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Us

Processed batch 39 and saved to geocoded_data_idfix.csv
Processed batch 40 and saved to geocoded_data_idfix.csv
Processed batch 41 and saved to geocoded_data_idfix.csv
Processed batch 42 and saved to geocoded_data_idfix.csv
Processed batch 43 and saved to geocoded_data_idfix.csv
Processed batch 44 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Place de l Hôtel de Ville , 6, Châtelet',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\el

Processed batch 45 and saved to geocoded_data_idfix.csv
Processed batch 46 and saved to geocoded_data_idfix.csv
Processed batch 47 and saved to geocoded_data_idfix.csv
Processed batch 48 and saved to geocoded_data_idfix.csv
Processed batch 49 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Place de la Station, 13, Jemeppe-sur-Meuse',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users

Processed batch 50 and saved to geocoded_data_idfix.csv
Processed batch 51 and saved to geocoded_data_idfix.csv
Processed batch 52 and saved to geocoded_data_idfix.csv
Processed batch 53 and saved to geocoded_data_idfix.csv
Processed batch 54 and saved to geocoded_data_idfix.csv
Processed batch 55 and saved to geocoded_data_idfix.csv
Processed batch 56 and saved to geocoded_data_idfix.csv
Processed batch 57 and saved to geocoded_data_idfix.csv
Processed batch 58 and saved to geocoded_data_idfix.csv
Processed batch 59 and saved to geocoded_data_idfix.csv
Processed batch 60 and saved to geocoded_data_idfix.csv
Processed batch 61 and saved to geocoded_data_idfix.csv
Processed batch 62 and saved to geocoded_data_idfix.csv
Processed batch 63 and saved to geocoded_data_idfix.csv
Processed batch 64 and saved to geocoded_data_idfix.csv
Processed batch 65 and saved to geocoded_data_idfix.csv
Processed batch 66 and saved to geocoded_data_idfix.csv
Processed batch 67 and saved to geocoded_data_id

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Pl. d/l République Franç. , 41, Liège',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\elou

Processed batch 69 and saved to geocoded_data_idfix.csv
Processed batch 70 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Rue du bois du four, 1, Boussu-lez-Walcourt',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\User

Processed batch 71 and saved to geocoded_data_idfix.csv
Processed batch 72 and saved to geocoded_data_idfix.csv
Processed batch 73 and saved to geocoded_data_idfix.csv
Processed batch 74 and saved to geocoded_data_idfix.csv
Processed batch 75 and saved to geocoded_data_idfix.csv
Processed batch 76 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Place de la gare 1, 1, Braine l alleud',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\elo

Processed batch 77 and saved to geocoded_data_idfix.csv
Processed batch 78 and saved to geocoded_data_idfix.csv
Processed batch 79 and saved to geocoded_data_idfix.csv
Processed batch 80 and saved to geocoded_data_idfix.csv
Processed batch 81 and saved to geocoded_data_idfix.csv
Processed batch 82 and saved to geocoded_data_idfix.csv
Processed batch 83 and saved to geocoded_data_idfix.csv
Processed batch 84 and saved to geocoded_data_idfix.csv
Processed batch 85 and saved to geocoded_data_idfix.csv
Processed batch 86 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Rue du Parc, 32, La',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\eloua\anaconda3\envs\M

Processed batch 87 and saved to geocoded_data_idfix.csv
Processed batch 88 and saved to geocoded_data_idfix.csv
Processed batch 89 and saved to geocoded_data_idfix.csv
Processed batch 90 and saved to geocoded_data_idfix.csv
Processed batch 91 and saved to geocoded_data_idfix.csv
Processed batch 92 and saved to geocoded_data_idfix.csv
Processed batch 93 and saved to geocoded_data_idfix.csv
Processed batch 94 and saved to geocoded_data_idfix.csv
Processed batch 95 and saved to geocoded_data_idfix.csv
Processed batch 96 and saved to geocoded_data_idfix.csv
Processed batch 97 and saved to geocoded_data_idfix.csv
Processed batch 98 and saved to geocoded_data_idfix.csv
Processed batch 99 and saved to geocoded_data_idfix.csv
Processed batch 100 and saved to geocoded_data_idfix.csv
Processed batch 101 and saved to geocoded_data_idfix.csv
Processed batch 102 and saved to geocoded_data_idfix.csv
Processed batch 103 and saved to geocoded_data_idfix.csv
Processed batch 104 and saved to geocoded_da

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Rue Moulin de Royen, 2, La Roche-en-Ardenne',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\User

Processed batch 113 and saved to geocoded_data_idfix.csv
Processed batch 114 and saved to geocoded_data_idfix.csv
Processed batch 115 and saved to geocoded_data_idfix.csv
Processed batch 116 and saved to geocoded_data_idfix.csv
Processed batch 117 and saved to geocoded_data_idfix.csv
Processed batch 118 and saved to geocoded_data_idfix.csv


RateLimiter caught an error, retrying (0/2 tries). Called with (*('Rue Notre-Dame de Grâce, 1, Marche-en-Famenne',), **{}).
Traceback (most recent call last):
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\site-packages\urllib3\connection.py", line 464, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\eloua\anaconda3\envs\MDAenv\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Us

Processed batch 119 and saved to geocoded_data_idfix.csv
Processed batch 120 and saved to geocoded_data_idfix.csv
Processed batch 121 and saved to geocoded_data_idfix.csv
Processed batch 122 and saved to geocoded_data_idfix.csv
Processed batch 123 and saved to geocoded_data_idfix.csv
Processed batch 124 and saved to geocoded_data_idfix.csv
Processed batch 125 and saved to geocoded_data_idfix.csv
Processed batch 126 and saved to geocoded_data_idfix.csv
Processed batch 127 and saved to geocoded_data_idfix.csv
Processed batch 128 and saved to geocoded_data_idfix.csv
Processed batch 129 and saved to geocoded_data_idfix.csv
Processed batch 130 and saved to geocoded_data_idfix.csv
Processed batch 131 and saved to geocoded_data_idfix.csv
Processed batch 132 and saved to geocoded_data_idfix.csv
Processed batch 133 and saved to geocoded_data_idfix.csv
Processed batch 134 and saved to geocoded_data_idfix.csv
Processed batch 135 and saved to geocoded_data_idfix.csv
Processed batch 136 and saved t

In [473]:
geocoded_results

Unnamed: 0,id,address,number,municipality,latitude,longitude,postal_code,province
0,13.0,Blvd. Fr. Roosevelt,24,SOIGNIES,,,7060.0,Hainaut
1,70.0,Ch. De Wégimont,76,Ayeneux,,,4630.0,Liège
2,71.0,Place Saint - Lambert,0,Liège,50.750721,5.933967,4020.0,Liège
3,72.0,Rue du Doyard,0,Lierneux,50.285043,5.785950,4990.0,Liège
4,73.0,Fond Saint Servais,0,Liège,50.646765,5.570987,4000.0,Liège
...,...,...,...,...,...,...,...,...
14469,16660.0,Chaussée de Marche,799,Wierde,50.426430,4.929522,5100.0,Namur
14470,16661.0,Nekkerspoel-Borcht,19,Mechelen,51.025217,4.504148,2800.0,Antwerpen
14471,16664.0,Nieuwe Dreef,17,Lokeren,51.097802,3.911744,9160.0,Oost-Vlaanderen
14472,16665.0,Panterschipstraat,207,Gent,,,9000.0,Oost-Vlaanderen


In [475]:
#checking if there are any missing ID's, which was a problem before, but got fixed
missing_id_test = geocoded_results[geocoded_results['id'].isnull()]
missing_id_test

Unnamed: 0,id,address,number,municipality,latitude,longitude,postal_code,province


In [476]:
#for a substantial amount the coordinates could not be found (almost 20%). Upon inspecting some instances for which this is the
#case, it seems to be so because the address was not properly named (for example Ch. De Wégimont, instead of chemin De Wégimont or De Wégimont street).
#for a data set of 14 476, it is unfeasable to manually correct these names. These AED's will thus be removed from the analysis.

nacount = geocoded_results['latitude'].isna().sum()
nacount

np.int64(2825)

In [479]:
#remove the instances with missing coordinates, since we simply cannot use them. Manually inspecting each instance with missing
#coordinates is unfeasable.
filter_condition = geocoded_results['latitude'].isna()

cleaned_geocoded_results = filter_and_log_removed(geocoded_results, filter_condition, "missing_coordinates")
cleaned_geocoded_results

Unnamed: 0,id,address,number,municipality,latitude,longitude,postal_code,province
2,71.0,Place Saint - Lambert,0,Liège,50.750721,5.933967,4020.0,Liège
3,72.0,Rue du Doyard,0,Lierneux,50.285043,5.785950,4990.0,Liège
4,73.0,Fond Saint Servais,0,Liège,50.646765,5.570987,4000.0,Liège
5,74.0,Rue des Prémontrès,12,Liège,50.638047,5.574046,4020.0,Liège
7,76.0,Rue du Parc,1,Jehay,50.577233,5.323038,4540.0,Liège
...,...,...,...,...,...,...,...,...
14466,16654.0,Farnières,4,Grand Halleux,50.310813,5.880418,6698.0,Liège
14469,16660.0,Chaussée de Marche,799,Wierde,50.426430,4.929522,5100.0,Namur
14470,16661.0,Nekkerspoel-Borcht,19,Mechelen,51.025217,4.504148,2800.0,Antwerpen
14471,16664.0,Nieuwe Dreef,17,Lokeren,51.097802,3.911744,9160.0,Oost-Vlaanderen


In [270]:
#for the other tables, we simply need to remove unnecessary variables, and merge the different tables.
#all the tables containing info can be merged into one big table, all the tables containing information from the first responders can be merged.

In [483]:
#first we verify whether the unit is permanently staffed. If it is not, we do not consider it useable because it is considered
#unreliable, therefore an AED close to it's location can still be useful.
ambulance_value_counts = ambulance_locations['occasional_permanence'].value_counts()
print(ambulance_value_counts)
ambulance_locations_filtered = ambulance_locations[ambulance_locations['occasional_permanence'] == "N"]
occasional_permanence_ambulance = ambulance_locations[ambulance_locations['occasional_permanence'] == "Y"]

occasional_permanence
N    251
Y     28
Name: count, dtype: int64


In [484]:
#get the latitude and longitude of the PIT locations, since these are missing from the table.

#some of the hospitals did not get recognized, so the address was manually looked up and inputted in the list. That way, the
#exact longitude and latitude could still be calculated.

from geopy.geocoders import Nominatim

# initialize geolocator
geolocator = Nominatim(user_agent="pit_locator")

# data in lists
# need to input the names manually in the list, since the useful part (for example "AZ RIVIERENLAND") is not consistently present in any variable.
#simply inputting the entire string value of the campus variable doesn't work, for example "104 - AZ RIVIERENLAND --- Campus/Site: 1270 - RUMST" yields
#no results.
data = {
    'name': ['AZ RIVIERENLAND ', 'AZ SINT-VINCENTIUS DEINZE', 'ZNA STUIVENBERG', ' HEILIG HART ZIEKENHUIS 1260',
             'HEILIG HART ZIEKENHUIS MOL', 'VITAZ CAMPUS LOKEREN', 'OLV ZIEKENHUIS ASSE', 'AZ SINT-JAN BRUGGE - OOSTENDE',
             'AZ SINT-LUCAS', 'AZ ZENO BLANKENBERGE', 'OOST-LIMBURG SINT-JAN GENK', 'UZ LEUVEN GASTHUISBERG',
             'HOPITAL IRIS SUD', 'CHU SAINT-PIERRE - PORTE DE HAL', 'UZ BRUSSEL', 'GRAND HOPITAL DE CHARLEROI SAINT- JOSEPH',
             'C.H.U. AMBROISE PARE', 'CHR SAMBRE ET MEUSE (SITE MEUSE)', 'CLINIQUE ANDRE RENARD', 'HOPITAL DE LA CITADELLE',
             'CHR VERVIERS - LA TOURELLE', None, None, None],
    'address': [None, None, None, None, None, None, None, None, None, None,
                None, 'Rue Jean Paquot 63', 'Boulevard de Waterloo 129' , '290, rue Haute', 'Rue Marguerite Depasse 6', 'Boulevard Président Kennedy 2',
                'Avenue Albert 1er 185', 'rue André Renard 1', None, None,
                None, 'Rue des Déportés 137', "Rue d'Harnoncourt 48", 'Avenue de Houffalize 35']
}

hospitals_df = pd.DataFrame(data)


hospitals_with_coords = pd.DataFrame(columns=['id', 'name', 'municipality', 'address', 'latitude', 'longitude'])


# function to geocode hospitals, can use the names of the hospitals or the addresses if the name is not enough 
def geocode_hospitals(df):
    for index, row in df.iterrows():
        hospital_name = row['name']
        address = row['address']
        
        # determine the query based on available information
        if address:
            query = address
        else:
            query = f"{hospital_name}, Belgium"
        
        try:
            location = geolocator.geocode(query, timeout=10)
            if location:
                df.loc[index, 'address'] = location.address
                df.loc[index, 'latitude'] = location.latitude
                df.loc[index, 'longitude'] = location.longitude
            else:
                df.loc[index, 'address'] = None
                df.loc[index, 'latitude'] = None
                df.loc[index, 'longitude'] = None
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error geocoding {query}: {e}")
            df.loc[index, 'address'] = None
            df.loc[index, 'latitude'] = None
            df.loc[index, 'longitude'] = None
    
    return df

# geocode the hospitals
pit_hospitals_with_coords = geocode_hospitals(hospitals_df)

#drop columns, we only need the coordinates
pit_hospitals_with_coords = pit_hospitals_with_coords.loc[:, ['latitude', 'longitude']]

# display the hospitals DataFrame with addresses and coordinates
print(pit_hospitals_with_coords)

     latitude  longitude
0   51.105417   4.368792
1   50.986425   3.527572
2   51.223649   4.434875
3   50.810888   4.933485
4   51.187299   5.114834
5   51.101340   4.000929
6   50.909904   4.196023
7   51.222791   2.914147
8   51.062547   3.720756
9   51.307809   3.124371
10  50.957775   5.517608
11  50.825930   4.378332
12  50.833473   4.345943
13  50.835695   4.348181
14  50.418054   4.480988
15  43.219179   0.079410
16  50.467587   4.888018
17  50.430465   4.093855
18  50.674721   5.633436
19  50.652037   5.578166
20  50.582211   5.859296
21  50.521428   3.642525
22  49.552645   5.526049
23  49.927931   5.384295


In [485]:
# now get and store the mug hospital coordinates
mug_hospitals_with_coords = pd.DataFrame(columns=['latitude', 'longitude'])

def geocode_hospitals(df):
    for index, row in df.iterrows():
        address = row['address_campus']
        municipality = row['municipality']
        
        # construct the query using address and municipality
        query = f"{address}, {municipality}, Belgium"
        
        try:
            location = geolocator.geocode(query, timeout=10)
            if location:
                df.loc[index, 'latitude'] = location.latitude
                df.loc[index, 'longitude'] = location.longitude
            else:
                df.loc[index, 'latitude'] = None
                df.loc[index, 'longitude'] = None
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error geocoding {query}: {e}")
            df.loc[index, 'latitude'] = None
            df.loc[index, 'longitude'] = None

    return df

# geocode the hospitals
mug_hospitals_with_coords = geocode_hospitals(mug_locations)

# display the hospitals DataFrame with addresses and coordinates
print(mug_hospitals_with_coords)


    hospital_id  mug_id  campus_id                      name_hospital  \
0             9  102000       6230  ZIEKENHUISNETWERK ANTWERPEN (ZNA)   
1            99  102000       2020                  GZA- ZIEKENHUIZEN   
2           682  102000       1210                          AZ MONICA   
3             9  103000       2000  ZIEKENHUISNETWERK ANTWERPEN (ZNA)   
4            99  103000       1290                  GZA- ZIEKENHUIZEN   
..          ...     ...        ...                                ...   
89           20  808000       3010        CHR VERVIERS - EAST BELGIUM   
90          168  901000       3720                           VIVALIA    
91          246  902000       3690                           VIVALIA    
92          168  903000       3240                           VIVALIA    
93          164  904000       3230                           VIVALIA    

             name_campus           address_campus  postal_code  \
0              ZNA CADIX         KEMPENSTRAAT 100        

In [486]:
#there are 6 missing coordinates. We add the coordinates manually.
missing_lat_mug_count = mug_hospitals_with_coords['latitude'].isnull().sum()
missing_lat_mug_count

np.int64(6)

In [487]:
missing_mug_coordinates = mug_hospitals_with_coords[mug_hospitals_with_coords['latitude'].isnull()]
missing_mug_coordinates

Unnamed: 0,hospital_id,mug_id,campus_id,name_hospital,name_campus,address_campus,postal_code,municipality,region,province,latitude,longitude
61,254,601000,2730,CHU AMBROISE PARE,,BOULEVARD KENNEDY 2,7000,MONS,W,Henegouwen,,
62,266,601000,2710,GROUPE JOLIMONT,HOPITAL DE MONS,AV. B. DE CONSTANTINOPLE 5,7000,MONS,W,Henegouwen,,
66,410,604000,3380,CENTRE HOSPITALIER EPICURA,HORNU,ROUTE DE MONS 63,7301,HORNU,W,Henegouwen,,
77,6,701000,3300,CHR SAMBRE ET MEUSE,MEUSE (CHR NAMUR),AVENUE ALBERT 1IER 185,5000,NAMUR,W,Namen,,
80,39,704000,3280,CHU-UCL NAMUR,GODINNE,AVENUE DR. G.THERASSE 1,5530,MONT-GODINNE,W,Namen,,
81,412,801000,3410,HOPITAL DE LA CITADELLE,CITADELLE,BOULEV. DU 12E DE LIGNE 1,4000,LIEGE,W,Luik,,


In [488]:
#notice that one of the hospitals was already present in the table of the PIT hospitals, so it simply gets dropped
mug_hospitals_with_coords = mug_hospitals_with_coords[mug_hospitals_with_coords['campus_id'] != 3300]
missing_mug_coordinates = missing_mug_coordinates[missing_mug_coordinates['campus_id'] != 3300]


In [489]:
#reuse the manual coordinate code used for the pit locations
from geopy.geocoders import Nominatim

# initialize geolocator
geolocator = Nominatim(user_agent="mug_locator")

# data in lists
# need to input the names manually in the list, since the useful part (for example "AZ RIVIERENLAND") is not consistently present in any variable.
#simply inputting the entire string value of the campus variable doesn't work, for example "104 - AZ RIVIERENLAND --- Campus/Site: 1270 - RUMST" yields
#no results.
data = {
    'address': ['BOULEVARD KENNEDY 2', 'Avenue Baudouin de Constantinople 5', 'ROUTE DE MONS 63', 'Avenue du Docteur Gaston Thérasse 1',
                'Boulevard du Douzième de Ligne 1']
}

hospitals_df = pd.DataFrame(data)


hospitals_with_coords = pd.DataFrame(columns=['address', 'latitude', 'longitude'])


# function to geocode hospitals, can use the names of the hospitals or the addresses if the name is not enough 
def geocode_hospitals(df):
    for index, row in df.iterrows():
        address = row['address']
        query = address
        
        try:
            location = geolocator.geocode(query, timeout=10)
            if location:
                df.loc[index, 'address'] = location.address
                df.loc[index, 'latitude'] = location.latitude
                df.loc[index, 'longitude'] = location.longitude
            else:
                df.loc[index, 'address'] = None
                df.loc[index, 'latitude'] = None
                df.loc[index, 'longitude'] = None
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error geocoding {query}: {e}")
            df.loc[index, 'address'] = None
            df.loc[index, 'latitude'] = None
            df.loc[index, 'longitude'] = None
    
    return df

# geocode the hospitals
missing_mug_coordinates = geocode_hospitals(hospitals_df)

#drop columns, we only need the coordinates
#pit_hospitals_with_coords = pit_hospitals_with_coords.loc[:, ['latitude', 'longitude']]

print(missing_mug_coordinates)

                                             address   latitude  longitude
0  2, Boulevard Kennedy, Pléneuf, Pléneuf-Val-And...  48.587424  -2.539016
1  Hôpital de Mons - Site Constantinople, 5, Aven...  50.454379   3.959975
2  Multiflor, 63, Route de Mons, Faubourg Saint-J...  50.413840   4.164462
3                                               None        NaN        NaN
4                                               None        NaN        NaN


In [490]:
#two can still not be found. So these were added manually using Google Maps
#50.652152, 5.576917
#50.358172, 4.883299
import numpy as np
# Coordinates to append
manual_coordinates = [
    (50.652152, 5.576917),
    (50.358172, 4.883299)
]

missing_mug_coordinates = missing_mug_coordinates.iloc[:-2]


manually_added = pd.DataFrame({
    'address': [np.nan, np.nan],
    'latitude': [50.652152, 50.358172],
    'longitude': [5.576917, 4.883299]
})

# Append new rows to the DataFrame
missing_mug_coordinates = pd.concat([missing_mug_coordinates, manually_added], ignore_index=True)
print(missing_mug_coordinates)

                                             address   latitude  longitude
0  2, Boulevard Kennedy, Pléneuf, Pléneuf-Val-And...  48.587424  -2.539016
1  Hôpital de Mons - Site Constantinople, 5, Aven...  50.454379   3.959975
2  Multiflor, 63, Route de Mons, Faubourg Saint-J...  50.413840   4.164462
3                                                NaN  50.652152   5.576917
4                                                NaN  50.358172   4.883299


In [491]:
#remove all but the coordinate variables
missing_mug_coordinates = missing_mug_coordinates.loc[:, ['latitude', 'longitude']]
mug_hospitals_with_coords = mug_hospitals_with_coords.loc[:, ['latitude', 'longitude']]
print(mug_hospitals_with_coords)

     latitude  longitude
0   51.231381   4.415761
1   51.205787   4.412513
2   51.206556   4.470778
3   51.363865   4.985064
4   51.174354   4.420085
..        ...        ...
89  50.715566   6.007128
90  50.159132   5.683118
91  49.677372   5.820148
92  49.927931   5.384295
93  50.227536   5.323654

[93 rows x 2 columns]


In [492]:
mug_hospitals_with_coords = pd.concat([mug_hospitals_with_coords, missing_mug_coordinates], ignore_index=True)
print(mug_hospitals_with_coords)

     latitude  longitude
0   51.231381   4.415761
1   51.205787   4.412513
2   51.206556   4.470778
3   51.363865   4.985064
4   51.174354   4.420085
..        ...        ...
93  48.587424  -2.539016
94  50.454379   3.959975
95  50.413840   4.164462
96  50.652152   5.576917
97  50.358172   4.883299

[98 rows x 2 columns]


In [493]:
#remove the missing ones for which we looked up the coordinates manually
mug_hospitals_with_coords = mug_hospitals_with_coords.dropna()
print(mug_hospitals_with_coords)

     latitude  longitude
0   51.231381   4.415761
1   51.205787   4.412513
2   51.206556   4.470778
3   51.363865   4.985064
4   51.174354   4.420085
..        ...        ...
93  48.587424  -2.539016
94  50.454379   3.959975
95  50.413840   4.164462
96  50.652152   5.576917
97  50.358172   4.883299

[93 rows x 2 columns]


In [494]:
# put it into one big table, that contains all the coordinates
ambulance_locations_with_coords = ambulance_locations.loc[:, ['latitude', 'longitude']]
all_hospital_locations = pd.concat([mug_hospitals_with_coords, pit_hospitals_with_coords, ambulance_locations_with_coords], ignore_index = True)
all_hospital_locations

Unnamed: 0,latitude,longitude
0,51.231381,4.415761
1,51.205787,4.412513
2,51.206556,4.470778
3,51.363865,4.985064
4,51.174354,4.420085
...,...,...
391,50.025633,5.359641
392,50.674803,5.633665
393,50.652888,5.578356
394,50.622523,5.636381


In [495]:
#some amount of duplicates were present, probably because some hospitals were present in different tables.
all_hospital_locations = all_hospital_locations.drop_duplicates()
all_hospital_locations

Unnamed: 0,latitude,longitude
0,51.231381,4.415761
1,51.205787,4.412513
2,51.206556,4.470778
3,51.363865,4.985064
4,51.174354,4.420085
...,...,...
391,50.025633,5.359641
392,50.674803,5.633665
393,50.652888,5.578356
394,50.622523,5.636381


In [496]:
# the coordinates of the hospitals should match the coordinates from the calls. dus dat kan ik verifiëren.
# if the data is good, there should be a perfect overlap between the hospital permanence and the interventions permanence coordinates. This will be
#verified.

interventions_bxl_coords = interventions_bxl.loc[:, ['latitude_permanence', 'longitude_permanence']]
interventions_bxl2_coords = interventions_bxl2.loc[:, ['Latitude Permanence', 'Longitude Permanence']]
interventions1_coords = interventions1.loc[:, ['Latitude permanence', 'Longitude permanence']]
interventions2_coords = interventions2.loc[:, ['Latitude permanence', 'Longitude permanence']]
interventions3_coords = interventions3.loc[:, ['Latitude permanence', 'Longitude permanence']]
cad9_coords = cad9.loc[:, ['Latitude permanence', 'Longitude permanence']]

# need to divide, there was no decimal point in the coordinates for the BXL interventions
# the inputted coordinates are not consistent in the amount of precision, so simply dividing all by 10**5 would not work, for example one coordinate 
#would be 500.123456, and another would be 5.00123456, while we want 50.0123456
#interventions_bxl_coords = interventions_bxl_coords/10**5
#interventions_bxl2_coords = interventions_bxl2_coords/10**5



# rename columns to match between the different datasets. The column names from interventions_bxl are used as the standard.
interventions_bxl2_coords = interventions_bxl2_coords.rename(columns={
    'Latitude Permanence': 'latitude_permanence',
    'Longitude Permanence': 'longitude_permanence'
})
interventions1_coords = interventions1_coords.rename(columns={
    'Latitude permanence': 'latitude_permanence',
    'Longitude permanence': 'longitude_permanence'
})
interventions2_coords = interventions2_coords.rename(columns={
    'Latitude permanence': 'latitude_permanence',
    'Longitude permanence': 'longitude_permanence'
})
interventions3_coords = interventions3_coords.rename(columns={
    'Latitude permanence': 'latitude_permanence',
    'Longitude permanence': 'longitude_permanence'
})
cad9_coords = cad9_coords.rename(columns={
    'Latitude permanence': 'latitude_permanence',
    'Longitude permanence': 'longitude_permanence'
})

In [497]:
#for the interventions_bxl and interventions_bxl2 tables the coordinates were not given with their decimal points, so we need to
#convert those coordinate values.
#finding the minimum value, so we can divide all values and speed up the process in the following cell
min_lat_bxl = interventions_bxl_coords['latitude_permanence'].min()
min_long_bxl = interventions_bxl_coords['longitude_permanence'].min()

min_lat_bxl2 = interventions_bxl_coords['latitude_permanence'].min()
min_long_bxl2 = interventions_bxl_coords['longitude_permanence'].min()

print(min_lat_bxl)
print(min_long_bxl)
print(min_lat_bxl2)
print(min_long_bxl2)

#so for latitude the initial divide can be by 10**4, for longitude it can be 10**3, based on the printed values:
#508047
#43089
#508047
#43089



508047
43089
508047
43089


In [498]:
#adjusting the values for the bxl table, so that they are correctly formatted (with decimal point at the right place)
#latitude needs to be 2 numbers before the decimal, longitude 1 number before the decimal. We know that the entirety of Belgium lies  around rougly
#longitude 50.5 and latitude 4, so values deviating strongly from those (more than 2 units) are known to be invalid.

# fast solution to convert it, simply by dividing the values (using a simple while loop and iteratively dividing by 10 was too slow)
import math
def format_latitude(value):
    value / 10**4
    if value > 99:
        value = value / (10**math.floor(math.log10(value) - 1))
    return value

# longitude needs to be 1 number before the decimal
def format_longitude(value):
    value / 10**4
    if value > 9:
        value = value / (10**math.floor(math.log10(value)))
    return value

interventions_bxl_coords['latitude_permanence'] = interventions_bxl_coords['latitude_permanence'].apply(format_latitude)
interventions_bxl_coords['longitude_permanence'] = interventions_bxl_coords['longitude_permanence'].apply(format_longitude)

interventions_bxl2_coords['latitude_permanence'] = interventions_bxl2_coords['latitude_permanence'].apply(format_latitude)
interventions_bxl2_coords['longitude_permanence'] = interventions_bxl2_coords['longitude_permanence'].apply(format_longitude)


In [499]:
# the values seem withing range, realistic coordinates.
testbxl = interventions_bxl_coords.value_counts()
testbxl2 = interventions_bxl2_coords.value_counts()

print(testbxl)
print(testbxl2)

latitude_permanence  longitude_permanence
50.859460            4.351810                18934
50.832540            4.311990                12996
50.834330            4.345450                11607
50.869480            4.386490                 9979
50.818870            4.403110                 9037
50.850970            4.364110                 8983
50.890870            4.308670                 7048
50.852110            4.460400                 4172
50.815980            4.341524                 3072
50.804700            4.367630                 3010
50.835710            4.347930                 2701
50.842260            4.399250                 2608
50.884920            4.308900                 2470
50.837580            4.349150                 2400
50.869370            4.386460                 2230
50.825730            4.379190                 2020
50.804690            4.367640                 1892
50.783440            4.356330                 1772
50.852300            4.359880           

In [500]:
# concatenating it into one big table
all_permanence_locations = pd.concat([interventions_bxl_coords,
                                     interventions_bxl2_coords,
                                     interventions1_coords,
                                     interventions2_coords,
                                     interventions3_coords,
                                     cad9_coords], ignore_index = True)
all_permanence_locations_unique = all_permanence_locations.drop_duplicates()

In [501]:
permanence_value_counts = all_permanence_locations.value_counts()
print(permanence_value_counts)

latitude_permanence  longitude_permanence
50.85946             4.35181                 18934
50.83254             4.31199                 12996
50.44608             3.91916                 11742
50.83433             4.34545                 11607
50.39143             4.42896                 11192
                                             ...  
50.30147             4.64535                     1
51.24501             4.48938                     1
50.31327             4.90912                     1
50.36155             3.49716                     1
49.98963             4.71100                     1
Name: count, Length: 654, dtype: int64


In [502]:
#check if the number of digits after the decimal is equal for all values in the table, since that also influences the count of
#unique values, which might explain why there are more values here than in the hospital location table.

# check if the number of numbers after the decimal point is equal for all values

def count_decimal_places(x):
    s = str(x)
    if '.' in s:
        return len(s.split('.')[1])
    else:
        return 0

# apply the function to both columns
decimal_places_latitude = all_permanence_locations['latitude_permanence'].apply(count_decimal_places)
decimal_places_longitude = all_permanence_locations['longitude_permanence'].apply(count_decimal_places)

# combine the results and find the min and max
all_decimal_places = pd.concat([decimal_places_latitude, decimal_places_longitude])
min_decimal_places = all_decimal_places.min()
max_decimal_places = all_decimal_places.max()

# check if all values have the same number of decimal places
same_decimal_places = decimal_places_latitude.equals(decimal_places_longitude)

print(f"Minimum number of decimal places: {min_decimal_places}")
print(f"Maximum number of decimal places: {max_decimal_places}")
print(f"All values have the same number of decimal places: {same_decimal_places}")

Minimum number of decimal places: 0
Maximum number of decimal places: 14
All values have the same number of decimal places: False


In [503]:
#the amount of decimals is all over the place, which is very problematic.

# get the count of each unique number of decimal places
decimal_places_count = all_decimal_places.value_counts().sort_index()

print("Count of each number of decimal places:")
print(decimal_places_count)

Count of each number of decimal places:
0      258768
3      115429
4       65560
5     1010679
6        1761
7       68219
8      226239
9      220842
10      19957
12       6627
13      45210
14      51807
Name: count, dtype: int64


In [504]:
# check if the amount of numbers before the decimal is 1 for longitude, and 2 for latitude, as they should be. This could be 
#part of the cause of the differing number of digits.

def check_digits_before_decimal(x, expected_digits):
    s = str(x)
    if '.' in s:
        digits_before_decimal = len(s.split('.')[0])
        return digits_before_decimal == expected_digits
    else:
        return False

# apply the function to latitude (expecting 2 digits before decimal)
latitude_check = all_permanence_locations['latitude_permanence'].apply(check_digits_before_decimal, expected_digits=2)

# apply the function to longitude (expecting 1 digit before decimal)
longitude_check = all_permanence_locations['longitude_permanence'].apply(check_digits_before_decimal, expected_digits=1)

# check if all values meet the criteria
all_latitude_valid = latitude_check.all()
all_longitude_valid = longitude_check.all()

print(f"All 'latitude_permanence' values have 2 digits before the decimal point: {all_latitude_valid}")
print(f"All 'longitude_permanence' values have 1 digit before the decimal point: {all_longitude_valid}")

All 'latitude_permanence' values have 2 digits before the decimal point: False
All 'longitude_permanence' values have 1 digit before the decimal point: False


In [505]:
#we can simply apply the same function we used on the bxl coordinates again to all the data.

all_permanence_locations_formatted = all_permanence_locations
all_permanence_locations_formatted['latitude_permanence'] = all_permanence_locations_formatted['latitude_permanence'].apply(format_latitude)
all_permanence_locations_formatted['longitude_permanence'] = all_permanence_locations_formatted['longitude_permanence'].apply(format_longitude)

#there are 20 values less here, this implies that some coordinates were double coded but with different levels of precision (less decimals)
print(all_permanence_locations_formatted.value_counts())

latitude_permanence  longitude_permanence
50.85946             4.35181                 18934
50.83254             4.31199                 12996
50.44608             3.91916                 11742
50.83433             4.34545                 11607
50.39143             4.42896                 11192
                                             ...  
50.30147             4.64535                     1
50.36155             3.49716                     1
50.31327             4.90912                     1
51.20478             4.77508                     1
49.98963             4.71100                     1
Name: count, Length: 634, dtype: int64


In [506]:
#check again if the number of decimals is the same, to see if this solved the problem.

decimal_places_latitude = all_permanence_locations_formatted['latitude_permanence'].apply(count_decimal_places)
decimal_places_longitude = all_permanence_locations_formatted['longitude_permanence'].apply(count_decimal_places)

all_decimal_places = pd.concat([decimal_places_latitude, decimal_places_longitude])

min_decimal_places = all_decimal_places.min()
max_decimal_places = all_decimal_places.max()

# get the count of each unique number of decimal places
decimal_places_count = all_decimal_places.value_counts().sort_index()

# display results
print(f"Minimum number of decimal places: {min_decimal_places}")
print(f"Maximum number of decimal places: {max_decimal_places}")
print("Count of each number of decimal places:")
print(decimal_places_count)

Minimum number of decimal places: 0
Maximum number of decimal places: 16
Count of each number of decimal places:
0      258768
3       21589
4       73640
5     1068696
6        1761
7       68219
8      226239
9      220842
10      19957
12       6627
13      45210
14      51807
15       7817
16      19926
Name: count, dtype: int64


In [507]:
#the count of unique values is influenced by the number of numbers after the decimal point, e.g. 51.1234 is not the same as
# 51.123, with one less number after the decimal. Therefore the varying number of digits after the decimal point is a problem.
all_permanence_locations_unique_test = all_permanence_locations_formatted.drop_duplicates()
print(all_permanence_locations_unique_test)

         latitude_permanence  longitude_permanence
0                  50.850970              4.364110
2                  50.834330              4.345450
4                  50.852110              4.460400
6                  50.842260              4.399250
8                  50.869480              4.386490
...                      ...                   ...
952153             51.179015              4.361991
952385             51.156827              4.415969
978425             51.205074              4.396444
1030254            51.210317              4.051975
1036343            51.251317              4.248720

[925 rows x 2 columns]


In [508]:
#we cannot use the coordinates with fewer than 6 digits after the decimal point. It is possible however that all the coordinates
#with fewer than 6 digits have zeroes instead of other numbers, so for example 4.50000, where the zeroes have been removed. However, this is 
#extremely unlikely to be the case for all those cases.
#when removing all the coordinates with fewer than 6 numbers after the decimal, we end up with 226 unique values. This is fewer
#than the amount of coordinates from the hospital tables. Therefore, we will use the coordinates from the hospital table in all further analyses
#regarding the permanences.
#this again shows how inconsistent the data in general is.

#count and store the number of decimal places to use it to filter
decimal_places_latitude = all_permanence_locations_formatted['latitude_permanence'].apply(count_decimal_places)
decimal_places_longitude = all_permanence_locations_formatted['longitude_permanence'].apply(count_decimal_places)

# filter the DataFrame to keep rows where both columns have at least 6 decimal places
decimal_filtered_permanence_test = all_permanence_locations_formatted[(decimal_places_latitude >= 6) & (decimal_places_longitude >= 6)]


unique = decimal_filtered_permanence_test.drop_duplicates()
print(unique)

         latitude_permanence  longitude_permanence
2                  50.834330              4.345450
15                 50.825831              4.378717
26                 50.815980              4.341524
215                50.906541              4.388562
115647             50.842255              4.399238
...                      ...                   ...
952153             51.179015              4.361991
952385             51.156827              4.415969
978425             51.205074              4.396444
1030254            51.210317              4.051975
1036343            51.251317              4.248720

[226 rows x 2 columns]


Intervention locations

In [509]:
#rename to make the merging of the tables possible.

cad9 = cad9.rename(columns={'EventType Trip': 'eventType_trip'})
interventions_bxl = interventions_bxl.rename(columns={'eventtype_trip': 'eventType_trip'})
interventions_bxl2 = interventions_bxl2.rename(columns={'EventType and EventLevel': 'eventType_trip'})
interventions1 = interventions1.rename(columns={'EventType Trip': 'eventType_trip'})
interventions2 = interventions2.rename(columns={'EventType Trip': 'eventType_trip'})
interventions3 = interventions3.rename(columns={'EventType Trip': 'eventType_trip'})

In [510]:
#inspecting the variables to see which of them we should keep for further analysis

cad9_vars = cad9.columns.tolist()
interventions_bxl_vars = interventions_bxl.columns.tolist()
interventions_bxl2_vars = interventions_bxl2.columns.tolist()
interventions1_vars = interventions1.columns.tolist()
interventions2_vars = interventions2.columns.tolist()
interventions3_vars = interventions3.columns.tolist()

print(cad9_vars)
'province'
'eventType_trip'
'T0'
'T3'
'Latitude intervention'
'Longitude intervention'

print(interventions_bxl_vars)
'eventType_trip'
't0'
't3'
'number_of_transported_persons'
'abandon_reason'
'latitude_intervention'
'longitude_intervention'

print(interventions_bxl2_vars)
'T0'
'eventType_trip'
'Number of transported persons'
'Abandon reason NL', 'Abandon reason FR'
'T3'
'Latitude intervention'
'Longitude intervention'

print(interventions1_vars)
'Province intervention'
'eventType_trip'
'T0'
'T3'
'Number of transported persons'
'Abandon reason'
'Latitude intervention'
'Longitude intervention'

print(interventions2_vars)
'Province intervention'
'eventType_trip'
'T0'
'T3'
'Number of transported persons'
'Abandon reason'
'Latitude intervention'
'Longitude intervention'

print(interventions3_vars)
'Province intervention'
'eventType_trip'
'T0'
'T3'
'Number of transported persons'
'Abandon reason'
'Latitude intervention'
'Longitude intervention'

['province', 'Mission ID', 'Service Name', 'Latitude permanence', 'Longitude permanence', 'Permanence short name', 'Permanence long name', 'Vector Type', 'eventType_trip', 'EventSubType Trip', 'EventLevel Trip', 'CityName intervention', 'CitysectionName intervention', 'Latitude intervention', 'Longitude intervention', 'Province invervention', 'T0', 'T1', 'T1confirmed', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'Name destination hospital', 'Intervention time (T1Reported)', 'Intervention time (T1Confirmed)', 'Departure time (T1Reported)', 'Departure time (T1Confirmed)', 'UI', 'ID', 'MISSION_NR', 'AMBUCODE', 'UNIT_ID']
['mission_id', 'service_name', 'postalcode_permanence', 'cityname_permanence', 'streetname_permanence', 'housenumber_permanence', 'latitude_permanence', 'longitude_permanence', 'permanence_short_name', 'permanence_long_name', 'vector_type', 'eventtype_firstcall', 'eventLevel_firstcall', 'eventType_trip', 'eventlevel_trip', 'postalcode_intervention', 'cityname_intervention', 'lati

'Longitude intervention'

In [511]:
print(interventions_bxl2_vars)

['Mission ID', 'T0', 'Cityname Intervention', 'Longitude intervention', 'Latitude intervention', 'description_nl', 'ic_description_nl', 'eventType_trip', 'creationtime', 'Number of transported persons', 'Permanence long name NL', 'Permanence long name FR', 'Permanence short name NL', 'Permanence short name FR', 'Service Name NL', 'Service Name FR', 'Cityname Permanence', 'Streetname Permanence', 'Housenumber Permanence', 'Latitude Permanence', 'Longitude Permanence', 'Vector type NL', 'Vector type FR', 'Name destination hospital', 'Cityname destination hospital', 'Streetname destination hospital', 'Housenumber destination hospital', 'Abandon reason NL', 'Abandon reason FR', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7']


In [512]:
#renaming the other variables too

interventions_bxl = interventions_bxl.rename(columns={'t0': 'T0',
                                                     't3': 'T3',
                                                     'eventtype_trip': 'eventType_trip'})
interventions_bxl2 = interventions_bxl2.rename(columns={'Number of transported persons': 'number_of_transported_persons',
                                                       'Abandon reason NL': 'abandon_reason',
                                                        'Latitude intervention': 'latitude_intervention',
                                                        'Longitude intervention': 'longitude_intervention',
                                                        'EventType and EventLevel': 'eventType_trip'})
interventions1 = interventions1.rename(columns={'Number of transported persons': 'number_of_transported_persons',
                                                       'Abandon reason': 'abandon_reason',
                                                        'Latitude intervention': 'latitude_intervention',
                                                        'Longitude intervention': 'longitude_intervention',
                                                        'EventType Trip': 'eventType_trip',
                                                        'Province intervention': 'province'})
interventions2 = interventions2.rename(columns={'Number of transported persons': 'number_of_transported_persons',
                                                       'Abandon reason': 'abandon_reason',
                                                        'Latitude intervention': 'latitude_intervention',
                                                        'Longitude intervention': 'longitude_intervention',
                                                        'EventType Trip': 'eventType_trip',
                                                        'Province intervention': 'province'})
interventions3 = interventions3.rename(columns={'Number of transported persons': 'number_of_transported_persons',
                                                       'Abandon reason': 'abandon_reason',
                                                        'Latitude intervention': 'latitude_intervention',
                                                        'Longitude intervention': 'longitude_intervention',
                                                        'EventType Trip': 'eventType_trip',
                                                        'Province intervention': 'province'})
cad9 = cad9.rename(columns = {                          'Latitude intervention': 'latitude_intervention',
                                                        'Longitude intervention': 'longitude_intervention',
                                                         'EventType Trip': 'eventType_trip'})

In [513]:
cad9

Unnamed: 0,province,Mission ID,Service Name,Latitude permanence,Longitude permanence,Permanence short name,Permanence long name,Vector Type,eventType_trip,EventSubType Trip,...,Name destination hospital,Intervention time (T1Reported),Intervention time (T1Confirmed),Departure time (T1Reported),Departure time (T1Confirmed),UI,ID,MISSION_NR,AMBUCODE,UNIT_ID
0,VBR,21221520003,MV HVP VILV West,50.925277,4.423057,AVVILV01A,ZW VILVOORDE 1,Ambulance,P034 - SCHEDELTRAUMA,/,...,HV UR VILV AZ JAN PORTAELS,10.0,,2.0,,21221520003 1969661672,19189868,21221520003,240.0,MVVILV101
1,VBR,21221520004,MV HVP HALL West,50.743200,4.241053,AVHALL02A,ZW HALLE 2,Ambulance,P010 - ADEMHALINGSMOEILIJKHEDEN,/,...,HV UR HALL AZ ST MARIA,7.0,,3.0,,21221520004 1969661709,19189847,21221520004,240.0,MVHALL102
2,VBR,21221520007,MV HVP VILV West,50.925277,4.423057,AVVILV01A,ZW VILVOORDE 1,Ambulance,Y_TI BEVRIJDING PERSOON BEKLEMD/OPGESLOTEN+DGH,/,...,HV UR VILV AZ JAN PORTAELS,10.0,,4.0,,21221520007 1969664866,19190011,21221520007,240.0,MVVILV101
3,VBR,21221520007,HV UR VILV AZ Jan Portaels,50.926869,4.420968,UVVILV01A,MUG VILVOORDE,MUG,Y_TI BEVRIJDING PERSOON BEKLEMD/OPGESLOTEN+DGH,/,...,HV UR VILV AZ JAN PORTAELS,7.0,,1.0,,21221520007 1969665019,19190053,21221520007,0.0,MVVILV301
4,VBR,21221520008,BB BRUX Hôpital Militair KA,50.905331,4.387662,ABBRUX13A,AMB HMB 13,Ambulance,P020 - INTOXICATIE ALCOHOL,/,...,,14.0,,7.0,,21221520008 1969665449,19189967,21221520008,39.0,MBMILH101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289396,NAM,90221870001,BF SART BHN AMBU,,,AFSART02A,AMB SART-DAME-AVELINES 2,Ambulance,PERSONNES en danger / dans le besoin,/,...,,0.0,,,,90221870001 1972752051,19401306,90221870001,246.0,MFPRMA102
289397,NAM,90221870002,MF PDS BRAL Brabant Wallon,50.686351,4.395892,AFBRAL03A,AMB BRAINE-L'ALLEUD 3,Ambulance,PERSONNES en danger / dans le besoin,/,...,,0.0,,,,90221870002 1972753708,19401456,90221870002,200.0,MFSIBR103
289398,NAM,90221870003,MF PDS WAVR Brabant Wallon,50.698985,4.615385,AFWAVR01A,AMB WAVRE 1,Ambulance,PERSONNES en danger / dans le besoin,/,...,,0.0,,,,90221870003 1972754402,19401519,90221870003,200.0,MFSIWA101
289399,NAM,90221880001,BF SART BHN AMBU,,,AFSART01A,AMB SART-DAME-AVELINES 1,Ambulance,PERSONNES en danger / dans le besoin,/,...,,0.0,,,,90221880001 1972817859,19404988,90221880001,246.0,MFPRMA101


In [514]:
interventions2.columns.tolist()

['Mission ID',
 'Service Name',
 'PostalCode permanence',
 'CityName permanence',
 'StreetName permanence',
 'HouseNumber permanence',
 'Latitude permanence',
 'Longitude permanence',
 'Permanence short name',
 'Permanence long name',
 'Vector type',
 'EventType Firstcall',
 'EventLevel Firstcall',
 'eventType_trip',
 'EventLevel Trip',
 'PostalCode intervention',
 'CityName intervention',
 'latitude_intervention',
 'longitude_intervention',
 'province',
 'T0',
 'T1',
 'T1confirmed',
 'T2',
 'T3',
 'T4',
 'T5',
 'T6',
 'T7',
 'T9',
 'Intervention time (T1Reported)',
 'Intervention time (T1Confirmed)',
 'Waiting time',
 'Intervention duration',
 'Departure time (T1Reported)',
 'Departure time (T1Confirmed)',
 'Unavailable time',
 'Name destination hospital',
 'PostalCode destination hospital',
 'CityName destination hospital',
 'StreetName destination hospital',
 'HouseNumber destination hospital',
 'Calculated travelTime destinatio',
 'Calculated Distance destination',
 'number_of_tran

In [515]:
common_variables = ['eventType_trip', 'T0', 'T3', 'number_of_transported_persons', 'abandon_reason',
                    'latitude_intervention', 'longitude_intervention', 'province']
interventions_bxl_filtered = interventions_bxl.filter(common_variables)
interventions_bxl2_filtered = interventions_bxl2.filter(common_variables)
interventions1_filtered = interventions1.filter(common_variables)
interventions2_filtered = interventions2.filter(common_variables)
interventions3_filtered = interventions3.filter(common_variables)
cad9_filtered = cad9.filter(['eventType_trip', 'T0', 'T3', 'latitude_intervention', 'longitude_intervention', 'province'])

In [516]:
cad9_filtered

Unnamed: 0,eventType_trip,T0,T3,latitude_intervention,longitude_intervention,province
0,P034 - SCHEDELTRAUMA,2022-06-01 00:12:50.000,2022-06-01 00:24:46.000,50.896027,4.439554,VBR
1,P010 - ADEMHALINGSMOEILIJKHEDEN,2022-06-01 00:11:02.000,2022-06-01 00:22:34.000,50.745594,4.276564,VBR
2,Y_TI BEVRIJDING PERSOON BEKLEMD/OPGESLOTEN+DGH,2022-07-14 16:54:37.000,2022-06-01 01:17:45.000,50.931427,4.436373,VBR
3,Y_TI BEVRIJDING PERSOON BEKLEMD/OPGESLOTEN+DGH,2022-07-14 16:54:37.000,2022-06-01 01:17:45.000,50.931427,4.436373,VBR
4,P020 - INTOXICATIE ALCOHOL,2022-06-01 01:14:59.000,2022-06-01 01:31:13.000,50.884240,4.440077,VBR
...,...,...,...,...,...,...
289396,PERSONNES en danger / dans le besoin,,2022-07-06 18:40:52.000,50.563888,4.490994,NAM
289397,PERSONNES en danger / dans le besoin,,2022-07-06 19:08:28.000,50.689942,4.197239,NAM
289398,PERSONNES en danger / dans le besoin,,2022-07-06 19:20:02.000,50.698750,4.614590,NAM
289399,PERSONNES en danger / dans le besoin,,2022-07-07 12:57:39.000,50.474562,4.435452,NAM


In [517]:
from datetime import datetime
from dateutil import parser

#the +02:00 or +00:00 parts are removed, because it simply is unclear what it means in the first place
def remove_plus(dt_str):
    if pd.isna(dt_str):
        return None
    # find the index of the '+' character
    plus_index = dt_str.find('+')
    if plus_index != -1:
        dt_str = dt_str[:plus_index].strip()  # Remove timezone part
    return dt_str


# remove timezone information (convert to naive datetime)
interventions_bxl_filtered['T0'] = interventions_bxl_filtered['T0'].apply(remove_plus)
interventions_bxl_filtered['T3'] = interventions_bxl_filtered['T3'].apply(remove_plus)

#the data is unnecessarily precise
def truncate_to_microseconds(dt_str):
    if pd.isna(dt_str):
        return None
    # truncate the string to microseconds (6 digits) and remove any remaining part
    if '.' in dt_str:
        # split the seconds part and take only the first 6 digits after the decimal
        dt_str = dt_str.split('.')[0] + '.' + dt_str.split('.')[1][:6]
    return dt_str
interventions_bxl_filtered['T0'] = interventions_bxl_filtered['T0'].apply(truncate_to_microseconds)
interventions_bxl_filtered['T3'] = interventions_bxl_filtered['T3'].apply(truncate_to_microseconds)

In [518]:
def truncate_to_microseconds(dt_str):
    if pd.isna(dt_str):
        return None
    # Truncate the string to microseconds (6 digits) and remove any remaining part
    if '.' in dt_str:
        # Split the seconds part and take only the first 6 digits after the decimal
        dt_str = dt_str.split('.')[0] + '.' + dt_str.split('.')[1][:6]
    return dt_str
interventions_bxl_filtered['T0'] = interventions_bxl_filtered['T0'].apply(truncate_to_microseconds)
interventions_bxl_filtered['T3'] = interventions_bxl_filtered['T3'].apply(truncate_to_microseconds)
interventions_bxl_filtered

Unnamed: 0,eventType_trip,T0,T3,number_of_transported_persons,abandon_reason,latitude_intervention,longitude_intervention
0,P033 - Trauma,2022-09-06 11:49:21.586859,,,Error,5085139.0,436918.0
1,P033 - Trauma,2022-09-06 11:49:21.586859,2022-09-06 10:07:00.784280,,,5085139.0,436918.0
2,P059 - Dizziness - Nausea,2022-09-06 11:55:35.793679,,,Error,5083336.0,434504.0
3,P019 - Unconscious - syncope,2022-09-06 12:39:23.433732,,,Weigering van vervoer,5085076.0,436359.0
4,P033 - Trauma,2022-09-06 13:26:48.337914,,,Geannuleerd,508561.0,443169.0
...,...,...,...,...,...,...,...
115642,P069 - Wounds,2023-05-31 23:33:23.818779,2023-05-31 21:42:56.021592,,Weigering van vervoer,5086697.0,436657.0
115643,P069 - Wounds,2023-05-31 23:33:23.818779,,,Error,5086697.0,436657.0
115644,P002 - Agression - fight - rape,2023-05-31 23:41:50.181845,,,Error,5083525.0,43078.0
115645,P002 - Agression - fight - rape,2023-05-31 23:41:50.181845,2023-05-31 21:47:42.004570,,,5083525.0,43078.0


In [519]:
#parsing functions for each table
from datetime import datetime

def parse_cad9_filtered(dt_str):
    if pd.isna(dt_str):
        return None
    return datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S.%f")

def parse_interventions_bxl_filtered(dt_str):
    if pd.isna(dt_str):
        return None
    return datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S.%f")

def parse_interventions_bxl2_filtered(dt_str):
    if pd.isna(dt_str):
        return None
    return datetime.strptime(dt_str, "%d%b%y:%H:%M:%S")


#time in multiple formats is present in the interventions1, interventions2 and interventions3 tables.

def parse_interventions1_filtered(dt_str):
    if pd.isna(dt_str):
        return None
    formats = [
        "%Y-%m-%d %H:%M:%S.%f",  # format with microseconds
        "%d%b%y:%H:%M:%S"        # format with abbreviated month and no date separator
    ]
    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
    return None 

def parse_interventions2_filtered(dt_str):
    if pd.isna(dt_str):
        return None
    formats = [
        "%Y-%m-%d %H:%M:%S.%f",  # format with microseconds
        "%d%b%y:%H:%M:%S"        # format with abbreviated month and no date separator
    ]
    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
    return None 

def parse_interventions3_filtered(dt_str):
    if pd.isna(dt_str):
        return None
    formats = [
        "%Y-%m-%d %H:%M:%S.%f",  # format with microseconds
        "%d%b%y:%H:%M:%S"        # format with abbreviated month and no date separator
    ]
    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
    return None 


In [520]:
# List of dataframes and their corresponding parsing functions
data_frames = [
    ('cad9_filtered', cad9_filtered, parse_cad9_filtered),
    ('interventions_bxl_filtered', interventions_bxl_filtered, parse_interventions_bxl_filtered),
    ('interventions_bxl2_filtered', interventions_bxl2_filtered, parse_interventions_bxl2_filtered),
    ('interventions1_filtered', interventions1_filtered, parse_interventions1_filtered),
    ('interventions2_filtered', interventions2_filtered, parse_interventions2_filtered),
    ('interventions3_filtered', interventions3_filtered, parse_interventions3_filtered)
]

# process each dataframe
for name, df, parse_func in data_frames:
    df['parsed_T0'] = df['T0'].apply(parse_func)
    df['parsed_T3'] = df['T3'].apply(parse_func)
    
    # Calculate time difference in seconds, handling None values
    def calculate_time_difference(row):
        if row['parsed_T0'] is None or row['parsed_T3'] is None:
            return None
        return (row['parsed_T3'] - row['parsed_T0']).total_seconds()
    
    df['time_difference_seconds'] = df.apply(calculate_time_difference, axis=1)
    
    print(f"DataFrame: {name}")
    print(df[['T0', 'T3', 'time_difference_seconds']].head())
    print("\n")

DataFrame: cad9_filtered
                        T0                       T3  time_difference_seconds
0  2022-06-01 00:12:50.000  2022-06-01 00:24:46.000                    716.0
1  2022-06-01 00:11:02.000  2022-06-01 00:22:34.000                    692.0
2  2022-07-14 16:54:37.000  2022-06-01 01:17:45.000               -3771412.0
3  2022-07-14 16:54:37.000  2022-06-01 01:17:45.000               -3771412.0
4  2022-06-01 01:14:59.000  2022-06-01 01:31:13.000                    974.0


DataFrame: interventions_bxl_filtered
                           T0                          T3  \
0  2022-09-06 11:49:21.586859                        None   
1  2022-09-06 11:49:21.586859  2022-09-06 10:07:00.784280   
2  2022-09-06 11:55:35.793679                        None   
3  2022-09-06 12:39:23.433732                        None   
4  2022-09-06 13:26:48.337914                        None   

   time_difference_seconds  
0                      NaN  
1             -6140.802579  
2                  

In [521]:
#add provinces to the bxl data frames, so the province variable doesn't get lost
interventions_bxl_filtered['province'] = 'BRU'
interventions_bxl2_filtered['province'] = 'BRU'

In [522]:
#nu nog de locaties van de harstilstanden. Eerst filteren op hartstilstand, dan de locaties eruit halen.
card_arrest = pd.concat([interventions_bxl_filtered,
                        interventions_bxl2_filtered,
                        interventions1_filtered,
                        interventions2_filtered,
                        interventions3_filtered,
                        cad9_filtered
                        ],
                        ignore_index = True)
card_arrest
#evnttype is de var waar het staat, nu nog de code vinden voor de hartstilstanden.
#eventtype trip is used, since it provides the latest information on the reason for the call

Unnamed: 0,eventType_trip,T0,T3,number_of_transported_persons,abandon_reason,latitude_intervention,longitude_intervention,parsed_T0,parsed_T3,time_difference_seconds,province
0,P033 - Trauma,2022-09-06 11:49:21.586859,,,Error,5.085139e+06,436918.000000,2022-09-06 11:49:21.586859,NaT,,BRU
1,P033 - Trauma,2022-09-06 11:49:21.586859,2022-09-06 10:07:00.784280,,,5.085139e+06,436918.000000,2022-09-06 11:49:21.586859,2022-09-06 10:07:00.784280,-6140.802579,BRU
2,P059 - Dizziness - Nausea,2022-09-06 11:55:35.793679,,,Error,5.083336e+06,434504.000000,2022-09-06 11:55:35.793679,NaT,,BRU
3,P019 - Unconscious - syncope,2022-09-06 12:39:23.433732,,,Weigering van vervoer,5.085076e+06,436359.000000,2022-09-06 12:39:23.433732,NaT,,BRU
4,P033 - Trauma,2022-09-06 13:26:48.337914,,,Geannuleerd,5.085610e+05,443169.000000,2022-09-06 13:26:48.337914,NaT,,BRU
...,...,...,...,...,...,...,...,...,...,...,...
1045544,PERSONNES en danger / dans le besoin,,2022-07-06 18:40:52.000,,,5.056389e+01,4.490994,NaT,2022-07-06 18:40:52.000000,,NAM
1045545,PERSONNES en danger / dans le besoin,,2022-07-06 19:08:28.000,,,5.068994e+01,4.197239,NaT,2022-07-06 19:08:28.000000,,NAM
1045546,PERSONNES en danger / dans le besoin,,2022-07-06 19:20:02.000,,,5.069875e+01,4.614590,NaT,2022-07-06 19:20:02.000000,,NAM
1045547,PERSONNES en danger / dans le besoin,,2022-07-07 12:57:39.000,,,5.047456e+01,4.435452,NaT,2022-07-07 12:57:39.000000,,NAM


In [523]:
#By checking the "Belgian manual for medical regulation", we found that the following codes are relevant to our analysis:
#P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST)
#P011 - PIJN OP DE BORST
#P003 - HARTSTILSTAND - DOOD - OVERLEDEN
#P008 - PATIËNT MET DEFIBRILLATOR OF PACEMAKER
#mainly 003 is relevant, but the other codes mentioned can also play a role in deciding where to place an AED. It could be useful to verify with the
#stakeholder that the other cases besides P003 are also relevant, we will consider them to be so in this analysis.

In [524]:
#filteren op eventType_trip eerst, dan de not transported eruit filteren, dan de tijden berekenen

In [525]:
#inspecting the data first, get all unique values
card_arrest['eventType_trip'].unique()

array(['P033 - Trauma', 'P059 - Dizziness - Nausea',
       'P019 - Unconscious - syncope', 'P020 - Intoxication alcohol',
       'P002 - Agression - fight - rape', 'P074 - Palliative patient',
       'P015 - Epilepsy - convulsions', 'P021 - Intoxication drugs',
       'P097 - Collocation (planned)', 'P013 - Non-traumatic back pain',
       'P068 - Urogenital problem', 'P011 - Chest pain',
       'P012 - Non-traumatic abdominal pain', 'P026 - Unclear problem',
       'P001 - Traffic accident', 'P031 - Psychiatric problem',
       'P036 - Heat stroke - solar stroke', 'P075 - Cancer patient',
       'P010 - Respiratory problems', 'P009 - Diabetes',
       'P005 - Wounded by weapon', 'P069 - Wounds',
       'P032 - Allergic reactions', 'P004 - Stroke',
       'P018 - Long-term immobilisation - crushing of body and limbs',
       'P039 - Cardiac problem (other than thoracic pain)',
       'FI (1.3.0) fire building', 'P066 - Post-operative problem',
       'P061 - Limb hot or cold', 'HG (2.

In [526]:
substrings = ['p003', 'p008', 'p011', 'p039']
pattern = '|'.join(substrings) #regex pattern

#filter, case insensitive
card_arrest_filtered = card_arrest[card_arrest['eventType_trip'].str.contains(pattern, case=False, na=False)]

In [527]:
card_arrest_filtered

Unnamed: 0,eventType_trip,T0,T3,number_of_transported_persons,abandon_reason,latitude_intervention,longitude_intervention,parsed_T0,parsed_T3,time_difference_seconds,province
27,P011 - Chest pain,2022-09-06 14:51:41.968677,,,Error,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,NaT,,BRU
28,P011 - Chest pain,2022-09-06 14:51:41.968677,2022-09-06 13:00:11.171741,,,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,2022-09-06 13:00:11.171741,-6690.796936,BRU
29,P011 - Chest pain,2022-09-06 14:51:41.968677,2022-09-07 12:57:00.843000,1.0,,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,2022-09-07 12:57:00.843000,79518.874323,BRU
44,P011 - Chest pain,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,1.0,,5.089578e+06,435817.000000,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,-3828.336006,BRU
94,P011 - Chest pain,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,,,5.085592e+06,442592.000000,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,-6329.451913,BRU
...,...,...,...,...,...,...,...,...,...,...,...
1045503,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,,,5.089621e+01,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045504,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,,,5.089621e+01,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045516,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 21:38:03.000,2023-05-31 21:54:16.000,,,5.096558e+01,3.758418,2023-05-31 21:38:03.000000,2023-05-31 21:54:16.000000,973.000000,OVL
1045540,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 23:32:03.000,2023-05-31 23:52:06.000,,,5.082759e+01,4.045310,2023-05-31 23:32:03.000000,2023-05-31 23:52:06.000000,1203.000000,OVL


In [528]:
#it is uncertain how "Error" should be interpreted for the abandon_reason variable. It could be that it indicates a technical
#error on the level of the informatics behind the system, or it could mean that the call itself was an error, and therefore no one was transported.
#The "Number_of_transported_persons variable" seems inconsistent, there is no reason given for the refusal of transport yet there is also no value for
#it, so it is not clear whether or not someone was actually transported or not. Therefore we will not use the variable for further filtering.

abandon_reasons = card_arrest_filtered['abandon_reason'].unique()
abandon_reasons
#some of the reasons mean we should not take the data up in further analysis. Such reasons are for example that the call was a false alarm, or that the
#patient refuses to be transported
filter_reasons = ['Weigering van vervoer','Weigering vervoer', 'Geannuleerd', 'Geannuleerde rit', 'Kwaadwillig', 'Loos alarm goed bedoeld']
#we remove the instances where the patient was not transported because the call itself was bad
card_arrest_filtered_abandon = card_arrest_filtered[~card_arrest_filtered['abandon_reason'].isin(filter_reasons)]
len(card_arrest_filtered_abandon)

card_arrest_filtered_abandon = card_arrest_filtered_abandon.drop(columns = ['number_of_transported_persons', 'abandon_reason'])


In [529]:
card_arrest_filtered_abandon

Unnamed: 0,eventType_trip,T0,T3,latitude_intervention,longitude_intervention,parsed_T0,parsed_T3,time_difference_seconds,province
27,P011 - Chest pain,2022-09-06 14:51:41.968677,,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,NaT,,BRU
28,P011 - Chest pain,2022-09-06 14:51:41.968677,2022-09-06 13:00:11.171741,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,2022-09-06 13:00:11.171741,-6690.796936,BRU
29,P011 - Chest pain,2022-09-06 14:51:41.968677,2022-09-07 12:57:00.843000,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,2022-09-07 12:57:00.843000,79518.874323,BRU
44,P011 - Chest pain,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,5.089578e+06,435817.000000,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,-3828.336006,BRU
94,P011 - Chest pain,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,5.085592e+06,442592.000000,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,-6329.451913,BRU
...,...,...,...,...,...,...,...,...,...
1045503,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,5.089621e+01,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045504,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,5.089621e+01,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045516,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 21:38:03.000,2023-05-31 21:54:16.000,5.096558e+01,3.758418,2023-05-31 21:38:03.000000,2023-05-31 21:54:16.000000,973.000000,OVL
1045540,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 23:32:03.000,2023-05-31 23:52:06.000,5.082759e+01,4.045310,2023-05-31 23:32:03.000000,2023-05-31 23:52:06.000000,1203.000000,OVL


In [530]:
#remove interventions with missing or incomplete coordinates. We should again ask what happened to those. We cannot use these.
#a very large amount of instances get dropped because of missing coordinates, yet again demonstrating how messy the data is.
card_arrest_dropna = card_arrest_filtered_abandon.dropna(subset = ['latitude_intervention', 'longitude_intervention'])
card_arrest_dropna

Unnamed: 0,eventType_trip,T0,T3,latitude_intervention,longitude_intervention,parsed_T0,parsed_T3,time_difference_seconds,province
27,P011 - Chest pain,2022-09-06 14:51:41.968677,,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,NaT,,BRU
28,P011 - Chest pain,2022-09-06 14:51:41.968677,2022-09-06 13:00:11.171741,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,2022-09-06 13:00:11.171741,-6690.796936,BRU
29,P011 - Chest pain,2022-09-06 14:51:41.968677,2022-09-07 12:57:00.843000,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,2022-09-07 12:57:00.843000,79518.874323,BRU
44,P011 - Chest pain,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,5.089578e+06,435817.000000,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,-3828.336006,BRU
94,P011 - Chest pain,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,5.085592e+06,442592.000000,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,-6329.451913,BRU
...,...,...,...,...,...,...,...,...,...
1045503,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,5.089621e+01,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045504,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,5.089621e+01,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045516,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 21:38:03.000,2023-05-31 21:54:16.000,5.096558e+01,3.758418,2023-05-31 21:38:03.000000,2023-05-31 21:54:16.000000,973.000000,OVL
1045540,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 23:32:03.000,2023-05-31 23:52:06.000,5.082759e+01,4.045310,2023-05-31 23:32:03.000000,2023-05-31 23:52:06.000000,1203.000000,OVL


In [531]:
#remove duplicates
card_arrest_dupe = card_arrest_dropna.drop_duplicates()
card_arrest_dupe

Unnamed: 0,eventType_trip,T0,T3,latitude_intervention,longitude_intervention,parsed_T0,parsed_T3,time_difference_seconds,province
27,P011 - Chest pain,2022-09-06 14:51:41.968677,,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,NaT,,BRU
28,P011 - Chest pain,2022-09-06 14:51:41.968677,2022-09-06 13:00:11.171741,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,2022-09-06 13:00:11.171741,-6690.796936,BRU
29,P011 - Chest pain,2022-09-06 14:51:41.968677,2022-09-07 12:57:00.843000,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,2022-09-07 12:57:00.843000,79518.874323,BRU
44,P011 - Chest pain,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,5.089578e+06,435817.000000,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,-3828.336006,BRU
94,P011 - Chest pain,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,5.085592e+06,442592.000000,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,-6329.451913,BRU
...,...,...,...,...,...,...,...,...,...
1045496,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 19:47:06.000,2023-05-31 19:56:44.000,5.098011e+01,3.982742,2023-05-31 19:47:06.000000,2023-05-31 19:56:44.000000,578.000000,OVL
1045502,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 20:41:28.000,2023-05-31 20:53:46.000,5.113124e+01,4.111499,2023-05-31 20:41:28.000000,2023-05-31 20:53:46.000000,738.000000,OVL
1045503,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,5.089621e+01,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045516,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 21:38:03.000,2023-05-31 21:54:16.000,5.096558e+01,3.758418,2023-05-31 21:38:03.000000,2023-05-31 21:54:16.000000,973.000000,OVL


In [532]:
#there are still duplicates present in the dataset. We will remove duplicates by their coordinates and their time,
#since it is extremely unlikely that two interventions take place at the exact same coordinates, at the exact same time
card_arrest_unique = card_arrest_dupe.drop_duplicates(subset = ['latitude_intervention', 'longitude_intervention', 'T0'])
card_arrest_unique

Unnamed: 0,eventType_trip,T0,T3,latitude_intervention,longitude_intervention,parsed_T0,parsed_T3,time_difference_seconds,province
27,P011 - Chest pain,2022-09-06 14:51:41.968677,,5.087798e+06,439229.000000,2022-09-06 14:51:41.968677,NaT,,BRU
44,P011 - Chest pain,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,5.089578e+06,435817.000000,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,-3828.336006,BRU
94,P011 - Chest pain,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,5.085592e+06,442592.000000,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,-6329.451913,BRU
110,P011 - Chest pain,2022-09-06 17:14:30.554547,2022-09-06 15:28:15.574171,5.080938e+06,434743.000000,2022-09-06 17:14:30.554547,2022-09-06 15:28:15.574171,-6374.980376,BRU
136,P039 - Cardiac problem (other than thoracic pain),2022-09-06 18:21:26.211055,,5.086944e+06,43608.000000,2022-09-06 18:21:26.211055,NaT,,BRU
...,...,...,...,...,...,...,...,...,...
1045496,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 19:47:06.000,2023-05-31 19:56:44.000,5.098011e+01,3.982742,2023-05-31 19:47:06.000000,2023-05-31 19:56:44.000000,578.000000,OVL
1045502,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 20:41:28.000,2023-05-31 20:53:46.000,5.113124e+01,4.111499,2023-05-31 20:41:28.000000,2023-05-31 20:53:46.000000,738.000000,OVL
1045503,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,5.089621e+01,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045516,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 21:38:03.000,2023-05-31 21:54:16.000,5.096558e+01,3.758418,2023-05-31 21:38:03.000000,2023-05-31 21:54:16.000000,973.000000,OVL


In [533]:
#change the coordinates to the correct format
card_arrest_unique['latitude_intervention'] = card_arrest_unique['latitude_intervention'].apply(format_latitude)
card_arrest_unique['longitude_intervention'] = card_arrest_unique['longitude_intervention'].apply(format_longitude)
card_arrest_unique

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  card_arrest_unique['latitude_intervention'] = card_arrest_unique['latitude_intervention'].apply(format_latitude)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  card_arrest_unique['longitude_intervention'] = card_arrest_unique['longitude_intervention'].apply(format_longitude)


Unnamed: 0,eventType_trip,T0,T3,latitude_intervention,longitude_intervention,parsed_T0,parsed_T3,time_difference_seconds,province
27,P011 - Chest pain,2022-09-06 14:51:41.968677,,50.877980,4.392290,2022-09-06 14:51:41.968677,NaT,,BRU
44,P011 - Chest pain,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,50.895780,4.358170,2022-09-06 15:08:52.857352,2022-09-06 14:05:04.521346,-3828.336006,BRU
94,P011 - Chest pain,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,50.855920,4.425920,2022-09-06 16:36:15.795918,2022-09-06 14:50:46.344005,-6329.451913,BRU
110,P011 - Chest pain,2022-09-06 17:14:30.554547,2022-09-06 15:28:15.574171,50.809380,4.347430,2022-09-06 17:14:30.554547,2022-09-06 15:28:15.574171,-6374.980376,BRU
136,P039 - Cardiac problem (other than thoracic pain),2022-09-06 18:21:26.211055,,50.869440,4.360800,2022-09-06 18:21:26.211055,NaT,,BRU
...,...,...,...,...,...,...,...,...,...
1045496,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 19:47:06.000,2023-05-31 19:56:44.000,50.980111,3.982742,2023-05-31 19:47:06.000000,2023-05-31 19:56:44.000000,578.000000,OVL
1045502,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 20:41:28.000,2023-05-31 20:53:46.000,51.131239,4.111499,2023-05-31 20:41:28.000000,2023-05-31 20:53:46.000000,738.000000,OVL
1045503,P003 - HARTSTILSTAND - DOOD - OVERLEDEN,2023-05-31 20:45:48.000,2023-05-31 20:57:17.000,50.896206,3.856032,2023-05-31 20:45:48.000000,2023-05-31 20:57:17.000000,689.000000,OVL
1045516,P039 - CARDIAAL PROBLEEM (NIET PIJN OP DE BORST),2023-05-31 21:38:03.000,2023-05-31 21:54:16.000,50.965585,3.758418,2023-05-31 21:38:03.000000,2023-05-31 21:54:16.000000,973.000000,OVL


In [534]:
card_arrest_unique = card_arrest_unique.drop(columns = ['T0', 'T3'])

In [539]:
#store the final processed tables
card_arrest_unique.to_csv('card_arrest_processed.csv', index = False)
all_hospital_locations.to_csv('hospital_coordinates_processed.csv', index = False)
removed_AED.to_csv('removed_AED.csv', index = False)
cleaned_geocoded_results.to_csv('aed_locations_coordinates_processed.csv', index = False)