In [187]:
import pandas as pd
import os

# Define the directory and file names
directory = os.getcwd() + "/data/"

# Read the parquet files
interventions_bxl = pd.read_parquet(directory + "interventions_bxl.parquet.gzip")
interventions_bxl2 = pd.read_parquet(directory + "interventions_bxl2.parquet.gzip")
interventions1 = pd.read_parquet(directory + "interventions1.parquet.gzip")
interventions2 = pd.read_parquet(directory + "interventions2.parquet.gzip")
interventions3 = pd.read_parquet(directory + "interventions3.parquet.gzip")
cad9 = pd.read_parquet(directory + "cad9.parquet.gzip")
aed_locations = pd.read_parquet(directory + "aed_locations.parquet.gzip")
ambulance_locations = pd.read_parquet(directory + "ambulance_locations.parquet.gzip")
mug_locations = pd.read_parquet(directory + "mug_locations.parquet.gzip")
pit_locations = pd.read_parquet(directory + "pit_locations.parquet.gzip")

In [52]:
print(len(aed_locations))

15227


In [53]:
# we want to store all AED's considered poorly performing and the reason for why they are considered so. During the data cleaning, 
# some AED's will already get filtered, and are considered poorly performing.

# creating a dataframe to store these AED's. Only the ID and the reason for removal are stored.
removed_AED = pd.DataFrame(columns = ['id','reason'])

# function to filter and store the AED's
def filter_and_log_removed(df, filter_condition, reason):
    # apply the filter condition
    removed = df[filter_condition]
    
    # append removed devices to the removed_AED DataFrame
    global removed_AED
    removed_AED = pd.concat([removed_AED, pd.DataFrame({'id': removed['id'], 'reason': reason})])
    
    # Remove the filtered devices from the original DataFrame
    df = df.drop(removed.index)
    
    return df

In [54]:
# count the number of missing addresses in aed_locations
aed_missing_location_count = aed_locations['address'].isna().sum()
print(aed_missing_location_count)

# filter out instances where address is missing. Those instances are invalid, since we cannot know where the AED was placed
# first we inspect these instances, perhaps address was present under the wrong variable
aed_locations_with_na = aed_locations[aed_locations['address'].isna()]
print(aed_locations_with_na)

# apply our function in order to filter out and store the filtered instances
aed_locations_cleaned = filter_and_log_removed(aed_locations, aed_locations['address'].isna(), 'address_missing')

2
          id  type address  number  postal_code  municipality    province  \
6092  7323.0  None    None     NaN       3910.0      Neerpelt     Limburg   
6212  7467.0  None    None     0.0       6870.0  Saint-Hubert  Luxembourg   

     location public available hours  
6092     None   None      None  None  
6212     None   None      None  None  


  removed_AED = pd.concat([removed_AED, pd.DataFrame({'id': removed['id'], 'reason': reason})])


In [55]:
len(aed_locations_cleaned) #check

15225

In [56]:
aed_missing_number_count = aed_locations['number'].isna().sum()
print(aed_missing_number_count)

# proportion missing:
print(aed_missing_number_count/aed_locations_cleaned.shape[0])

# around 14% of the cleaned instances has a missing streetnumber. This is a significant amount, which means that simply removing them would have too
#much of an impact on the results. Therefore an alternative solution needs to be found.(midden van de straat, maar dat is wel vrij intensief? hoe weet
#je wat het straatnummer is in het midden van de straat?)
#in a real world scenario, one might ask the stakeholder what happened with this data, perhaps the data is available somewhere else.

2142
0.1406896551724138


In [57]:
aed_type_counts = aed_locations['type'].value_counts()
print(aed_type_counts)

aed_missing_type_count = aed_locations['type'].isna().sum()
print(aed_missing_type_count)
print(len(aed_locations))
# the majority of the known AED types in the cleaned table is fixed apparatus (5166, or 33.93%) , with mobile apparatus only a very small part 
#(17, or 0.11%) of the group, and only #one of M5066A type. The majority is of unknown type (2142, or 66.07%).
#in a real world scenario it would again be possible to ask the stakeholder about these missing values.
#we assume that all types of devices are useable.

type
Appareil fixe-Vast apparaat         5048
Appareil fixe                         66
Appareil Fixe                         19
Appareil Mobile- Mobiel apparaat      17
Vast apparaat                         16
M5066A                                 1
Name: count, dtype: int64
10060
15227


In [58]:
#we can remove the type column, we don't need it for further analysis
aed_locations_cleaned = aed_locations_cleaned.drop(columns = ['type'])

In [59]:
aed_locations.head(10)

Unnamed: 0,id,type,address,number,postal_code,municipality,province,location,public,available,hours
0,13.0,,Blvd. Fr. Roosevelt,24.0,7060.0,SOIGNIES,Hainaut,,Y,,
1,70.0,,Ch. De Wégimont,76.0,4630.0,Ayeneux,Liège,,,,
2,71.0,,Place Saint - Lambert,,4020.0,Liège,Liège,,,,
3,72.0,,Rue du Doyard,,4990.0,Lierneux,Liège,,,,
4,73.0,,Fond Saint Servais,,4000.0,Liège,Liège,,,,
5,74.0,,Rue des Prémontrès,12.0,4020.0,Liège,Liège,,,,
6,75.0,,Route de Bastogne,1.0,4920.0,Harzé,Liège,,,,
7,76.0,,Rue du Parc,1.0,4540.0,Jehay,Liège,,,,
8,77.0,,Blvd. De la Constitution,19.0,4020.0,Liège,Liège,,,,
9,78.0,,Place de la République française,,4000.0,Liège,Liège,,,,


In [60]:
aed_location_counts = aed_locations['location'].value_counts()
print(aed_location_counts)
#the location variable contains a description of where the AED is located exactly. This is not useable for the analysis, so the variable is simply
#removed later on
#aed_locations_cleaned.drop(columns = ['location'])

location
/                                              314
.                                              289
Accueil                                        196
                                                89
INKOM                                           81
                                              ... 
Gelijkvloers buiten tegen loods                  1
Hoofdgebouw, 1ste verdieping, thv cafetaria      1
reddersgebouw aan zwemvijver                     1
Locatie : ingang poort hoofdgebouw               1
 inkomhal, glvl                                  1
Name: count, Length: 5732, dtype: int64


In [61]:
aed_public_counts = aed_locations['public'].value_counts()
print(aed_public_counts)
print(aed_locations['public'].isna().sum())
#1220 devices are not publicly available. It would be useful to have more information on these, to decide whether or not they could still be used in
#case of a cardiac arrest.
aed_nonpublic_aed = aed_locations[(aed_locations['public'] == 'Non-Nee') | (aed_locations['public'] == 'N')]
aed_nonpublic_aed_with_available = aed_nonpublic_aed.dropna(subset=['available'])#only check for the ones that have a value for available, then we can
#decide for those if they are useable
print(aed_nonpublic_aed_with_available['available'].value_counts())
#there seem to be contradictions with the data, where it says for the public variable that it is not available, but for the available variable that it is
#available. Only a very small amount actually gives more details.
#remove degene die op beide nee zeggen? en ook diegene die enkel op bepaalde uren beschikbaar zijn? in principe zou je ze nog altijd kunnen gebruiken,
#je zou ze kunnen markeren, maar dat is zoveel moeite, zoveel aanpassingen aan het project, voor slechts 15 aeds ofzo. kijk ook eens naar de counts voor
#de andere, die wel 'ja' publically available zijn.
#we are optimistic, and asume that if there is one yes in either the public variable or the available variable, we assume that it is a useable AED.
#there are 14 other AED's which give the hours at which they are available. These are available most of the time, so they are considered to be 
#available.

public
Oui-Ja     3937
Y          2775
Non-Nee    1109
y           129
N           111
Oui          11
Ja            6
J             1
Name: count, dtype: int64
7148
available
Non-Nee                                                          748
Oui-Ja                                                           230
Pendant heures d ouverture du site                                 1
Nee                                                                1
8:00 - 17:00                                                       1
Heure de bureau en semaine                                         1
Heures de bureau                                                   1
Heures de bureau                                                   1
selon heures d ouverture d Euro-Délices                            1
enkel tijdens de kantooruren (8 - 19u)                             1
tijdens werkuren                                                   1
Accessible par toute personne présente dans l inrfastructure.    

In [62]:
#removing the AED's for which both the available and the public variables contain a negative
# the conditions
filter_condition = ((aed_locations['public'] == 'Non-Nee') | (aed_locations['public'] == 'N')) & ((aed_locations['available'] == 'Non-Nee') |
                                                                                                 (aed_locations['available'] == 'Nee'))
#there was a single entry for which the value of the variable available was simply "nee" instead of "Non-Nee"
aed_locations_cleaned = filter_and_log_removed(aed_locations_cleaned, filter_condition, 'not_available')

#we can now remove the available and the public variables, since we do not need them in the further analysis
aed_locations_cleaned = aed_locations_cleaned.drop(columns = ['available', 'public', 'location'])

  removed = df[filter_condition]


In [63]:
#checking the value of hours for the remaining entries
aed_hours_counts = aed_locations['hours'].value_counts()
print(aed_hours_counts)
print(aed_locations['hours'].isna().sum())
print(aed_locations_cleaned['hours'].isna().sum())
print(1 - aed_locations_cleaned['hours'].isna().sum()/len(aed_locations_cleaned))
#most values for the hours seem to overlap with office hours. only 5.42% of the instances has a value for hours. We consider the AED's available at
#office hours to be available when they are needed.

hours
8h-17h                                                                                                                            42
8h-16h                                                                                                                            20
24/24                                                                                                                             19
HEURE DE BUREAU                                                                                                                   13
Tijdens openingsuren recyclagepark                                                                                                10
                                                                                                                                  ..
ma-vrij 8u-17u                                                                                                                     1
En fonction des locations                                      

In [64]:
#we can remove the hours variable now
aed_locations_cleaned.drop(columns = ['hours'])

Unnamed: 0,id,address,number,postal_code,municipality,province
0,13.0,Blvd. Fr. Roosevelt,24.0,7060.0,SOIGNIES,Hainaut
1,70.0,Ch. De Wégimont,76.0,4630.0,Ayeneux,Liège
2,71.0,Place Saint - Lambert,,4020.0,Liège,Liège
3,72.0,Rue du Doyard,,4990.0,Lierneux,Liège
4,73.0,Fond Saint Servais,,4000.0,Liège,Liège
...,...,...,...,...,...,...
15220,16660.0,Chaussée de Marche,799.0,5100.0,Wierde,Namur
15221,16661.0,Nekkerspoel-Borcht,19.0,2800.0,Mechelen,Antwerpen
15223,16664.0,Nieuwe Dreef,17.0,9160.0,Lokeren,Oost-Vlaanderen
15224,16665.0,Panterschipstraat,207.0,9000.0,Gent,Oost-Vlaanderen


In [66]:
#751 AED's were removed from the analysis. 2 because their address was missing, 749 because they were not available. The ones 
#where address is missing do not necessarily need to be reallocated, the ones that are not available could be allocated. Perhaps the AED's not available
#to the public are placed in large facilities, e.g. sports facilities, where they actually do serve a purpose. This would need to be verified for each 
#of these AEDs individually.
removed_AED

Unnamed: 0,id,reason
6092,7323.0,address_missing
6212,7467.0,address_missing
750,1412.0,not_available
1123,1881.0,not_available
3050,4024.0,not_available
...,...,...
15211,16650.0,not_available
15212,13572.0,not_available
15213,16651.0,not_available
15222,16662.0,not_available


In [None]:
#for the other tables, we simply need to remove unnecessary variables, and merge the different tables.
#all the tables containing info can be merged into one big table, all the tables containing information from the first responders can be merged.

#de interventies tabellen bevatten ook info over de permanence. Die kan ook meegenomen worden, hoewel in principe die info al aanwezig is in de 
#locations tabellen.

In [72]:
#first we verify whether the unit is permanently staffed. If it is not, we do not consider it useable because it is considered
#unreliable, therefore an AED close to it's location can still be useful.
ambulance_value_counts = ambulance_locations['occasional_permanence'].value_counts()
print(ambulance_value_counts)
ambulance_locations_filtered = ambulance_locations[ambulance_locations['occasional_permanence'] == "N"]



occasional_permanence
N    251
Y     28
Name: count, dtype: int64


251

In [None]:
mug_id enzo

#degene die gelinkt zijn aan ziekenhuizen

In [74]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/125.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/125.4 kB ? eta -:--:--
   --- ------------------------------------ 10.2/125.4 kB ? eta -:--:--
   --- ------------------------------------ 10.2/125.4 kB ? eta -:--:--
   --------- ----------------------------- 30.7/125.4 kB 259.2 kB/s eta 0:00:01
   ------------------------- ------------- 81.9/125.4 kB 508.4 kB/s eta 0:00:01
   -------------------------------------- 125.4/125.4 kB 669.7 kB/s eta 0:00:00
Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
   ---------------------------------------- 0.0/40.3 kB ? eta -:--:--
   ---------------------------------------- 40.3/40.3 kB 1.9 MB/s eta 0:00:00
Installing collected packages

In [130]:
#get the latitude and longitude of the PIT locations, since these are missing from the table.

#might not be 100% accurate

#some of the hospitals did not get recognized, so the address was manually looked up and inputted in the list. That way, the
#exact longitude and latitude could still be calculated.



# initialize geolocator
geolocator = Nominatim(user_agent="pit_locator")

# data in lists
# need to input the names manually in the list, since the useful part (for example "AZ RIVIERENLAND") is not consistently present in any variable.
#simply inputting the entire string value of the campus variable doesn't work, for example "104 - AZ RIVIERENLAND --- Campus/Site: 1270 - RUMST" yields
#no results.
data = {
    'name': ['AZ RIVIERENLAND ', 'AZ SINT-VINCENTIUS DEINZE', 'ZNA STUIVENBERG', ' HEILIG HART ZIEKENHUIS 1260',
             'HEILIG HART ZIEKENHUIS MOL', 'VITAZ CAMPUS LOKEREN', 'OLV ZIEKENHUIS ASSE', 'AZ SINT-JAN BRUGGE - OOSTENDE',
             'AZ SINT-LUCAS', 'AZ ZENO BLANKENBERGE', 'OOST-LIMBURG SINT-JAN GENK', 'UZ LEUVEN GASTHUISBERG',
             'HOPITAL IRIS SUD', 'CHU SAINT-PIERRE - PORTE DE HAL', 'UZ BRUSSEL', 'GRAND HOPITAL DE CHARLEROI SAINT- JOSEPH',
             'C.H.U. AMBROISE PARE', 'CHR SAMBRE ET MEUSE (SITE MEUSE)', 'CLINIQUE ANDRE RENARD', 'HOPITAL DE LA CITADELLE',
             'CHR VERVIERS - LA TOURELLE', None, None, None],
    'address': [None, None, None, None, None, None, None, None, None, None,
                None, 'Rue Jean Paquot 63', 'Boulevard de Waterloo 129' , '290, rue Haute', 'Rue Marguerite Depasse 6', 'Boulevard Président Kennedy 2',
                'Avenue Albert 1er 185', 'rue André Renard 1', None, None,
                None, 'Rue des Déportés 137', "Rue d'Harnoncourt 48", 'Avenue de Houffalize 35']
}
# create a DataFrame
hospitals_df = pd.DataFrame(data)

# DataFrame to store pit hospital coordinates
hospitals_with_coords = pd.DataFrame(columns=['id', 'name', 'municipality', 'address', 'latitude', 'longitude'])


# function to geocode hospital names or addresses if hospital names themselves are not sufficient
def geocode_hospitals(df):
    for index, row in df.iterrows():
        hospital_name = row['name']
        address = row['address']
        
        # determine the query based on available information
        if address:
            query = address
        else:
            query = f"{hospital_name}, Belgium"
        
        try:
            location = geolocator.geocode(query, timeout=10)
            if location:
                df.loc[index, 'address'] = location.address
                df.loc[index, 'latitude'] = location.latitude
                df.loc[index, 'longitude'] = location.longitude
            else:
                df.loc[index, 'address'] = None
                df.loc[index, 'latitude'] = None
                df.loc[index, 'longitude'] = None
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error geocoding {query}: {e}")
            df.loc[index, 'address'] = None
            df.loc[index, 'latitude'] = None
            df.loc[index, 'longitude'] = None
    
    return df

# geocode the hospitals
pit_hospitals_with_coords = geocode_hospitals(hospitals_df)

#drop columns, we only need the coordinates
pit_hospitals_with_coords = pit_hospitals_with_coords.loc[:, ['latitude', 'longitude']]

# display the hospitals DataFrame with addresses and coordinates
print(pit_hospitals_with_coords)

     latitude  longitude
0   51.099783   4.232622
1   50.986425   3.527572
2   51.223649   4.434875
3   50.810888   4.933485
4   51.187526   5.114405
5   51.101340   4.000929
6   50.909904   4.196023
7   51.222791   2.914147
8   51.062547   3.720756
9   51.307809   3.124371
10  50.957775   5.517608
11  50.825930   4.378332
12  50.833473   4.345943
13  50.835695   4.348181
14  50.418054   4.480988
15  43.219179   0.079410
16  50.467587   4.888018
17  50.599613   5.631759
18  50.674721   5.633436
19  50.652037   5.578166
20  50.582211   5.859296
21  49.584723   3.054645
22  49.552645   5.526049
23  49.927931   5.384295


In [120]:
# dataFrame to store hospital details with addresses and coordinates
mug_hospitals_with_coords = pd.DataFrame(columns=['latitude', 'longitude'])

def geocode_hospitals(df):
    for index, row in df.iterrows():
        address = row['address_campus']
        municipality = row['municipality']
        
        # construct the query using address and municipality
        query = f"{address}, {municipality}, Belgium"
        
        try:
            location = geolocator.geocode(query, timeout=10)
            if location:
                df.loc[index, 'latitude'] = location.latitude
                df.loc[index, 'longitude'] = location.longitude
            else:
                df.loc[index, 'latitude'] = None
                df.loc[index, 'longitude'] = None
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error geocoding {query}: {e}")
            df.loc[index, 'latitude'] = None
            df.loc[index, 'longitude'] = None

    return df

# geocode the hospitals
mug_hospitals_with_coords = geocode_hospitals(mug_locations)

# display the hospitals DataFrame with addresses and coordinates
print(mug_hospitals_with_coords)


    hospital_id  mug_id  campus_id                      name_hospital  \
0             9  102000       6230  ZIEKENHUISNETWERK ANTWERPEN (ZNA)   
1            99  102000       2020                  GZA- ZIEKENHUIZEN   
2           682  102000       1210                          AZ MONICA   
3             9  103000       2000  ZIEKENHUISNETWERK ANTWERPEN (ZNA)   
4            99  103000       1290                  GZA- ZIEKENHUIZEN   
..          ...     ...        ...                                ...   
89           20  808000       3010        CHR VERVIERS - EAST BELGIUM   
90          168  901000       3720                           VIVALIA    
91          246  902000       3690                           VIVALIA    
92          168  903000       3240                           VIVALIA    
93          164  904000       3230                           VIVALIA    

             name_campus           address_campus  postal_code  \
0              ZNA CADIX         KEMPENSTRAAT 100        

In [122]:
#remove all but the coordinate variables
mug_hospitals_with_coords = mug_hospitals_with_coords.loc[:, ['latitude', 'longitude']]
print(mug_hospitals_with_coords)

     latitude  longitude
0   51.231422   4.416284
1   51.205856   4.412845
2   51.206556   4.470778
3   51.401310   4.763188
4   51.174354   4.420085
..        ...        ...
89  50.715566   6.007128
90  50.159132   5.683118
91  49.678031   5.820700
92  49.927931   5.384295
93  50.228223   5.323121

[94 rows x 2 columns]


In [131]:
ambulance_locations_with_coords = ambulance_locations.loc[:, ['latitude', 'longitude']]
all_hospital_locations = pd.concat([mug_hospitals_with_coords, pit_hospitals_with_coords, ambulance_locations_with_coords], ignore_index = True)
all_hospital_locations

Unnamed: 0,latitude,longitude
0,51.231422,4.416284
1,51.205856,4.412845
2,51.206556,4.470778
3,51.401310,4.763188
4,51.174354,4.420085
...,...,...
392,50.025633,5.359641
393,50.674803,5.633665
394,50.652888,5.578356
395,50.622523,5.636381


In [None]:
#we can simply put all the coordinates in one table, to get a single table of hospital coordinates.



In [188]:
# the coordinates should match the coordinates from the calls. dus dat kan ik verifiëren.
# if the data is good, there should be a perfect overlap between the hospital permanence and the interventions permanence

interventions_bxl_coords = interventions_bxl.loc[:, ['latitude_permanence', 'longitude_permanence']]
interventions_bxl2_coords = interventions_bxl2.loc[:, ['Latitude Permanence', 'Longitude Permanence']]
interventions1_coords = interventions1.loc[:, ['Latitude permanence', 'Longitude permanence']]
interventions2_coords = interventions2.loc[:, ['Latitude permanence', 'Longitude permanence']]
interventions3_coords = interventions3.loc[:, ['Latitude permanence', 'Longitude permanence']]
cad9_coords = cad9.loc[:, ['Latitude permanence', 'Longitude permanence']]

# need to divide, there was no decimal point in the coordinates for the BXL interventions
# the inputted coordinates are not consistent in the amount of precision, so simply dividing all by 10**5 would not work, for example one coordinate 
#would be 500.123456, and another would be 5.00123456, while we want 50.0123456
#interventions_bxl_coords = interventions_bxl_coords/10**5
#interventions_bxl2_coords = interventions_bxl2_coords/10**5

"""
# Apply the function to the DataFrame
interventions_bxl_coords['latitude_permanence'] = interventions_bxl_coords['latitude_permanence'].apply(format_coordinate)
interventions_bxl_coords['longitude_permanence'] = interventions_bxl_coords['longitude_permanence'].apply(format_coordinate)

interventions_bxl2_coords['latitude_permanence'] = interventions_bxl2_coords['latitude_permanence'].apply(format_coordinate)
interventions_bxl2_coords['longitude_permanence'] = interventions_bxl2_coords['longitude_permanence'].apply(format_coordinate)
"""


# rename columns to match
interventions_bxl2_coords = interventions_bxl2_coords.rename(columns={
    'Latitude Permanence': 'latitude_permanence',
    'Longitude Permanence': 'longitude_permanence'
})
interventions1_coords = interventions1_coords.rename(columns={
    'Latitude permanence': 'latitude_permanence',
    'Longitude permanence': 'longitude_permanence'
})
interventions2_coords = interventions2_coords.rename(columns={
    'Latitude permanence': 'latitude_permanence',
    'Longitude permanence': 'longitude_permanence'
})
interventions3_coords = interventions3_coords.rename(columns={
    'Latitude permanence': 'latitude_permanence',
    'Longitude permanence': 'longitude_permanence'
})
cad9_coords = cad9_coords.rename(columns={
    'Latitude permanence': 'latitude_permanence',
    'Longitude permanence': 'longitude_permanence'
})

In [189]:
#finding the minimum value, so we can divide all values and speed up the process in the following cell
min_lat_bxl = interventions_bxl_coords['latitude_permanence'].min()
min_long_bxl = interventions_bxl_coords['longitude_permanence'].min()

min_lat_bxl2 = interventions_bxl_coords['latitude_permanence'].min()
min_long_bxl2 = interventions_bxl_coords['longitude_permanence'].min()

print(min_lat_bxl)
print(min_long_bxl)
print(min_lat_bxl2)
print(min_long_bxl2)

#so for latitude the initial divide can be by 10**4, for longitude it can be 10**3, based on the printed values:
#508047
#43089
#508047
#43089



508047
43089
508047
43089


'\n508047\n43089\n508047\n43089\n'

In [190]:
#adjusting the values for the bxl table, so that they are correctly formatted (with decimal point at the right place)
#latitude needs to be 2 numbers before the decimal

# using a while loop to divide by 10 is very slow, this solution is very fast
import math
def format_latitude(value):
    value / 10**4   #DIT KAN IK EIGENLIJK ERVOOR DOEN, AANGEZIEN HET HETZELFDE IS VOOR BEIDE KOLOMMEN, AL ERVOOR DELEN.
    if value > 99:
        value = value / (10**math.floor(math.log10(value) - 1))
    return value

# longitude needs to be 1 number before the decimal
def format_longitude(value):
    value / 10**4
    if value > 9:
        value = value / (10**math.floor(math.log10(value)))
    return value


#die shit met een loop doen, if waarde > 45 then /10 telkens opnieuw, totdat het niet meer groter is dan 45. je kan starten met al direct delen door
#een bepaalde grote waarde, kijk naar de kleinste waarde die er is 




interventions_bxl_coords['latitude_permanence'] = interventions_bxl_coords['latitude_permanence'].apply(format_latitude)
interventions_bxl_coords['longitude_permanence'] = interventions_bxl_coords['longitude_permanence'].apply(format_longitude)

interventions_bxl2_coords['latitude_permanence'] = interventions_bxl2_coords['latitude_permanence'].apply(format_latitude)
interventions_bxl2_coords['longitude_permanence'] = interventions_bxl2_coords['longitude_permanence'].apply(format_longitude)


In [191]:
#concatenating it into one big table
all_permanence_locations = pd.concat([interventions_bxl_coords,
                                     interventions_bxl2_coords,
                                     interventions1_coords,
                                     interventions2_coords,
                                     interventions3_coords,
                                     cad9_coords], ignore_index = True)
all_permanence_locations_unique = all_permanence_locations.drop_duplicates()

In [192]:
#there are more locations here than there are for the hospitals, which should not have happened. This shows yet again how messy
#the data is. Both will be used, merged into one big table.
#CONTROLEER OOK OF HET NIET AAN MEZELF LIGT, DAT IK GEEN FOUT HEB GEMAAKT
all_permanence_locations_unique

#de lengte van alle dinks zou ook gelijk moeten zijn. anders komen er op die manier mogelijk duplicates in. dus check dat ook.

Unnamed: 0,latitude_permanence,longitude_permanence
0,50.850970,4.364110
2,50.834330,4.345450
4,50.852110,4.460400
6,50.842260,4.399250
8,50.869480,4.386490
...,...,...
952153,51.179015,4.361991
952385,51.156827,4.415969
978425,51.205074,4.396444
1030254,51.210317,4.051975


In [193]:
#sort, to inspect it easier
sorted_latitude = all_permanence_locations_unique.sort_values('latitude_permanence')
sorted_latitude
#waarden die niet kloppen, te hoog zijn, NaN. uitzoeken waar ze vandaan komen



Unnamed: 0,latitude_permanence,longitude_permanence
667087,49.53729,5.78139
510267,49.55051,5.51832
349892,49.56358,5.82436
411251,49.56499,5.53480
345579,49.67429,5.59904
...,...,...
664543,,518.61500
664666,,546.42200
665759,,551.87300
665809,,528.09200


In [194]:
sorted_longitude = all_permanence_locations_unique.sort_values('longitude_permanence')
sorted_longitude

Unnamed: 0,latitude_permanence,longitude_permanence
337925,51.09634,2.58895
358534,51.06213,2.66631
350083,50.85924,2.72573
491463,51.12847,2.75747
337756,50.74621,2.82250
...,...,...
480350,,612.71800
582029,,612.73900
115649,,
296752,50.74320,


In [198]:
import pandas as pd

# Assuming all_permanence_locations_unique is already loaded

# Count NaN values for each column
nan_counts = all_permanence_locations_unique.isna().sum()
print("Count of NaN values for each column:")
print(nan_counts)

# Remove rows with any NaN values
df_cleaned = all_permanence_locations_unique.dropna()
print("\nDataFrame after removing rows with NaN values:")
print(df_cleaned)

# Count longitude values out of range (either above 9 or below 3)
longitude_out_of_range_count = (
    ((all_permanence_locations_unique['longitude_permanence'] > 9) |
     (all_permanence_locations_unique['longitude_permanence'] < 3))
).sum()
print("\nCount of longitude values out of range:")
print(longitude_out_of_range_count)

# Count latitude values out of range (either above 60 or below 40)
latitude_out_of_range_count = (
    ((all_permanence_locations_unique['latitude_permanence'] > 60) |
     (all_permanence_locations_unique['latitude_permanence'] < 40))
).sum()
print("\nCount of latitude values out of range:")
print(latitude_out_of_range_count)

#MOETEN DAN OOK DE NA VERWIJDEREN, EN DAN DE FOUTE WAARDES INSPECTEREN EN AANPASSEN INDIEN GEPAST

Count of NaN values for each column:
latitude_permanence     289
longitude_permanence      3
dtype: int64

DataFrame after removing rows with NaN values:
         latitude_permanence  longitude_permanence
0                  50.850970              4.364110
2                  50.834330              4.345450
4                  50.852110              4.460400
6                  50.842260              4.399250
8                  50.869480              4.386490
...                      ...                   ...
952153             51.179015              4.361991
952385             51.156827              4.415969
978425             51.205074              4.396444
1030254            51.210317              4.051975
1036343            51.251317              4.248720

[654 rows x 2 columns]

Count of longitude values out of range:
332

Count of latitude values out of range:
32
