# Locations

This step explores and cleans the locations.<br>
Locations without an area are dropped, and the area is converted to km2.
At the end, the result is written to a .parquet file to preserve the data types.

In [17]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)
# Show every row when a DataFrame is displayed (no truncation)
pd.set_option('display.max_rows', None)

## Data exploring

In [18]:
# Location of the cleaned location details CSV, relative to this notebook
file = '../1_scraping/scraped_data/cleaned/location_details_clean.csv'

# Read the CSV: the first row holds the column names, location_id becomes the index
df_locations = pd.read_csv(file, index_col="location_id", header=0)

# Show the first five rows
df_locations.head(5)

Unnamed: 0_level_0,location_name,province,province_id,municipality,municipality_id,location_type,page,area
location_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
241620,A Fladjon,Liège,22.0,Ans (entité),43743.0,Gebied,1,4.780 m2
252470,A la Creux (réserve naturelle),Luxembourg,26.0,Léglise (entité),43953.0,Gebied,1,"18,13 ha"
241688,A St-Jacques,Namur,24.0,Somme-Leuze (entité),43891.0,Gebied,1,"30,22 ha"
27478,Aaigem (Dg),Oost-Vlaanderen,16.0,Erpe-Mere,24153.0,Deelgemeente,1,"7,41 km2"
31885,Aalbeke - Allartpark,West-Vlaanderen,15.0,Aalbeke (Dg),24033.0,Gebied,1,"2,16 ha"


In [19]:
# Inspect the index that was built from the location_id column
df_locations.index

Index([241620, 252470, 241688,  27478,  31885,  71763,  83947,  24033, 724399,
        83946,
       ...
        29903, 191328,  28347, 191310,  29933,  29905, 191315,  29906,  29904,
        28705],
      dtype='int64', name='location_id', length=14040)

In [20]:
# Column dtypes and non-null counts, to spot columns with missing values
df_locations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14040 entries, 241620 to 28705
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   location_name    14040 non-null  object 
 1   province         14040 non-null  object 
 2   province_id      14039 non-null  float64
 3   municipality     14040 non-null  object 
 4   municipality_id  13646 non-null  float64
 5   location_type    14040 non-null  object 
 6   page             14040 non-null  int64  
 7   area             13966 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 987.2+ KB


In [21]:
# Summary statistics; include='all' also covers the non-numeric columns
df_locations.describe(include='all')

Unnamed: 0,location_name,province,province_id,municipality,municipality_id,location_type,page,area
count,14040,14040,14039.0,14040,13646.0,14040,14040.0,13966
unique,13994,12,,716,,4,,7152
top,Ancienne Carrière de l'Eau Bleue (La Bruyère),Oost-Vlaanderen,,—,,Gebied,,"1,1 km2"
freq,2,2270,,394,,10895,,25
mean,,,19.169243,,33974.214568,,281.300214,
std,,,3.881507,,15465.777904,,162.125842,
min,,,14.0,,23089.0,,1.0,
25%,,,16.0,,23441.0,,141.0,
50%,,,17.0,,24169.5,,281.0,
75%,,,23.0,,43850.0,,422.0,


## Cleaning

In [22]:
# Clean on a copy so the loaded df_locations frame stays unchanged
df_locations_cleaned = df_locations.copy()

### Area
Only keep locations that have an area, and convert every area (given in m2, a, ha or km2) to km2.


In [23]:
# Rows without an area value cannot be converted, so leave them out
df_locations_cleaned = df_locations_cleaned.dropna(subset=["area"])

In [24]:
def transform_area_to_km2(area):
    """Convert an area string such as "18,13 ha" to a float in km2.

    The numeric part uses "." as thousands separator and "," as decimal
    separator (e.g. "4.780 m2" is 4780 m2, "7,41 km2" is 7.41 km2).

    Args:
        area: Area text in the form "<value> <unit>", where the unit is one
            of "m2", "a", "ha" or "km2".

    Returns:
        The area in square kilometres as a float.

    Raises:
        ValueError: If the text is not a value followed by a unit, if the
            unit is not supported, or if the value is not a number.
    """
    # How many of each unit fit into one km2.
    units_per_km2 = {
        "km2": 1,
        "ha": 100,
        "a": 10000,
        "m2": 1000000,
    }

    # split() without a separator splits on any run of whitespace, so repeated,
    # leading or non-breaking spaces around value and unit do not break parsing.
    parts = area.split()
    if len(parts) < 2:
        raise ValueError(f"Expected '<value> <unit>', got: {area!r}")
    area_value, area_unit = parts[0], parts[1]

    if area_unit not in units_per_km2:
        raise ValueError(f"Unknown area unit: {area_unit}")

    # "4.780" -> "4780" (thousands separator), then "18,13" -> "18.13" (decimal comma)
    area_value = area_value.replace(".", "").replace(",", ".")
    return float(area_value) / units_per_km2[area_unit]
    

# Derive the numeric km2 column from the raw area text, one value at a time
df_locations_cleaned["area_km2"] = df_locations_cleaned["area"].apply(transform_area_to_km2)

In [25]:
# Spot-check rows whose raw area contains a "." (matched literally, not as a regex)
df_locations_cleaned.loc[lambda frame: frame["area"].str.contains(".", regex=False)].head(5)

Unnamed: 0_level_0,location_name,province,province_id,municipality,municipality_id,location_type,page,area,area_km2
location_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
241620,A Fladjon,Liège,22.0,Ans (entité),43743.0,Gebied,1,4.780 m2,0.00478
126250,Aarschot - Sleedoornpage - Biezenhuiskes - Hou...,Vlaams-Brabant,20.0,Aarschot,23400.0,Gebied,3,1.724 m2,0.001724
126249,Aarschot - Sleedoornpage - Biezenhuiskes - Hou...,Vlaams-Brabant,20.0,Aarschot,23400.0,Gebied,3,1.065 m2,0.001065
191799,Aarsele - Spoorweg tssn overgang N459 en veldweg,West-Vlaanderen,15.0,Tielt,24063.0,Gebied,3,2.869 m2,0.002869
191595,Aarsele - Spoorwegberm voorbij overgang tot pa...,West-Vlaanderen,15.0,Tielt,24063.0,Gebied,3,8.480 m2,0.00848


In [26]:
# Spot-check rows whose raw area contains a "," (matched literally)
df_locations_cleaned.loc[lambda frame: frame["area"].str.contains(",", regex=False)].head(5)

Unnamed: 0_level_0,location_name,province,province_id,municipality,municipality_id,location_type,page,area,area_km2
location_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
252470,A la Creux (réserve naturelle),Luxembourg,26.0,Léglise (entité),43953.0,Gebied,1,"18,13 ha",0.1813
241688,A St-Jacques,Namur,24.0,Somme-Leuze (entité),43891.0,Gebied,1,"30,22 ha",0.3022
27478,Aaigem (Dg),Oost-Vlaanderen,16.0,Erpe-Mere,24153.0,Deelgemeente,1,"7,41 km2",7.41
31885,Aalbeke - Allartpark,West-Vlaanderen,15.0,Aalbeke (Dg),24033.0,Gebied,1,"2,16 ha",0.0216
71763,Aalbeke - Potyzer,West-Vlaanderen,15.0,Aalbeke (Dg),24033.0,Gebied,1,"2,08 ha",0.0208


In [27]:
# The raw text column is no longer needed once area_km2 has been derived.
# Reassign instead of using inplace=True so the step reads as an explicit transformation.
df_locations_cleaned = df_locations_cleaned.drop(columns=["area"])

## Final check and write to parquet file

In [28]:
# Preview the cleaned frame before writing it out
df_locations_cleaned.head(20)

Unnamed: 0_level_0,location_name,province,province_id,municipality,municipality_id,location_type,page,area_km2
location_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
241620,A Fladjon,Liège,22.0,Ans (entité),43743.0,Gebied,1,0.00478
252470,A la Creux (réserve naturelle),Luxembourg,26.0,Léglise (entité),43953.0,Gebied,1,0.1813
241688,A St-Jacques,Namur,24.0,Somme-Leuze (entité),43891.0,Gebied,1,0.3022
27478,Aaigem (Dg),Oost-Vlaanderen,16.0,Erpe-Mere,24153.0,Deelgemeente,1,7.41
31885,Aalbeke - Allartpark,West-Vlaanderen,15.0,Aalbeke (Dg),24033.0,Gebied,1,0.0216
71763,Aalbeke - Potyzer,West-Vlaanderen,15.0,Aalbeke (Dg),24033.0,Gebied,1,0.0208
83947,Aalbeke - Vijver Vandecasteele,West-Vlaanderen,15.0,Kortrijk,24030.0,Gebied,1,0.2079
24033,Aalbeke (Dg),West-Vlaanderen,15.0,Kortrijk,24030.0,Deelgemeente,1,7.35
724399,"Aalbeke/Marke - Kobbepoel, Preshoekbos",West-Vlaanderen,15.0,Kortrijk,24030.0,Gebied,1,0.0534
83946,Aalbeke/Marke - Preshoekbos en omgeving,West-Vlaanderen,15.0,Aalbeke (Dg),24033.0,Gebied,1,4.09


In [29]:
# Confirm dtypes and non-null counts of the cleaned frame before writing it out
df_locations_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13966 entries, 241620 to 28705
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   location_name    13966 non-null  object 
 1   province         13966 non-null  object 
 2   province_id      13965 non-null  float64
 3   municipality     13966 non-null  object 
 4   municipality_id  13572 non-null  float64
 5   location_type    13966 non-null  object 
 6   page             13966 non-null  int64  
 7   area_km2         13966 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 982.0+ KB


In [30]:
# Create the output folder first: to_parquet does not create missing directories
output_file = Path('./clean_data/locations_clean.parquet')
output_file.parent.mkdir(parents=True, exist_ok=True)

# index=True writes the location_id index along with the columns
df_locations_cleaned.to_parquet(output_file, index=True, engine="pyarrow")