# Yearly Observations by municipality

Om de regionale verschillen ook in kaart te kunnen brengen, groeperen we niet enkel per jaar, maar ook per gemeente.

In [17]:
import pandas as pd
import geopandas as gpd
import folium
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm # Logaritmische verdeling
import matplotlib.cm as cm
import matplotlib.colors as colors

# Lift pandas' display limits: show every column and every row of a frame.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Gemeenten in België

In [18]:
# GeoDataFrame of the Belgian municipalities: read the boundary file and keep
# only the rows whose OSM admin level is '8' (stored as a string).
belgium = gpd.read_file('../resources/geodata/kontur_boundaries_BE_20230628.gpkg')
is_admin_level_8 = belgium["osm_admin_level"].eq('8')
belgium_gemeenten = belgium.loc[is_admin_level_8].copy()
belgium_gemeenten.head(5)

Unnamed: 0,admin_level,osm_admin_level,name,name_en,population,hasc,geometry
55,9,8,Tenneville,,2861.0,,"MULTIPOLYGON (((5.39024 50.07851, 5.39032 50.0..."
56,9,8,Rendeux,,2596.0,,"MULTIPOLYGON (((5.44198 50.22504, 5.44237 50.2..."
57,9,8,Boechout,,12318.0,,"MULTIPOLYGON (((4.47911 51.16276, 4.47986 51.1..."
58,9,8,Ville de Bruxelles - Stad Brussel,Brussels,220447.0,,"MULTIPOLYGON (((4.314 50.8934, 4.318 50.89451,..."
59,9,8,Spiere-Helkijn,,1939.0,,"MULTIPOLYGON (((3.32376 50.72324, 3.32429 50.7..."


## Load clean or gold data

In [19]:
# Input files: the cleaned yearly table (merged later on 'year') and the gold
# observation tables of the two species (bk = boomklever, hp = halsbandparkiet).
# Plain string literals: the paths contain no placeholders, so no f-prefix.
yearly = '../2_cleaning/clean_data/observations_yearly_clean.parquet'
boomklever = '../3_transformation/gold/observations_bk.parquet'
halsbandparkiet = '../3_transformation/gold/observations_hp.parquet'

df_yearly_birds = pd.read_parquet(yearly, engine="pyarrow")
df_observations_bk = pd.read_parquet(boomklever, engine="pyarrow")
df_observations_hp = pd.read_parquet(halsbandparkiet, engine="pyarrow")

## Load and transform clean data

In [20]:
# Build a frame holding every (municipality name, year) combination, so that
# combinations without any observation can still get a row later on.
all_names = belgium_gemeenten['name'].unique()
all_years = df_yearly_birds.index.unique()
name_year_grid = pd.MultiIndex.from_product(
    [all_names, all_years],
    names=["name", "year"],
)
all_combinations = name_year_grid.to_frame(index=False)

def group_by_year_and_municipality_and_calculate_fields(df_observations):
    """Aggregate point observations per municipality and year.

    Relies on three notebook-level objects: ``belgium_gemeenten`` (municipality
    polygons with a ``name`` column), ``all_combinations`` (the complete
    name x year grid) and ``df_yearly_birds`` (merged on ``year``; it must
    supply the ``allbirds_observation_count`` column).

    Parameters
    ----------
    df_observations : pandas.DataFrame
        Observations with the columns ``date`` (datetime-like, ``.dt`` is
        used), ``longitude``, ``latitude``, ``observation_id`` and
        ``observer_id``. Coordinates are interpreted as EPSG:4326.
        The frame is not modified by this function.

    Returns
    -------
    pandas.DataFrame
        One row per (name, year) pair of ``all_combinations`` with:
        ``observation_count`` / ``observers_count`` (distinct ids, 0 when
        there were none), the columns of ``df_yearly_birds``,
        ``observations_pym`` (observations per million yearly bird
        observations), ``observations_pym_5yr_avg`` (rolling mean over up to
        5 rows per municipality, ordered by year) and
        ``observations_growth_5yr_%`` (percentage change of that average
        versus 5 rows earlier). The growth is NaN for the first 5 rows of a
        municipality and for 0 -> 0, and +/-inf when the earlier average is 0.
    """
    # Derive the year on a copy: the caller's DataFrame must stay untouched so
    # that re-running a cell or reusing the input gives the same result.
    observations = df_observations.assign(year=df_observations["date"].dt.year)
    geometry = gpd.points_from_xy(observations['longitude'], observations['latitude'])
    gpd_observations = gpd.GeoDataFrame(observations, geometry=geometry, crs="EPSG:4326")

    # Link every observation to the municipality polygon it lies within.
    # The right join also keeps municipalities without observations; their
    # 'year' is NaN, so the groupby below drops those rows again.
    gpd_observations = gpd.sjoin(gpd_observations, belgium_gemeenten, how="right", predicate="within")

    # Number of distinct observations and observers per municipality and year.
    result = (
        gpd_observations
        .groupby(["name", "year"])
        .agg(
            observation_count=('observation_id', 'nunique'),
            observers_count=('observer_id', 'nunique'),
        )
        .reset_index()
    )
    # Make sure every (name, year) combination is present; missing ones get 0.
    result = all_combinations.merge(result, on=["name", "year"], how='left')
    result = result.fillna(0)

    # Share per million of all bird observations in that year.
    result = result.merge(df_yearly_birds, on='year', how='left')
    result['observations_pym'] = result['observation_count'] * 1_000_000 / result['allbirds_observation_count']

    # 5-year rolling average (smooths out yearly fluctuations). The transform
    # result keeps the original row labels, so the assignment aligns on index.
    result['observations_pym_5yr_avg'] = result.sort_values('year').groupby('name')['observations_pym'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())
    # Growth in % of the 5-year average compared with 5 rows (years) earlier.
    result['observations_growth_5yr_%'] = result.sort_values('year').groupby('name')['observations_pym_5yr_avg'].transform(lambda x: x.pct_change(periods=5) * 100)
    return result
    
# Run the aggregation once per species (hp = halsbandparkiet, bk = boomklever).
result_hp = group_by_year_and_municipality_and_calculate_fields(df_observations_hp)
result_bk = group_by_year_and_municipality_and_calculate_fields(df_observations_bk)

# Put both species side by side: one row per (year, name); every other column
# appears twice, suffixed with _hp and _bk.
yearly_by_municipal = result_hp.merge(
    result_bk,
    on=['year', 'name'],
    how='outer',
    suffixes=("_hp", "_bk"),
)

# NaN values are left as they are here (no fillna is applied to the merged frame).
# Preview the 10 rows with the highest index labels, in descending order.
yearly_by_municipal.sort_index(ascending=False).head(10)

Unnamed: 0,name,year,observation_count_hp,observers_count_hp,allbirds_observation_count_hp,observations_pym_hp,observations_pym_5yr_avg_hp,observations_growth_5yr_%_hp,observation_count_bk,observers_count_bk,allbirds_observation_count_bk,observations_pym_bk,observations_pym_5yr_avg_bk,observations_growth_5yr_%_bk
31373,Étalle,2024,30.0,16.0,3270062,9.174138,7.754548,-4.321828,0.0,0.0,3270062,0.0,0.0,
31372,Érezée,2024,24.0,16.0,3270062,7.33931,9.6811,10.608389,0.0,0.0,3270062,0.0,0.0,
31371,Éghezée,2024,9.0,7.0,3270062,2.752241,1.672696,2.830882,3.0,2.0,3270062,0.917414,0.183483,inf
31370,Écaussinnes,2024,1.0,1.0,3270062,0.305805,0.693,9.058051,2.0,2.0,3270062,0.611609,1.925987,-63.63567
31369,Zwijndrecht,2024,0.0,0.0,3270062,0.0,0.17894,-1.221051,116.0,22.0,3270062,35.473334,19.715053,21666.3
31368,Zwevegem,2024,2.0,2.0,3270062,0.611609,1.666456,28.402626,14.0,8.0,3270062,4.281264,1.28055,145.0236
31367,Zwalm,2024,3.0,3.0,3270062,0.917414,5.27176,410.149017,0.0,0.0,3270062,0.0,0.105047,inf
31366,Zutendaal,2024,85.0,29.0,3270062,25.993391,27.069916,321.917769,0.0,0.0,3270062,0.0,0.0,-100.0
31365,Zulte,2024,0.0,0.0,3270062,0.0,0.237205,146.688003,0.0,0.0,3270062,0.0,0.120676,inf
31364,Zuienkerke,2024,0.0,0.0,3270062,0.0,0.0,,0.0,0.0,3270062,0.0,0.0,-100.0


## Write result to parquet-file in "gold" folder

In [21]:
# Persist the merged per-municipality table in the local "gold" folder.
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
yearly_by_municipal.to_parquet('./gold/yearly_observations_by_municipality.parquet', engine="pyarrow")