# Yearly Observations by location

Jaarlijkse observaties per locatie zoals gedefinieerd op waarnemingen.be. Het voordeel van deze waarnemingen.be-locaties is dat ze een kleinere oppervlakte bestrijken dan de 581 gemeentes in België: er zijn er immers een 14 000-tal. <br> We gebruiken enkel de locaties met een gekende oppervlakte en koppelen daaraan de waarnemingen die aan deze locaties zijn toegekend.

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

# show every column when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)
# show every row when a DataFrame is displayed (no truncation)
pd.set_option('display.max_rows', None)

## Load clean or gold data

In [2]:
# Locations of the cleaned inputs and of the per-species "gold" observation files.
yearly = '../2_cleaning/clean_data/observations_yearly_clean.parquet'
locations = '../2_cleaning/clean_data/locations_clean.parquet'
boomklever = '../3_transformation/gold/observations_bk.parquet'
halsbandparkiet = '../3_transformation/gold/observations_hp.parquet'

# Read the four parquet files, in the order listed, with the pyarrow engine.
df_yearly_birds, df_locations, df_observations_bk, df_observations_hp = (
    pd.read_parquet(parquet_path, engine="pyarrow")
    for parquet_path in (yearly, locations, boomklever, halsbandparkiet)
)

## Load and transform clean data

Er zijn meer indexen (location_id) voor de locaties dan namen. Dit wil zeggen dat er verschillende locatie-id's zijn met dezelfde naam.

In [3]:
# number of distinct values in the index of df_locations
df_locations.index.nunique()

13966

In [4]:
# number of distinct location names (compare with the number of distinct index values)
df_locations['location_name'].nunique()

13920

In [5]:
# Move location_id from the index into a regular column so it can serve as a merge key.
# The guard keeps this cell idempotent: re-running it would otherwise reset the
# index a second time and add a spurious "index" column.
if 'location_id' not in df_locations.columns:
    df_locations = df_locations.reset_index()
df_locations.head()

Unnamed: 0,location_id,location_name,province,province_id,municipality,municipality_id,location_type,page,area_km2
0,241620,A Fladjon,Liège,22.0,Ans (entité),43743.0,Gebied,1,0.00478
1,252470,A la Creux (réserve naturelle),Luxembourg,26.0,Léglise (entité),43953.0,Gebied,1,0.1813
2,241688,A St-Jacques,Namur,24.0,Somme-Leuze (entité),43891.0,Gebied,1,0.3022
3,27478,Aaigem (Dg),Oost-Vlaanderen,16.0,Erpe-Mere,24153.0,Deelgemeente,1,7.41
4,31885,Aalbeke - Allartpark,West-Vlaanderen,15.0,Aalbeke (Dg),24033.0,Gebied,1,0.0216


In [6]:
# Build a frame holding every (location_id, year) pair, so that a location/year
# without any observation can still be given a row later on.
all_locations = df_locations['location_id'].unique()
all_years = df_yearly_birds.index.unique()
all_combinations = (
    pd.MultiIndex
    .from_product([all_locations, all_years], names=["location_id", "year"])
    .to_frame(index=False)
)

def group_by_year_and_location_and_calculate_fields(df_observations, locations=None, yearly_birds=None, combinations=None):
    """Aggregate one species' observations per location and year and derive metrics.

    Parameters
    ----------
    df_observations : pandas.DataFrame
        One row per observation. The columns ``date`` (datetime), ``location_id``,
        ``observation_id`` and ``observer_id`` are read. The frame is not modified.
    locations : pandas.DataFrame, optional
        Frame with ``location_id`` and ``area_km2`` columns. Defaults to the
        notebook-level ``df_locations``.
    yearly_birds : pandas.DataFrame, optional
        Yearly totals that can be merged on ``year`` and that provide the column
        ``allbirds_observation_count``. Defaults to the notebook-level
        ``df_yearly_birds``.
    combinations : pandas.DataFrame, optional
        Every ``(location_id, year)`` pair that must be present in the result.
        Defaults to the notebook-level ``all_combinations``.

    Returns
    -------
    pandas.DataFrame
        One row per pair in ``combinations`` with the columns ``location_id``,
        ``year``, ``observation_count``, ``observers_count``, ``observations_pym``,
        ``observations_pym_5yr_avg``, ``observations_growth_5yr_%``,
        ``observers_per_km2`` and ``observations_per_km2``.
        The two density columns are NaN where no area is available for the row
        (no observations for that location/year, or a location without area).
    """
    # Fall back to the notebook-level frames so one-argument calls keep working.
    locations = df_locations if locations is None else locations
    yearly_birds = df_yearly_birds if yearly_birds is None else yearly_birds
    combinations = all_combinations if combinations is None else combinations

    # Derive the year on a copy, so the caller's frame does not gain a column.
    observations = df_observations.assign(year=df_observations["date"].dt.year)

    # Attach the area of the location each observation belongs to.
    observations = observations.merge(locations[["location_id", "area_km2"]], on="location_id", how="left")

    # Number of observations and of distinct observers per location and year.
    per_location_year = (
        observations
        .groupby(["location_id", "year"])
        .agg(
            observation_count=("observation_id", "count"),
            observers_count=("observer_id", "nunique"),
            area_km2=("area_km2", "mean"),
        )
        .reset_index()
    )

    # Make sure every requested (location, year) pair has a row.
    result = combinations.merge(per_location_year, on=["location_id", "year"], how="left")
    # Only the counts become 0 for pairs without observations. area_km2 stays NaN,
    # because a zero area would turn the densities below into a division by zero.
    count_columns = ["observation_count", "observers_count"]
    result[count_columns] = result[count_columns].fillna(0)

    # Share per million bird observations recorded in the same year.
    result = result.merge(yearly_birds, on='year', how='left')
    result['observations_pym'] = result['observation_count'] * 1_000_000 / result['allbirds_observation_count']

    # Rolling mean over up to 5 rows per location in year order (smooths fluctuations);
    # this is a 5-year mean provided `combinations` holds consecutive years.
    result['observations_pym_5yr_avg'] = (
        result.sort_values('year')
        .groupby(['location_id'])['observations_pym']
        .transform(lambda x: x.rolling(window=5, min_periods=1).mean())
    )
    # % change of that rolling mean compared with 5 rows (years) earlier.
    # Growing from 0 gives inf and 0 -> 0 gives NaN; callers have to handle both.
    result['observations_growth_5yr_%'] = (
        result.sort_values('year')
        .groupby(['location_id'])['observations_pym_5yr_avg']
        .transform(lambda x: x.pct_change(periods=5) * 100)
    )
    # Observers per km2
    result['observers_per_km2'] = result['observers_count'] / result['area_km2']
    # Observations per km2
    result['observations_per_km2'] = result['observation_count'] / result['area_km2']

    # These two columns are identical for every species, so they are not returned here.
    return result.drop(columns=['allbirds_observation_count', 'area_km2'])
    
# Per-location, per-year metrics for each of the two species.
result_hp = group_by_year_and_location_and_calculate_fields(df_observations_hp)
result_bk = group_by_year_and_location_and_calculate_fields(df_observations_bk)

# Put both species side by side (columns suffixed _hp / _bk), then add the
# location attributes and the yearly allbirds_observation_count.
yearly_by_location = (
    result_hp
    .merge(result_bk, on=['year', 'location_id'], how='outer', suffixes=("_hp", "_bk"))
    .merge(df_locations[["location_name", "location_id", "area_km2"]], on=["location_id"], how='outer')
    .merge(df_yearly_birds, on='year', how='left')
)



In [7]:
# summary of the merged frame: columns, non-null counts and dtypes
yearly_by_location.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754164 entries, 0 to 754163
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   location_id                   754164 non-null  int64  
 1   year                          754164 non-null  int64  
 2   observation_count_hp          754164 non-null  float64
 3   observers_count_hp            754164 non-null  float64
 4   observations_pym_hp           754164 non-null  float64
 5   observations_pym_5yr_avg_hp   754164 non-null  float64
 6   observations_growth_5yr_%_hp  26323 non-null   float64
 7   observers_per_km2_hp          13033 non-null   float64
 8   observations_per_km2_hp       13033 non-null   float64
 9   observation_count_bk          754164 non-null  float64
 10  observers_count_bk            754164 non-null  float64
 11  observations_pym_bk           754164 non-null  float64
 12  observations_pym_5yr_avg_bk   754164 non-nul

In [8]:
# Replace the remaining NaN values by 0 (e.g. growth is NaN when there are no observations).
yearly_by_location = yearly_by_location.fillna(0)

# Cap growth to the range -100% .. 100%: this tames outliers and the infinities
# that arise when growing from 0 observations.
growth_columns = ['observations_growth_5yr_%_hp', 'observations_growth_5yr_%_bk']
yearly_by_location[growth_columns] = yearly_by_location[growth_columns].clip(lower=-100, upper=100)

yearly_by_location.head(10)

Unnamed: 0,location_id,year,observation_count_hp,observers_count_hp,observations_pym_hp,observations_pym_5yr_avg_hp,observations_growth_5yr_%_hp,observers_per_km2_hp,observations_per_km2_hp,observation_count_bk,observers_count_bk,observations_pym_bk,observations_pym_5yr_avg_bk,observations_growth_5yr_%_bk,observers_per_km2_bk,observations_per_km2_bk,location_name,area_km2,allbirds_observation_count
0,23088,1971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,2242
1,23088,1972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,5281
2,23088,1973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,6547
3,23088,1974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,9548
4,23088,1975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,9115
5,23088,1976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,7035
6,23088,1977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,7394
7,23088,1978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,11301
8,23088,1979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,15202
9,23088,1980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elingen (Dg),2.99,23498


In [9]:
# sanity check: show rows that still contain a NaN in any column, most recent years first
yearly_by_location[yearly_by_location.isna().any(axis=1)].sort_values(["year"], ascending=False).head(5)

Unnamed: 0,location_id,year,observation_count_hp,observers_count_hp,observations_pym_hp,observations_pym_5yr_avg_hp,observations_growth_5yr_%_hp,observers_per_km2_hp,observations_per_km2_hp,observation_count_bk,observers_count_bk,observations_pym_bk,observations_pym_5yr_avg_bk,observations_growth_5yr_%_bk,observers_per_km2_bk,observations_per_km2_bk,location_name,area_km2,allbirds_observation_count


## Write result to parquet-file in "gold" folder

In [10]:
# Write the combined per-location, per-year table to the "gold" folder.
yearly_by_location.to_parquet('./gold/yearly_observations_by_location.parquet', engine="pyarrow")