# Yearly sources

Uit welke bronnen haalt waarnemingen.be zijn informatie?



In [3]:
import pandas as pd

# Lift pandas' display truncation so frames are rendered in full:
# no cap on the number of columns shown ...
pd.options.display.max_columns = None
# ... and no cap on the number of rows shown.
pd.options.display.max_rows = None

## Load clean or gold data

In [4]:
# Paths of the per-species observation files in the transformation
# step's "gold" folder (relative to this notebook's directory).
boomklever = '../3_transformation/gold/observations_bk.parquet'
halsbandparkiet = '../3_transformation/gold/observations_hp.parquet'

# Read both files with the pyarrow engine, in the same order as the paths above.
df_boomklever, df_halsbandparkiet = (
    pd.read_parquet(parquet_path, engine="pyarrow")
    for parquet_path in (boomklever, halsbandparkiet)
)

## Merge data and group by source and year

In [5]:
# Per-species lookup table.
#   'afkorting' (Dutch: abbreviation) - short code for the species
#   'soort'     (Dutch: species)      - Dutch species name
# NOTE(review): the integer keys are presumably the species ids used by the
# data source - confirm against the ingestion step.
species = {
    116: dict(afkorting='hp', soort='halsbandparkiet'),
    70: dict(afkorting='bk', soort='boomklever'),
}

In [6]:
def group_by_year_and_calculate_fields(df_species):
    """Count distinct observations per (year, source) for one species.

    The input frame is left untouched: the derived columns are computed on a
    copy, so re-running the cell (or reusing the frame later) sees the
    original data.

    Parameters
    ----------
    df_species : pandas.DataFrame
        Observations of a single species. Must provide a datetime-like
        ``date`` column, a ``source`` column (missing values allowed) and an
        ``observation_id`` that is either a column or an index level.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(year, source)`` with a single ``observation_count``
        column holding the number of unique ``observation_id`` values.
    """
    prepared = df_species.assign(
        # Observations without a known source are grouped under "onbekend"
        # (Dutch for "unknown") instead of being dropped by the groupby.
        source=df_species["source"].fillna("onbekend"),
        year=df_species["date"].dt.year,
    )
    df_species_yearly = (
        prepared
        # reset_index() exposes index levels as columns, so observation_id
        # can be aggregated even when it is stored in the index.
        .reset_index()
        .groupby(["year", "source"])
        .agg(observation_count=("observation_id", "nunique"))
    )
    return df_species_yearly
    
# Yearly observation counts per source, one frame per species.
df_halsbandparkiet_yearly = group_by_year_and_calculate_fields(df_halsbandparkiet)
df_boomklever_yearly = group_by_year_and_calculate_fields(df_boomklever)

# Combine both species into one table keyed on (year, source). The outer join
# keeps combinations that occur for only one of the species.
df_observations_yearly = (
    pd.merge(
        df_halsbandparkiet_yearly,
        df_boomklever_yearly,
        on=['year', 'source'],
        how='outer',
        suffixes=("_hp", "_bk"),
    )
    # no observations -> 0 instead of NaN
    .fillna(0)
    # sort_index() returns a new frame; chaining it makes the sort take
    # effect (its result used to be discarded).
    .sort_index(ascending=True)
    # Turn the (year, source) index into columns, then index by year only so
    # that "source" stays available as a regular column.
    .reset_index()
    .set_index("year")
)

# Spot check: the rows for 2011 (at most 10).
df_observations_yearly[df_observations_yearly.index == 2011].head(10)



Unnamed: 0_level_0,source,observation_count_hp,observation_count_bk
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011,Gebiedsdekkend ingevoerd,9.0,44.0
2011,Meetnetten.be,2.0,0.0
2011,ObsMapp,5.0,22.0
2011,SOVON autoclustering module,0.0,110.0
2011,Site,13.0,31.0
2011,Webobs html5,1.0,5.0
2011,copied observation,0.0,26.0
2011,mobile pages,11.0,20.0
2011,onbekend,1451.0,5698.0
2011,via wnpda,12.0,728.0


## Write result to parquet-file in "gold" folder

In [7]:
# Persist the merged table, including its "year" index, to the local
# gold folder using the pyarrow engine.
df_observations_yearly.to_parquet(
    './gold/yearly_sources.parquet',
    engine="pyarrow",
    index=True,
)