In [1]:
import importlib
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from bfm_finetune import plots
from bfm_finetune.utils import (
    aggregate_into_latlon_grid,
    get_lat_lon_ranges,
    unroll_matrix_into_df,
)

In [2]:
# Root folder of the fine-tuning data. The local checkout is the default; on Snellius use
#   Path("/projects/prjs1134/data/projects/biodt/storage/finetune")
finetune_location = Path("../data/finetune")
geolifeclef_location = finetune_location.joinpath("geolifeclef24")

# Presence-absence (PA) survey metadata, located in Europe. Dataset ReadMe:
# https://lab.plantnet.org/seafile/d/bdb829337aa44a9489f6/files/?p=%2FPresenceAbsenceSurveys%2FReadMe.txt
pa_path = geolifeclef_location.joinpath("GLC24_PA_metadata_train.csv")

In [None]:
# Load the presence-absence metadata table.
df = pd.read_csv(pa_path)

# A notebook cell only renders its LAST expression, so every intermediate summary is
# printed explicitly. Figures in trailing comments were noted from an earlier run.
print(len(df))  # 1483637
# ['lon', 'lat', 'year', 'geoUncertaintyInM', 'areaInM2', 'region', 'country', 'speciesId', 'surveyId']
print(df.columns)
print(len(df["speciesId"].unique()), "species")  # 5016
print(len(df["surveyId"].unique()), "surveys")  # 88987

# One survey: 15 rows, i.e. 15 different species recorded as present
# (the remaining species are presumably absent -- TODO confirm against the ReadMe).
print(len(df[df["surveyId"] == 212]), "rows for surveyId=212")

# One species: 924 rows indicating where it appears (no time filter applied).
# Kept as the final expression so the frame itself is rendered as the cell output.
df[df["speciesId"] == 6874.0]

In [4]:
# Grid resolution used for both the latitude and the longitude axis
# (presumably in degrees -- TODO confirm against get_lat_lon_ranges).
step = 0.25
# Axis ranges of the lat/lon grid at that resolution; both are passed to the
# gridding helpers in a later cell.
lat_range, lon_range = get_lat_lon_ranges(lat_step=step, lon_step=step)

In [None]:
# Keep only the rows of a single species, then count its records per survey year.
# Counts per year noted from an earlier run:
#   2017    177
#   2018    247
#   2019    128
#   2020    189
#   2021    183
df_selected = df.loc[df["speciesId"] == 6874.0]
df_selected.groupby("year")["surveyId"].count()

In [None]:
# Build one lat/lon grid per survey year for the selected species, then plot all
# years together.
years = sorted(df_selected["year"].unique().tolist())
all_dfs = []
for year in tqdm(years, desc="years"):
    # Aggregate this year's records onto the grid described by lat_range/lon_range.
    matrix = aggregate_into_latlon_grid(
        df_selected[df_selected["year"] == year],
        lat_range=lat_range,
        lon_range=lon_range,
        step=step,
    )
    # Flatten the grid back into a frame; its "lat", "lon" and "value" columns are
    # read below.
    df_grid = unroll_matrix_into_df(
        lat_range=lat_range, lon_range=lon_range, matrix=matrix
    )
    all_dfs.append(df_grid)

# Guard against a selection without any year (nothing to plot in that case).
if all_dfs:
    # Explicit copy: the column subset is extended with one column per year below.
    # Without .copy() pandas raises SettingWithCopyWarning, because final_df would
    # be a selection derived from all_dfs[0] rather than an independent frame.
    final_df = all_dfs[0][["lat", "lon"]].copy()
    for year, df_year in zip(years, all_dfs):
        # One value column per year, keyed by the year itself. Assignment aligns on
        # the index, which assumes every per-year frame shares the same index
        # (they are built from identical ranges) -- TODO confirm.
        final_df[year] = df_year["value"]

    fig = plots.plot_df_latlon(df=final_df, value_keys=years, title="speciesId=6874")
    fig.show()