1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
slurm-*.out
git-lfs/

### Python ###
# Byte-compiled / optimized / DLL files
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -7,6 +7,7 @@ repos:
- id: end-of-file-fixer
- id: check-yaml
- id: check-json
exclude_types: [jupyter]
- id: check-added-large-files

- repo: https://github.com/psf/black
39 changes: 37 additions & 2 deletions README.md
@@ -3,7 +3,8 @@


## Description
This repository contains the code used to engineer **BioCube: A Multimodal Dataset for Biodiversity**. The paper accompanying this repository can be found at: TBA
This repository contains the code used to engineer **BioCube: A Multimodal Dataset for Biodiversity Research**. The produced dataset can be found on [BioCube's Hugging Face page](https://huggingface.co/datasets/BioDT/BioCube), together with detailed descriptions of the modalities it contains.


This codebase offers the following core functionalities:
- Download
@@ -61,8 +62,10 @@ At this point, we can select any kind of modalities and slice them for specific

![Data Batch description](img/data_batch.png "Data Batch Description")

**Batches** can be created in two settings based on the sampling frequency (daily or monthly); both require that you have downloaded BioCube and set up the path variables appropriately.

To create Batches, just call the function:
### Daily
To create daily Batches, just call the function:

```python
create_dataset(
@@ -76,6 +79,24 @@ create_dataset(
)
```

### Monthly

To download BioCube and create monthly Batches, just run the script below:
```bash
bfm_data/dataset_creation/batch_creation/create_batches.sh
```
Or follow the step-by-step workflow:
```bash
# First run
python bfm_data/dataset_creation/batch_creation/scan_biocube.py --root biocube_data/data --out catalog_report.parquet
# Then run
python bfm_data/dataset_creation/batch_creation/build_batches_monthly.py
```

You can inspect the created Batches with `streamlit run batch_viewer.py -- --data_dir ./batches`; the `batch_viewer.py` script is located in the same folder as the previous scripts.

To produce statistics from the Batches that can be used for downstream tasks (e.g. normalization), just run `python batch_stats.py --batch_dir batches --out batches_stats.json`.
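
For illustration, here is a minimal sketch of how these statistics could drive a z-score normalization step; the JSON layout and the `ndvi` variable name below are assumptions, not the script's documented output:

```python
import json

import numpy as np

# Assumed layout of batches_stats.json: {"<variable>": {"mean": ..., "std": ...}};
# check the actual file produced by batch_stats.py before relying on this.
with open("batches_stats.json") as f:
    stats = json.load(f)

def normalize(values: np.ndarray, variable: str) -> np.ndarray:
    """Z-score normalize a batch variable using the precomputed statistics."""
    return (values - stats[variable]["mean"]) / stats[variable]["std"]

# Hypothetical variable name, used only for illustration
ndvi_norm = normalize(np.array([0.31, 0.42, 0.58]), "ndvi")
```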

## Storage

The `Data` folder contains raw data.
@@ -99,6 +120,20 @@ This publication is part of the project Biodiversity Foundation Model of the res
This work used the Dutch national e-infrastructure with the support of the SURF Cooperative using grant no. EINF-10148*


## Citation

If you find our work useful, please consider citing us!

```
@article{stasinos2025biocube,
title={BioCube: A Multimodal Dataset for Biodiversity Research},
author={Stasinos, Stylianos and Mensio, Martino and Lazovik, Elena and Trantas, Athanasios},
journal={arXiv preprint arXiv:2505.11568},
year={2025}
}
```


## Useful commands

Copy files between clusters: cluster_1 = a cluster, cluster_2 = SURF Snellius
177 changes: 122 additions & 55 deletions bfm_data/data_ingestion/ingestion_scripts/copernicus_land.py
@@ -3,11 +3,13 @@
import csv
import os
from concurrent.futures import ThreadPoolExecutor

import geopandas as gpd
import netCDF4 as nc
import numpy as np
import pandas as pd
import requests
import reverse_geocode
from shapely.geometry import Point

from bfm_data.config import paths
from bfm_data.utils.geo import (
@@ -63,6 +65,10 @@ def load_region_bounding_boxes(self, region: str):
region (str): The region name (e.g., 'Europe', 'Latin America').
"""
_, iso_codes = get_countries_by_continent(region)

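# Make sure Cyprus is always part of the ISO codes for the region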
if 'CY' not in iso_codes:
iso_codes.append('CY')

self.country_rectangles = get_bounding_boxes_for_countries(iso_codes)

def filter_11th_day_files(self, file_urls: list) -> list:
@@ -140,61 +146,122 @@ def extract_ndvi_locations(self, nc_file_path: str):
lon_points = np.arange(-180, 180 + 0.25, 0.25)

if self.global_mode:
ndvi_data = {}

for lat_val in lat_points:
for lon_val in lon_points:
i = np.abs(lat - lat_val).argmin()
j = np.abs(lon - lon_val).argmin()
ndvi_value = ndvi[i, j]

if ndvi_value != 255 and ndvi_value > self.ndvi_threshold:
transformed_lon = lon_val if lon_val >= 0 else lon_val + 360
coord = (lat_val, transformed_lon)
country = reverse_geocode.get(coord)["country"]
if country not in ndvi_data:
ndvi_data[country] = []
ndvi_data[country].append(
(lat_val, transformed_lon, ndvi_value)
)
world_gdf = gpd.read_file("/projects/prjs1134/data/projects/biodt/storage/geoBoundaries/geoBoundaries CGAZ ADM0.geojson").set_crs("EPSG:4326")
world_gdf["shapeName"] = world_gdf["shapeName"].str.strip()

lat_points = np.arange(-90, 90.25, 0.1)
lon_points = np.arange(-180, 180.25, 0.1)

grid_points = [Point(lon, lat) for lat in lat_points for lon in lon_points]
grid_df = pd.DataFrame({
"Latitude": [pt.y for pt in grid_points],
"Longitude": [pt.x for pt in grid_points],
"geometry": grid_points
})
grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs="EPSG:4326")

joined = gpd.sjoin(grid_gdf, world_gdf[["geometry", "shapeName"]], predicate="within", how="inner")

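# Snap each 0.1° sample coordinate onto the 0.25° output grid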
def snap_to_grid(x, res=0.25):
return np.round(x / res) * res

ndvi_data = {country: [] for country in joined["shapeName"].unique()}
tmp_country_data = {country: [] for country in joined["shapeName"].unique()}

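# Sample the nearest NDVI cell for every land grid point; 255 is the fill value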
for _, row in joined.iterrows():
lat_val = row["Latitude"]
lon_val = row["Longitude"]
country = row["shapeName"]

i = np.abs(lat - lat_val).argmin()
j = np.abs(lon - lon_val).argmin()
ndvi_value = ndvi[i, j]

if ndvi_value != 255 and ndvi_value > self.ndvi_threshold:
lat_025 = snap_to_grid(lat_val, 0.25)
lon_025 = snap_to_grid(lon_val, 0.25)
lon_025 = lon_025 if lon_025 >= 0 else lon_025 + 360
tmp_country_data[country].append((lat_025, lon_025, ndvi_value))

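# Average NDVI values that snapped to the same 0.25° cell within each country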
for country, values in tmp_country_data.items():
if values:
df = pd.DataFrame(values, columns=["lat", "lon", "ndvi"])
df = df.groupby(["lat", "lon"], as_index=False)["ndvi"].mean()
ndvi_data[country] = list(df.itertuples(index=False, name=None))

return month_year, ndvi_data

else:
ndvi_data = {
get_country_name_from_iso(country): []
for country in self.country_rectangles.keys()
}
world_gdf = gpd.read_file("/projects/prjs1134/data/projects/biodt/storage/geoBoundaries/geoBoundaries CGAZ ADM0.geojson").set_crs("EPSG:4326")
world_gdf["shapeName"] = world_gdf["shapeName"].str.strip()

target_names = [get_country_name_from_iso(code).strip() for code in self.country_rectangles]
if "Cyprus" not in target_names:
target_names.append("Cyprus")

region_gdf = world_gdf[world_gdf["shapeName"].isin(target_names)].reset_index(drop=True)

if "Cyprus" not in region_gdf["shapeName"].values:
cyprus_row = world_gdf[world_gdf["shapeName"] == "Cyprus"]
if not cyprus_row.empty:
region_gdf = pd.concat([region_gdf, cyprus_row], ignore_index=True)

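# Explicitly add countries whose names may be missing from the region selection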
manual_countries = {
"Bosnia and Herzegovina": "Bosnia and Herzegovina",
"North Macedonia": "North Macedonia",
"Moldova": "Moldova"
}
for country in manual_countries:
if country not in region_gdf["shapeName"].values:
row = world_gdf[world_gdf["shapeName"] == country]
if not row.empty:
region_gdf = pd.concat([region_gdf, row], ignore_index=True)

lat_points = np.arange(32, 72, 0.1)
lon_points = np.arange(-25, 45, 0.1)

grid_points = [Point(lon, lat) for lat in lat_points for lon in lon_points]
grid_df = pd.DataFrame({
"Latitude": [pt.y for pt in grid_points],
"Longitude": [pt.x for pt in grid_points],
"geometry": grid_points
})
grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs="EPSG:4326")

joined = gpd.sjoin(grid_gdf, region_gdf[["geometry", "shapeName"]], predicate="within", how="inner")

matched_countries = sorted(joined["shapeName"].unique())
expected_countries = sorted(region_gdf["shapeName"].unique())

def snap_to_grid(x, res=0.25):
return np.round(x / res) * res

ndvi_data = {country: [] for country in expected_countries}
tmp_country_data = {country: [] for country in expected_countries}

for _, row in joined.iterrows():
lat_val = row["Latitude"]
lon_val = row["Longitude"]
country = row["shapeName"]

i = np.abs(lat - lat_val).argmin()
j = np.abs(lon - lon_val).argmin()
ndvi_value = ndvi[i, j]

if ndvi_value != 255 and ndvi_value > self.ndvi_threshold:
lat_025 = snap_to_grid(lat_val, 0.25)
lon_025 = snap_to_grid(lon_val, 0.25)
lon_025 = lon_025 if lon_025 >= 0 else lon_025 + 360
tmp_country_data[country].append((lat_025, lon_025, ndvi_value))

for country, values in tmp_country_data.items():
if values:
df = pd.DataFrame(values, columns=["lat", "lon", "ndvi"])
df = df.groupby(["lat", "lon"], as_index=False)["ndvi"].mean()
ndvi_data[country] = list(df.itertuples(index=False, name=None))

for country_iso, bbox in self.country_rectangles.items():
min_lon, min_lat, max_lon, max_lat = bbox

for lat_val in lat_points:
for lon_val in lon_points:
if min_lon > max_lon:
in_lon_range = lon_val >= min_lon or lon_val <= max_lon
else:
in_lon_range = min_lon <= lon_val <= max_lon

if min_lat <= lat_val <= max_lat and in_lon_range:
i = np.abs(lat - lat_val).argmin()
j = np.abs(lon - lon_val).argmin()
ndvi_value = ndvi[i, j]

if (
ndvi_value != 255
and ndvi_value > self.ndvi_threshold
):
transformed_lon = (
lon_val if lon_val >= 0 else lon_val + 360
)
country_name = get_country_name_from_iso(
country_iso
)
ndvi_data[country_name].append(
(lat_val, transformed_lon, ndvi_value)
)

dataset.close()
return month_year, ndvi_data
dataset.close()
return month_year, ndvi_data

except Exception as e:
print(f"Error processing the file {nc_file_path}: {e}")
@@ -313,12 +380,12 @@ def run_data_download(
land_dir = paths.LAND_DIR

if global_mode:
csv_file = f"{land_dir}/global_ndvi.csv"
csv_file = f"{land_dir}/global_ndvi_data.csv"
elif region:
region_cleaned = region.replace(" ", "_")
csv_file = f"{land_dir}/{region_cleaned}_ndvi_test.csv"
csv_file = f"{land_dir}/{region_cleaned}_ndvi_data.csv"
else:
csv_file = f"{land_dir}/default_ndvi.csv"
csv_file = f"{land_dir}/default_ndvi_data.csv"

downloader = CopernicusLandDownloader(
links_url=links_url,
2 changes: 1 addition & 1 deletion bfm_data/data_ingestion/ingestion_scripts/forest.py
@@ -16,7 +16,7 @@ def run_forest_data_processing(region: str = None, global_mode: bool = True):
data_file = paths.FOREST_LAND_FILE

if region:
output_csv = f"{data_dir}/{region}_forest_data_test.csv"
output_csv = f"{data_dir}/{region}_forest_data.csv"
else:
output_csv = f"{data_dir}/global_forest_data.csv"

26 changes: 26 additions & 0 deletions bfm_data/data_preprocessing/indicators/combine_agriculture.py
@@ -0,0 +1,26 @@
import pandas as pd
from bfm_data.config import paths

mode = "europe" # or "global"

input_files = {
"Agriculture": f"{mode.capitalize()}_agriculture_data.csv",
"Agriculture_Irrigated": f"{mode.capitalize()}_agriculture_irrigated_data.csv",
"Arable": f"{mode.capitalize()}_arable_data.csv",
"Cropland": f"{mode.capitalize()}_cropland_data.csv"
}

all_dataframes = []

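# Read each indicator CSV (resolved relative to the working directory) and tag its rows with the indicator name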
for variable_name, filepath in input_files.items():
df = pd.read_csv(filepath)
df.insert(0, "Variable", variable_name)
all_dataframes.append(df)

combined_df = pd.concat(all_dataframes, ignore_index=True)

output_filename = f"{mode.capitalize()}_combined_agriculture_data.csv"
output_path = paths.AGRICULTURE_DIR / output_filename
combined_df.to_csv(output_path, index=False)

print(f"Saved to '{output_path}'")
28 changes: 28 additions & 0 deletions bfm_data/data_preprocessing/indicators/combine_land.py
@@ -0,0 +1,28 @@
import pandas as pd
from bfm_data.config import paths

mode = "europe" # or "global"

land_file = f"{mode.capitalize()}_land_data.csv"
ndvi_file = f"{mode.capitalize()}_ndvi_monthly_un_025.csv"

land_df = pd.read_csv(land_file)
ndvi_df = pd.read_csv(ndvi_file)

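# Round coordinates to 4 decimals so both sources share identical merge keys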
for df in [land_df, ndvi_df]:
df["Latitude"] = df["Latitude"].apply(lambda x: round(float(x), 4))
df["Longitude"] = df["Longitude"].apply(lambda x: round(float(x), 4))

merged_df = pd.merge(
land_df,
ndvi_df,
on=["Country", "Latitude", "Longitude"],
how="outer", # Use 'inner' if only common coordinates are needed
suffixes=("", "_ndvi")
)

output_filename = f"{mode.capitalize()}_combined_land_data.csv"
output_path = paths.LAND_DIR / output_filename
merged_df.to_csv(output_path, index=False)

print(f"Merged file saved to '{output_path}'")