1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
slurm-*.out
git-lfs/

### Python ###
# Byte-compiled / optimized / DLL files
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -7,6 +7,7 @@ repos:
- id: end-of-file-fixer
- id: check-yaml
- id: check-json
exclude_types: [jupyter]
- id: check-added-large-files

- repo: https://github.com/psf/black
39 changes: 37 additions & 2 deletions README.md
@@ -3,7 +3,8 @@


## Description
This repository contains the code used to engineer **BioCube: A Multimodal Dataset for Biodiversity**. The paper accompanying this repository can be found at: TBA
This repository contains the code used to engineer **BioCube: A Multimodal Dataset for Biodiversity Research**. The produced dataset can be found on [BioCube's Hugging Face page](https://huggingface.co/datasets/BioDT/BioCube), together with detailed descriptions of the modalities it contains.


This codebase offers the following core functionalities:
- Download
@@ -61,8 +62,10 @@ At this point, we can select any kind of modalities and slice them for specific

![Data Batch description](img/data_batch.png "Data Batch Description")

**Batches** can be created in two settings based on the sampling frequency (daily or monthly); both require that you have downloaded BioCube and set up the path variables appropriately.

To create Batches, just call the function:
### Daily
To create daily Batches, just call the function:

```python
create_dataset(
@@ -76,6 +79,24 @@ create_dataset(
)
```

### Monthly

To download BioCube and create monthly Batches, just run the script below:
```bash
bfm_data/dataset_creation/batch_creation/create_batches.sh
```
Or follow the step-by-step workflow:
```bash
# First run
python bfm_data/dataset_creation/batch_creation/scan_biocube.py --root biocube_data/data --out catalog_report.parquet
# Then run
python bfm_data/dataset_creation/batch_creation/build_batches_monthly.py
```

You can inspect the created Batches with `streamlit run batch_viewer.py -- --data_dir ./batches`; the `batch_viewer.py` script is located in the same folder as the previous scripts.

To produce statistics from the Batches that can be used for downstream tasks (e.g. normalization), just run `python batch_stats.py --batch_dir batches --out batches_stats.json`.
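
For illustration, here is a minimal sketch of how these statistics could drive a z-score normalization step; the JSON layout and the `ndvi` variable name below are assumptions, not the script's documented output:

```python
import json

import numpy as np

# Assumed layout of batches_stats.json: {"<variable>": {"mean": ..., "std": ...}};
# check the actual file produced by batch_stats.py before relying on this.
with open("batches_stats.json") as f:
    stats = json.load(f)

def normalize(values: np.ndarray, variable: str) -> np.ndarray:
    """Z-score normalize a batch variable using the precomputed statistics."""
    return (values - stats[variable]["mean"]) / stats[variable]["std"]

# Hypothetical variable name, used only for illustration
ndvi_norm = normalize(np.array([0.31, 0.42, 0.58]), "ndvi")
```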

## Storage

The `Data` folder contains raw data.
@@ -99,6 +120,20 @@ This publication is part of the project Biodiversity Foundation Model of the res
This work used the Dutch national e-infrastructure with the support of the SURF Cooperative using grant no. EINF-10148*


## Citation

If you find our work useful, please consider citing us!

```
@article{stasinos2025biocube,
title={BioCube: A Multimodal Dataset for Biodiversity Research},
author={Stasinos, Stylianos and Mensio, Martino and Lazovik, Elena and Trantas, Athanasios},
journal={arXiv preprint arXiv:2505.11568},
year={2025}
}
```


## Useful commands

Copy files between clusters: cluster_1 = a cluster, cluster_2 = SURF Snellius
177 changes: 122 additions & 55 deletions bfm_data/data_ingestion/ingestion_scripts/copernicus_land.py
@@ -3,11 +3,13 @@
import csv
import os
from concurrent.futures import ThreadPoolExecutor

import geopandas as gpd
import netCDF4 as nc
import numpy as np
import pandas as pd
import requests
import reverse_geocode
from shapely.geometry import Point

from bfm_data.config import paths
from bfm_data.utils.geo import (
@@ -63,6 +65,10 @@ def load_region_bounding_boxes(self, region: str):
region (str): The region name (e.g., 'Europe', 'Latin America').
"""
_, iso_codes = get_countries_by_continent(region)

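# Make sure Cyprus is always part of the ISO codes for the region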
if 'CY' not in iso_codes:
iso_codes.append('CY')

self.country_rectangles = get_bounding_boxes_for_countries(iso_codes)

def filter_11th_day_files(self, file_urls: list) -> list:
@@ -140,61 +146,122 @@ def extract_ndvi_locations(self, nc_file_path: str):
lon_points = np.arange(-180, 180 + 0.25, 0.25)

if self.global_mode:
ndvi_data = {}

for lat_val in lat_points:
for lon_val in lon_points:
i = np.abs(lat - lat_val).argmin()
j = np.abs(lon - lon_val).argmin()
ndvi_value = ndvi[i, j]

if ndvi_value != 255 and ndvi_value > self.ndvi_threshold:
transformed_lon = lon_val if lon_val >= 0 else lon_val + 360
coord = (lat_val, transformed_lon)
country = reverse_geocode.get(coord)["country"]
if country not in ndvi_data:
ndvi_data[country] = []
ndvi_data[country].append(
(lat_val, transformed_lon, ndvi_value)
)
world_gdf = gpd.read_file("/projects/prjs1134/data/projects/biodt/storage/geoBoundaries/geoBoundaries CGAZ ADM0.geojson").set_crs("EPSG:4326")
world_gdf["shapeName"] = world_gdf["shapeName"].str.strip()

lat_points = np.arange(-90, 90.25, 0.1)
lon_points = np.arange(-180, 180.25, 0.1)

grid_points = [Point(lon, lat) for lat in lat_points for lon in lon_points]
grid_df = pd.DataFrame({
"Latitude": [pt.y for pt in grid_points],
"Longitude": [pt.x for pt in grid_points],
"geometry": grid_points
})
grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs="EPSG:4326")

joined = gpd.sjoin(grid_gdf, world_gdf[["geometry", "shapeName"]], predicate="within", how="inner")

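# Snap each 0.1° sample coordinate onto the 0.25° output grid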
def snap_to_grid(x, res=0.25):
return np.round(x / res) * res

ndvi_data = {country: [] for country in joined["shapeName"].unique()}
tmp_country_data = {country: [] for country in joined["shapeName"].unique()}

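# Sample the nearest NDVI cell for every land grid point; 255 is the fill value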
for _, row in joined.iterrows():
lat_val = row["Latitude"]
lon_val = row["Longitude"]
country = row["shapeName"]

i = np.abs(lat - lat_val).argmin()
j = np.abs(lon - lon_val).argmin()
ndvi_value = ndvi[i, j]

if ndvi_value != 255 and ndvi_value > self.ndvi_threshold:
lat_025 = snap_to_grid(lat_val, 0.25)
lon_025 = snap_to_grid(lon_val, 0.25)
lon_025 = lon_025 if lon_025 >= 0 else lon_025 + 360
tmp_country_data[country].append((lat_025, lon_025, ndvi_value))

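# Average NDVI values that snapped to the same 0.25° cell within each country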
for country, values in tmp_country_data.items():
if values:
df = pd.DataFrame(values, columns=["lat", "lon", "ndvi"])
df = df.groupby(["lat", "lon"], as_index=False)["ndvi"].mean()
ndvi_data[country] = list(df.itertuples(index=False, name=None))

return month_year, ndvi_data

else:
ndvi_data = {
get_country_name_from_iso(country): []
for country in self.country_rectangles.keys()
}
world_gdf = gpd.read_file("/projects/prjs1134/data/projects/biodt/storage/geoBoundaries/geoBoundaries CGAZ ADM0.geojson").set_crs("EPSG:4326")
world_gdf["shapeName"] = world_gdf["shapeName"].str.strip()

target_names = [get_country_name_from_iso(code).strip() for code in self.country_rectangles]
if "Cyprus" not in target_names:
target_names.append("Cyprus")

region_gdf = world_gdf[world_gdf["shapeName"].isin(target_names)].reset_index(drop=True)

if "Cyprus" not in region_gdf["shapeName"].values:
cyprus_row = world_gdf[world_gdf["shapeName"] == "Cyprus"]
if not cyprus_row.empty:
region_gdf = pd.concat([region_gdf, cyprus_row], ignore_index=True)

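# Explicitly add countries whose names may be missing from the region selection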
manual_countries = {
"Bosnia and Herzegovina": "Bosnia and Herzegovina",
"North Macedonia": "North Macedonia",
"Moldova": "Moldova"
}
for country in manual_countries:
if country not in region_gdf["shapeName"].values:
row = world_gdf[world_gdf["shapeName"] == country]
if not row.empty:
region_gdf = pd.concat([region_gdf, row], ignore_index=True)

lat_points = np.arange(32, 72, 0.1)
lon_points = np.arange(-25, 45, 0.1)

grid_points = [Point(lon, lat) for lat in lat_points for lon in lon_points]
grid_df = pd.DataFrame({
"Latitude": [pt.y for pt in grid_points],
"Longitude": [pt.x for pt in grid_points],
"geometry": grid_points
})
grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs="EPSG:4326")

joined = gpd.sjoin(grid_gdf, region_gdf[["geometry", "shapeName"]], predicate="within", how="inner")

matched_countries = sorted(joined["shapeName"].unique())
expected_countries = sorted(region_gdf["shapeName"].unique())

def snap_to_grid(x, res=0.25):
return np.round(x / res) * res

ndvi_data = {country: [] for country in expected_countries}
tmp_country_data = {country: [] for country in expected_countries}

for _, row in joined.iterrows():
lat_val = row["Latitude"]
lon_val = row["Longitude"]
country = row["shapeName"]

i = np.abs(lat - lat_val).argmin()
j = np.abs(lon - lon_val).argmin()
ndvi_value = ndvi[i, j]

if ndvi_value != 255 and ndvi_value > self.ndvi_threshold:
lat_025 = snap_to_grid(lat_val, 0.25)
lon_025 = snap_to_grid(lon_val, 0.25)
lon_025 = lon_025 if lon_025 >= 0 else lon_025 + 360
tmp_country_data[country].append((lat_025, lon_025, ndvi_value))

for country, values in tmp_country_data.items():
if values:
df = pd.DataFrame(values, columns=["lat", "lon", "ndvi"])
df = df.groupby(["lat", "lon"], as_index=False)["ndvi"].mean()
ndvi_data[country] = list(df.itertuples(index=False, name=None))

for country_iso, bbox in self.country_rectangles.items():
min_lon, min_lat, max_lon, max_lat = bbox

for lat_val in lat_points:
for lon_val in lon_points:
if min_lon > max_lon:
in_lon_range = lon_val >= min_lon or lon_val <= max_lon
else:
in_lon_range = min_lon <= lon_val <= max_lon

if min_lat <= lat_val <= max_lat and in_lon_range:
i = np.abs(lat - lat_val).argmin()
j = np.abs(lon - lon_val).argmin()
ndvi_value = ndvi[i, j]

if (
ndvi_value != 255
and ndvi_value > self.ndvi_threshold
):
transformed_lon = (
lon_val if lon_val >= 0 else lon_val + 360
)
country_name = get_country_name_from_iso(
country_iso
)
ndvi_data[country_name].append(
(lat_val, transformed_lon, ndvi_value)
)

dataset.close()
return month_year, ndvi_data
dataset.close()
return month_year, ndvi_data

except Exception as e:
print(f"Error processing the file {nc_file_path}: {e}")
@@ -313,12 +380,12 @@ def run_data_download(
land_dir = paths.LAND_DIR

if global_mode:
csv_file = f"{land_dir}/global_ndvi.csv"
csv_file = f"{land_dir}/global_ndvi_data.csv"
elif region:
region_cleaned = region.replace(" ", "_")
csv_file = f"{land_dir}/{region_cleaned}_ndvi_test.csv"
csv_file = f"{land_dir}/{region_cleaned}_ndvi_data.csv"
else:
csv_file = f"{land_dir}/default_ndvi.csv"
csv_file = f"{land_dir}/default_ndvi_data.csv"

downloader = CopernicusLandDownloader(
links_url=links_url,
2 changes: 1 addition & 1 deletion bfm_data/data_ingestion/ingestion_scripts/forest.py
@@ -16,7 +16,7 @@ def run_forest_data_processing(region: str = None, global_mode: bool = True):
data_file = paths.FOREST_LAND_FILE

if region:
output_csv = f"{data_dir}/{region}_forest_data_test.csv"
output_csv = f"{data_dir}/{region}_forest_data.csv"
else:
output_csv = f"{data_dir}/global_forest_data.csv"

26 changes: 26 additions & 0 deletions bfm_data/data_preprocessing/indicators/combine_agriculture.py
@@ -0,0 +1,26 @@
import pandas as pd
from bfm_data.config import paths

mode = "europe" # or "global"

input_files = {
"Agriculture": f"{mode.capitalize()}_agriculture_data.csv",
"Agriculture_Irrigated": f"{mode.capitalize()}_agriculture_irrigated_data.csv",
"Arable": f"{mode.capitalize()}_arable_data.csv",
"Cropland": f"{mode.capitalize()}_cropland_data.csv"
}

all_dataframes = []

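# Read each indicator CSV (resolved relative to the working directory) and tag its rows with the indicator name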
for variable_name, filepath in input_files.items():
df = pd.read_csv(filepath)
df.insert(0, "Variable", variable_name)
all_dataframes.append(df)

combined_df = pd.concat(all_dataframes, ignore_index=True)

output_filename = f"{mode.capitalize()}_combined_agriculture_data.csv"
output_path = paths.AGRICULTURE_DIR / output_filename
combined_df.to_csv(output_path, index=False)

print(f"Saved to '{output_path}'")
28 changes: 28 additions & 0 deletions bfm_data/data_preprocessing/indicators/combine_land.py
@@ -0,0 +1,28 @@
import pandas as pd
from bfm_data.config import paths

mode = "europe" # or "global"

land_file = f"{mode.capitalize()}_land_data.csv"
ndvi_file = f"{mode.capitalize()}_ndvi_monthly_un_025.csv"

land_df = pd.read_csv(land_file)
ndvi_df = pd.read_csv(ndvi_file)

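# Round coordinates to 4 decimals so both sources share identical merge keys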
for df in [land_df, ndvi_df]:
df["Latitude"] = df["Latitude"].apply(lambda x: round(float(x), 4))
df["Longitude"] = df["Longitude"].apply(lambda x: round(float(x), 4))

merged_df = pd.merge(
land_df,
ndvi_df,
on=["Country", "Latitude", "Longitude"],
how="outer", # Use 'inner' if only common coordinates are needed
suffixes=("", "_ndvi")
)

output_filename = f"{mode.capitalize()}_combined_land_data.csv"
output_path = paths.LAND_DIR / output_filename
merged_df.to_csv(output_path, index=False)

print(f"Merged file saved to '{output_path}'")