In [58]:

import pandas as pd
import json
from deltalake import write_deltalake, DeltaTable

In [59]:

# --- Step 1: Define cube metadata (one row per cube) ---
# Both prototype cubes share the same site code, footprint polygon and GBIF
# settings; those values are named once so each record below only spells out
# what is specific to it.
_site_polygon_wkt = "POLYGON((4.743004 50.68406, 4.743004 50.877911, 4.171371 50.877911, 4.171371 50.68406, 4.743004 50.68406))"


def _gbif_layer_settings():
    """Return a fresh dict of the GBIF layer settings used by both cubes.

    A new dict is built on every call so the cube records never share
    mutable state with each other.
    """
    return {
        "occurrences": True,
        "absences": False,
        "species_paths": "/directory/species_list.csv",
        "taxonomic": {"highest_rank": "kingdom", "lowest_rank": "species"},
        "selection_issues": {"hasCoordinate": True, "zeroCoordinate": True, "countryMismatch": True},
    }


cubes = [
    # Cube v1: GBIF only, spatially selected by polygon (no bbox).
    {
        "cube_id": "prototype_dev_cube_v1",
        "n2k_site_code": "BE1000001",
        "cube_dir": "out/prototype_cubing/",
        "cube_version": "v1",
        "workflow_version": "cubing_engine_v1",
        "spatial_method": "polygon",
        "bbox": None,
        "polygon_wkt": _site_polygon_wkt,
        "start_year": "2010-01",
        "end_year": "2025-09",
        "layers": "gbif",
        "layer_metadata": {
            "gbif": _gbif_layer_settings(),
        },
        # NOTE(review): the v1 cube points at "species_oi_v2.csv" while the v2
        # cube points at "species_oi.csv" -- confirm this is intended.
        "output_file": "species_oi_v2.csv",
        "provenance_remark": "GBIF cube using cubing_engine_v1",
    },
    # Cube v2: GBIF + CHELSA, spatially selected by bounding box.
    {
        "cube_id": "prototype_dev_cube_v2",
        "n2k_site_code": "BE1000001",
        "cube_dir": "out/prototype_cubing/",
        "cube_version": "v2",
        "workflow_version": "cubing_engine_v1",
        "spatial_method": "bbox",
        "bbox": "4.171371,50.68406,4.743004,50.877911",
        "polygon_wkt": _site_polygon_wkt,
        "start_year": "2010-01",
        "end_year": "2025-09",
        "layers": "gbif,chelsa",
        "layer_metadata": {
            "gbif": _gbif_layer_settings(),
            "chelsa": {
                "variables_included": ["tas", "tasmin", "tasmax"],
                "time_range": "1980-2020",
                "source_version": "V.2.1",
            },
        },
        "output_file": "species_oi.csv",
        "provenance_remark": "GBIF + CHELSA cube using cubing_engine_v1",
    },
]


# --- Step 2: Write to Delta Table ---
# The cube records are turned into a DataFrame and written with both `mode`
# and `schema_mode` set to "overwrite".
delta_path = "delta/bmd_cube_metadata"
write_deltalake(delta_path, pd.DataFrame(cubes), mode="overwrite", schema_mode="overwrite")

# Read the table back so the preview reflects what was actually stored
# rather than the in-memory records.
dt = DeltaTable(delta_path)
df = dt.to_pandas()
df.head()  # inspect first few rows


Unnamed: 0,cube_id,n2k_site_code,cube_dir,cube_version,workflow_version,spatial_method,bbox,polygon_wkt,start_year,end_year,layers,layer_metadata,output_file,provenance_remark
0,prototype_dev_cube_v1,BE1000001,out/prototype_cubing/,v1,cubing_engine_v1,polygon,,"POLYGON((4.743004 50.68406, 4.743004 50.877911...",2010-01,2025-09,gbif,"{'chelsa': None, 'gbif': {'absences': False, '...",species_oi_v2.csv,GBIF cube using cubing_engine_v1
1,prototype_dev_cube_v2,BE1000001,out/prototype_cubing/,v2,cubing_engine_v1,bbox,"4.171371,50.68406,4.743004,50.877911","POLYGON((4.743004 50.68406, 4.743004 50.877911...",2010-01,2025-09,"gbif,chelsa","{'chelsa': {'source_version': 'V.2.1', 'time_r...",species_oi.csv,GBIF + CHELSA cube using cubing_engine_v1


In [60]:
# Reload the cube metadata from the Delta table handle `dt`.
df = dt.to_pandas()

# --- Step 3: Prepare RO-Crate structure ---
# Root dataset entity of the crate; it starts with an empty "hasPart" list.
dataset_graph = {
    "@id": "./",
    "@type": "Dataset",
    "name": "BMD Data Cube Package",
    "hasPart": [],
}

# The graph holds the metadata-file descriptor first and the root dataset
# second, so `dataset_graph` is the very object at ro_crate["@graph"][1].
ro_crate = {
    "@context": "https://w3id.org/ro/crate/1.1/context",
    "@graph": [
        {
            "@id": "ro-crate-metadata.json",
            "@type": "CreativeWork",
            "about": {"@id": "./"},
        },
        dataset_graph,
    ],
}


In [61]:
def generate_ro_crate(cubes):
    """Build an RO-Crate metadata document (a JSON-LD style dict) for data cubes.

    The returned dict has an ``@context`` and an ``@graph`` containing, in order:
    the ``ro-crate-metadata.json`` descriptor, the root ``./`` dataset, one
    ``Place`` per distinct site code, and one ``Dataset`` per cube layer.
    Each cube is added as a ``Dataset`` entry inside the root dataset's
    ``hasPart`` list and references its layers by ``@id``.

    Parameters
    ----------
    cubes : iterable of dict
        One mapping per cube. Required keys: ``cube_id``, ``cube_version``,
        ``start_year``, ``end_year``, ``cube_dir``, ``output_file`` and
        ``workflow_version``. Optional keys:

        * ``n2k_site_code`` -- when missing or ``None`` the cube gets no
          ``spatialCoverage`` and no ``Place`` entity is created for it.
        * ``layer_metadata`` -- mapping of layer name to a mapping of settings.
          A missing/``None`` mapping, or a layer whose settings are ``None``
          (as seen for ``chelsa`` after a Delta table round-trip), is skipped.

    Returns
    -------
    dict
        The RO-Crate document.

    Raises
    ------
    KeyError
        If a cube lacks one of the required keys.
    """
    # The records are walked twice (sites, then cubes); materialise them so a
    # generator input is not silently exhausted after the first pass.
    cubes = list(cubes)

    ro_crate = {
        "@context": [
            "https://w3id.org/ro/crate/1.1/context",
            {"prov": "http://www.w3.org/ns/prov#"}
        ],
        "@graph": [
            {
                "@id": "ro-crate-metadata.json",
                "@type": "CreativeWork",
                "about": {"@id": "./"}
            },
            {
                "@id": "./",
                "@type": "Dataset",
                "name": "BMD Data Cube",
                "hasPart": []
            }
        ]
    }

    dataset_graph = ro_crate["@graph"][1]

    # Collect unique site codes in first-seen order. A plain set would emit the
    # Place entities in an order that can change between interpreter runs.
    site_codes = dict.fromkeys(
        c["n2k_site_code"] for c in cubes if c.get("n2k_site_code") is not None
    )

    for site_code in site_codes:
        ro_crate["@graph"].append({
            "@id": f"site:{site_code}",
            "@type": "Place",
            "identifier": site_code,
            "name": f"Natura2000 Site {site_code}"
        })

    # Add cubes
    # NOTE(review): cube entries are nested inside the root "hasPart" instead of
    # being top-level @graph items referenced by @id; confirm against the
    # RO-Crate spec's flattened JSON-LD requirement and what consumers expect
    # before changing the output shape.
    for cube in cubes:
        cube_entry = {
            "@id": cube["cube_id"],
            "@type": "Dataset",
            "name": cube["cube_id"],
            "version": cube["cube_version"],
        }

        # Only reference a Place that was actually created above.
        site_code = cube.get("n2k_site_code")
        if site_code is not None:
            cube_entry["spatialCoverage"] = {"@id": f"site:{site_code}"}

        cube_entry.update({
            "temporalCoverage": f"{cube['start_year']}/{cube['end_year']}",
            "distribution": {
                "@id": f"dist:{cube['cube_id']}",
                "@type": "DataDownload",
                "encodingFormat": "text/csv",
                "contentUrl": f"{cube['cube_dir']}{cube['output_file']}"
            },
            "prov:wasGeneratedBy": {
                "@id": f"workflow:{cube['workflow_version']}",
                "@type": "SoftwareApplication",
                "name": cube["workflow_version"]
            },
            "hasPart": []
        })

        # Add layer datasets
        layer_metadata = cube.get("layer_metadata") or {}
        for layer_name, meta in layer_metadata.items():
            if meta is None:
                # Layer slot exists in the schema but is not part of this cube.
                continue
            layer_id = f"layer:{cube['cube_id']}:{layer_name}"
            layer_entry = {
                "@id": layer_id,
                "@type": "Dataset",
                "name": layer_name,
                # Values are stringified so nested dicts/lists fit a flat
                # PropertyValue.
                "additionalProperty": [
                    {"@type": "PropertyValue", "name": k, "value": str(v)}
                    for k, v in meta.items()
                ]
            }
            ro_crate["@graph"].append(layer_entry)
            cube_entry["hasPart"].append({"@id": layer_id})

        dataset_graph["hasPart"].append(cube_entry)

    return ro_crate




In [62]:
# Build the RO-Crate document from the in-memory `cubes` records; as the last
# expression of the cell, the resulting dict is displayed.
generate_ro_crate(cubes)

{'@context': ['https://w3id.org/ro/crate/1.1/context',
  {'prov': 'http://www.w3.org/ns/prov#'}],
 '@graph': [{'@id': 'ro-crate-metadata.json',
   '@type': 'CreativeWork',
   'about': {'@id': './'}},
  {'@id': './',
   '@type': 'Dataset',
   'name': 'BMD Data Cube',
   'hasPart': [{'@id': 'prototype_dev_cube_v1',
     '@type': 'Dataset',
     'name': 'prototype_dev_cube_v1',
     'version': 'v1',
     'spatialCoverage': {'@id': 'site:BE1000001'},
     'temporalCoverage': '2010-01/2025-09',
     'distribution': {'@id': 'dist:prototype_dev_cube_v1',
      '@type': 'DataDownload',
      'encodingFormat': 'text/csv',
      'contentUrl': 'out/prototype_cubing/species_oi_v2.csv'},
     'prov:wasGeneratedBy': {'@id': 'workflow:cubing_engine_v1',
      '@type': 'SoftwareApplication',
      'name': 'cubing_engine_v1'},
     'hasPart': [{'@id': 'layer:prototype_dev_cube_v1:gbif'}]},
    {'@id': 'prototype_dev_cube_v2',
     '@type': 'Dataset',
     'name': 'prototype_dev_cube_v2',
     'version'