# RO-Crate validation

This notebook contains snippets from `autosubmit.provenance` code.

This code is reused here as a sandbox to toy with the code when there are
validation issues.

In [1]:
import json
import mimetypes
import os
import subprocess
from datetime import datetime
from pathlib import Path
from ruamel.yaml import YAML
from shutil import rmtree
from tempfile import TemporaryDirectory
from textwrap import dedent
from typing import Any, Callable, Optional, Union, cast

from rocrate.model.contextentity import ContextEntity  # type: ignore
from rocrate.rocrate import Dataset, File, ROCrate  # type: ignore
from rocrate.utils import iso_now  # type: ignore

from autosubmit.autosubmit import Autosubmit
from autosubmit.config.basicconfig import BasicConfig
from autosubmit.config.configcommon import AutosubmitConfig
from autosubmit.database.db_common import get_autosubmit_version, get_experiment_description
from autosubmit.job.job import Job
from autosubmit.job.job_common import Status
from autosubmit.log.log import Log, AutosubmitCritical
from autosubmit.statistics.statistics import Statistics

In [2]:
# Constants

PROFILES = [
    {
        "@id": "https://w3id.org/ro/wfrun/process/0.5",
        "@type": "CreativeWork",
        "name": "Process Run Crate",
        "version": "0.5"
    },
    {
        "@id": "https://w3id.org/ro/wfrun/workflow/0.5",
        "@type": "CreativeWork",
        "name": "Workflow Run Crate",
        "version": "0.5"
    },
    {
        "@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0",
        "@type": "CreativeWork",
        "name": "Workflow RO-Crate",
        "version": "1.0"
    }
]

PARAMETER_TYPES_MAP = {
    'str': 'Text',
    'int': 'Integer',
    'float': 'Float',
    'bool': 'Boolean',
    'dict': str,
    'list': str
}

DEFAULT_EXPORTED_KEYS = [
    'DEFAULT',
    'EXPERIMENT',
    'CONFIG',
    'PROJECT'
]


In [3]:
# Misc functions for AS data

def _get_action_status(jobs: list[Job]) -> str:
    """Get the status of the workflow action.

    :param jobs: list of jobs, used to infer the current workflow/action status.
    :return: a valid RO-Crate and Schema.org action status.
    """
    if not jobs:
        return 'PotentialActionStatus'
    if all([job.status == Status.COMPLETED for job in jobs]):
        return 'CompletedActionStatus'
    failed_statuses = [
        Status.FAILED
    ]
    if any([job.status in failed_statuses for job in jobs]):
        return 'FailedActionStatus'
    return 'PotentialActionStatus'

In [4]:
# RO-Crate functions

def _add_files(crate: ROCrate, base_path: Path, relative_path: str, expid: str,
               encoding_format: Optional[str] = None) -> None:
    """Add all files of a directory into the RO-Crate.

    Ignores existing crate archives.

    :param crate: the RO-Crate instance.
    :param base_path: the base path for the files being added.
    :param relative_path: the relative path (to the ``base_path``).
    :param expid: the experiment identifier, used to exclude previously created RO-Crate archives.
    :param encoding_format: the encoding format (if any).
    """
    folder = Path(base_path, relative_path)
    for root, dirs, files in os.walk(folder, topdown=True):
        for file in files:
            file_path = Path(root, file)
            if file.startswith(f'{expid}-crate') and file.endswith('.zip'):
                continue
            _add_file(crate, base_path, file_path, encoding_format)

def _add_directory(crate: ROCrate, base_path: Path, dir_path: Path) -> None:
    """Add a directory to the crate, recursively.

    Does not add the directory contents (i.e., subdirectories or files).
    """
    for parent in reversed(dir_path.parents):
        if str(parent) == '.':
            continue

        dir_id = f'{str(parent)}/'

        if dir_id not in [e.id for e in crate.data_entities]:
            dest_path = dir_path.parent.relative_to(base_path)
            crate.add_dataset(
                source=dir_id,
                dest_path=dest_path
            )

def _add_file(crate: ROCrate, base_path: Optional[Path], file_path: Path, encoding_format: Optional[str] = None,
              use_uri: bool = False, **args: Any) -> Any:
    """Add a file into the RO-Crate.

    :param crate: the RO-Crate instance.
    :param base_path: the base path for the files being added. Optional.
    :param file_path: the path for the file being added.
    :param encoding_format: the encoding format (if any).
    :param use_uri: whether to use the Path as a URI or as a source directly. Defaults to ``False``.
    :return: the object returned by ro-crate-py
    :rtype: Any
    """
    properties = {
        "name": file_path.name,
        "sdDatePublished": iso_now(),
        "dateModified": datetime.utcfromtimestamp(file_path.stat().st_mtime).replace(
            microsecond=0).isoformat(),
        "contentSize": file_path.stat().st_size,
        **args
    }
    encoding_format = encoding_format if encoding_format is not None else mimetypes.guess_type(file_path)[0]
    if encoding_format is not None:
        # N.B.: We must not write ``None``'s or other missing or empty values
        #       to the encoding format if none found.
        properties['encodingFormat'] = encoding_format

    source = file_path if not use_uri else file_path.as_uri()

    dest_path = None
    if base_path:
        dest_path = file_path.relative_to(base_path)
    file = File(crate=crate,
                source=source,
                dest_path=dest_path,
                fetch_remote=False,
                validate_url=False,
                properties=properties)
    # This is to prevent ``metadata/experiment_data.yml`` to be added twice.
    # Once as the workflow main file, and twice when scanning the experiment
    # ``conf`` folder for YAML files.
    # See: https://github.com/ResearchObject/ro-crate-py/issues/165
    if file.id not in [x['@id'] for x in crate.data_entities]:
        _add_directory(crate, base_path, file_path.parent)
        return crate.add_file(
            source=source,
            dest_path=dest_path,
            fetch_remote=False,
            validate_url=False,
            properties=properties
        )
    return None

In [5]:
# Basic information about the AS experiment;
# change it as needed to use a local experiment.

# Experiment
expid = 'a008'

# AS config
as_conf = AutosubmitConfig('a008')
as_conf.reload(force_load=True)

# Experiment data
workflow_configuration = as_conf.experiment_data
expid = workflow_configuration['DEFAULT']['EXPID']
as_version = get_autosubmit_version(expid)

experiment_path = Path(BasicConfig.LOCAL_ROOT_DIR, expid)
unified_yaml_configuration = experiment_path / "conf/metadata/experiment_data.yml"

# Some RO-Crate data
conforming_profiles = [
    {"@id": profile["@id"]} for profile in PROFILES
]

mimetypes.init()

# jobs
job_list = Autosubmit.load_job_list(expid, as_conf)
jobs = job_list.get_job_list()

# start-end
exp_stats = Statistics(jobs=jobs, start=None, end=None, queue_time_fix={})
exp_stats.calculate_statistics()
start_time = None
end_time = None
# N.B.: ``exp_stats.jobs_stat`` is sorted in reverse order.
number_of_jobs = len(exp_stats.jobs_stat)
if number_of_jobs > 0:
    start_time = exp_stats.jobs_stat[-1].start_time.replace(microsecond=0).isoformat()
if number_of_jobs > 1:
    end_time = exp_stats.jobs_stat[0].finish_time.replace(microsecond=0).isoformat()


Loading JobList
[32mLoad finished[0m[39m
Creating jobs...
Adding dependencies to the graph..
Adding dependencies to the job..
Transitive reduction...
Looking for edgeless jobs...


In [6]:
crate = ROCrate(gen_preview=True)
crate.name = expid
crate.conformsTo = conforming_profiles
# Fetch the experiment description from the main database
crate.description = get_experiment_description(expid)[0][0]
for profile in PROFILES:
    crate.add(ContextEntity(crate, properties=profile))

Log.info('Creating RO-Crate archive...')

# Create workflow configuration (prospective provenance)
main_entity = crate.add_workflow(
    source=unified_yaml_configuration,
    main=True,
    lang="Autosubmit",
    lang_version=as_version,
    gen_cwl=False
)

# Add original YAML configuration.
crate.add_dataset(source=Path(experiment_path, "conf"), dest_path="conf")

experiment_configuration_path = Path(experiment_path, "conf")
for config_entry in as_conf.current_loaded_files.keys():
    config_entry_path = Path(config_entry)
    if not config_entry_path.is_file():
        continue
    # We do not want to add the entries under <EXPID>/conf/ again.
    if experiment_configuration_path in config_entry_path.parents:
        continue

    base_path = experiment_path if experiment_path in config_entry_path.parents else None

    # Everything else is added as absolute URI, as it might be
    # a file like ``/etc/fstab``, or a private configuration from
    # the project.
    use_uri = base_path is None
    _add_file(crate, base_path, config_entry_path, encoding_format=None, use_uri=use_uri)
# Add log files.
_add_files(crate, experiment_path, BasicConfig.LOCAL_TMP_DIR, expid, "text/plain")
# Add plots files.
_add_files(crate, experiment_path, "plot", expid)
# Add status files.
_add_files(crate, experiment_path, "status", expid, "text/plain")
# Add SQLite DB and pickle files.
_add_files(crate, experiment_path, "pkl", expid, "application/octet-stream")

Creating RO-Crate archive...


In [7]:
# Inputs and Outputs

rocrate_json = workflow_configuration.get('ROCRATE', None)
if not rocrate_json:
    yaml = YAML()
    yaml_content = '''
ROCRATE:
  INPUTS:
    # Add the extra keys to be exported.
    - "MHM"
  OUTPUTS:
    # Relative to the Autosubmit project folder.
    - "*/*.gif"
  PATCH: |
    {
      "@graph": [
        {
          "@id": "./",
          "license": "Apache-2.0",
          "creator": {
            "@id": "https://orcid.org/0000-0001-8250-4074"
          }
        },
        {
          "@id": "https://orcid.org/0000-0001-8250-4074",
          "@type": "Person",
          "affiliation": {
              "@id": "https://ror.org/05sd8tv96"
          }
        },
      ]
    }
    '''
    rocrate_json = yaml.load(yaml_content)

print('RO-Crate JSON:')
print(rocrate_json)

RO-Crate JSON:
{'ROCRATE': {'INPUTS': ['MHM'], 'OUTPUTS': ['*/*.gif'], 'PATCH': '{\n  "@graph": [\n    {\n      "@id": "./",\n      "license": "Apache-2.0",\n      "creator": {\n        "@id": "https://orcid.org/0000-0001-8250-4074"\n      }\n    },\n    {\n      "@id": "https://orcid.org/0000-0001-8250-4074",\n      "@type": "Person",\n      "affiliation": {\n          "@id": "https://ror.org/05sd8tv96"\n      }\n    },\n  ]\n}\n'}}


In [8]:
create_action_properties = {
    "@type": "CreateAction",
    "actionStatus": {"@id": f"http://schema.org/{_get_action_status(jobs)}"},
    "description": crate.description
}
if start_time is not None:
    create_action_properties['startTime'] = start_time
if end_time is not None:
    create_action_properties['endTime'] = end_time
create_action = crate.add(
    ContextEntity(crate, '#create-action', create_action_properties)
)
crate.root_dataset.properties().update({
    'mentions': {'@id': create_action.id}
})

project_type = as_conf.experiment_data['PROJECT']['PROJECT_TYPE'].upper()
exported_keys = DEFAULT_EXPORTED_KEYS.copy()
if project_type == 'LOCAL':
    exported_keys.append('LOCAL')
elif project_type == 'GIT':
    exported_keys.append('GIT')

ins = []
outs = []
# TODO: Modify when we manage to have dicts/objects in YAML,
#       https://earth.bsc.es/gitlab/es/autosubmit/-/issues/1045
if 'INPUTS' in rocrate_json and rocrate_json['INPUTS']:
    ins.extend(rocrate_json['INPUTS'])
if 'OUTPUTS' in rocrate_json and rocrate_json['OUTPUTS']:
    outs.extend(rocrate_json['OUTPUTS'])
# Add the extra keys defined by the user in the ``ROCRATE.INPUT``.
if ins:
    exported_keys.extend(ins)

In [12]:
tmp_dir = TemporaryDirectory()
print(tmp_dir.name)

crate.write(tmp_dir.name)

# Prints some chars only -- we know if it is correct by looking at the
# RootDataset.
with open(Path(tmp_dir.name, 'ro-crate-metadata.json')) as f:
    print(f.read()[:1000])

rmtree(tmp_dir.name)

/tmp/tmpx__9op8c
{
    "@context": "https://w3id.org/ro/crate/1.1/context",
    "@graph": [
        {
            "@id": "ro-crate-preview.html",
            "@type": "CreativeWork",
            "about": {
                "@id": "./"
            }
        },
        {
            "@id": "./",
            "@type": "Dataset"
        },
        {
            "@id": "ro-crate-metadata.json",
            "@type": "CreativeWork",
            "about": {
                "@id": "./"
            },
            "conformsTo": [
                {
                    "@id": "https://w3id.org/ro/crate/1.1"
                },
                {
                    "@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"
                }
            ]
        },
        {
            "@id": "https://w3id.org/ro/wfrun/process/0.5",
            "@type": "CreativeWork",
            "name": "Process Run Crate",
            "version": "0.5"
        },
        {
            "@id": "https://w3id.org/ro/wfr