# Summarise MOTBX resources

Convert MOTBX resources (YAML files) to a CSV file.

In [1]:
from pathlib import Path
import os
import csv
from motbxtools import motbxschema
import json
import yaml
import pprint
# pretty-printer used further down to display flattened resources
pp = pprint.PrettyPrinter(indent=2)

# every path below is derived from the current working directory, so the
# notebook has to be started from the 'notebooks' directory
CWD = Path.cwd()
if CWD.name != "notebooks":
    print("Make sure to run this notebook from the 'notebooks' directory.")

MOTBX_DIR = CWD.parent  # home directory of this repository

# version information; its 'latest' entry is used in the summary file name.
# The encoding is given explicitly so that reading does not depend on the
# locale default (the CSV files written below use UTF-8 as well).
with open(MOTBX_DIR.joinpath("resources/MOTBX_version.yaml"), "r", encoding="utf-8") as f:
    MOTBX_VERSION = yaml.safe_load(f)

# directory from which the curated resource YAML files are read
RESOURCES_DIR = MOTBX_DIR.joinpath("resources/curated")
# directory to which the summary CSV file is written
SUMMARY_DIR = MOTBX_DIR.joinpath("resources/summary")
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir
SUMMARY_DIR.mkdir(parents=True, exist_ok=True)
SUMMARY_CSV = SUMMARY_DIR.joinpath(f"MOTBX_{MOTBX_VERSION['latest']}.csv")

# path to JSON SCHEMA file defining structure of MOTBX resources
SCHEMA_JSON = MOTBX_DIR.joinpath("schema/motbxschema.json")

# test resources and the CSV summary generated from them
TEST_RESOURCES_DIR = MOTBX_DIR.joinpath("tests/resources_pass")
TEST_SUMMARY_CSV = MOTBX_DIR.joinpath("tests/resources.csv")

In [2]:
# build the schema object from the JSON schema file
schema = motbxschema.MotbxSchema(SCHEMA_JSON)

# show the schema as indented JSON
print(json.dumps(schema.schema, indent=2))

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "MOTBX resource",
  "description": "Schema for resources of the EATRIS Multi-omics Toolbox (MOTBX)",
  "type": "object",
  "properties": {
    "resourceID": {
      "type": "string"
    },
    "resourceCategory": {
      "type": "string",
      "enum": [
        "Epigenomics",
        "External Quality Assessment",
        "Genomics",
        "Internal Quality Control",
        "Metabolomics",
        "Omics data management and analysis",
        "Proteomics",
        "Transcriptomics"
      ]
    },
    "resourceSubcategory": {
      "type": "string",
      "minLength": 4,
      "maxLength": 30
    },
    "resourceTitle": {
      "type": "string",
      "minLength": 15,
      "maxLength": 160
    },
    "resourceDescription": {
      "type": "string",
      "minLength": 50,
      "maxLength": 2500
    },
    "resourceUrl": {
      "type": "string",
      "format": "uri",
      "pattern": "^https://|^http://|^ftp

In [3]:
# the names of the schema's top-level properties become the CSV column names
fieldnames = list(schema.schema["properties"])
print(fieldnames)

['resourceID', 'resourceCategory', 'resourceSubcategory', 'resourceTitle', 'resourceDescription', 'resourceUrl', 'resourceTags']


Create a CSV summary of the YAML files of the test resources.

In [4]:
# write one CSV row per test resource; the file is overwritten on every run
with open(TEST_SUMMARY_CSV, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # os.walk yields directory entries in arbitrary, file-system dependent
    # order; sorting makes the row order of the CSV file reproducible
    for root, dirs, files in os.walk(TEST_RESOURCES_DIR):
        dirs.sort()  # in-place sort fixes the order in which subdirectories are visited
        for name in sorted(files):
            if not name.endswith(".yaml"):
                continue
            print(name)

            # load test resource and validate it against the schema
            resource = motbxschema.MotbxResource(os.path.join(root, name))
            resource.validate(schema)

            # write to CSV file
            row = resource.flatten(fieldnames)
            pp.pprint(row)
            writer.writerow(row)

test1.yaml
{ 'resourceCategory': 'Internal Quality Control',
  'resourceDescription': 'ISO Guide 80:2014 guidance for the in-house '
                         'preparation of quality control materials (QCMs). ISO '
                         'Guide 80 outlines the characteristics and '
                         'preparation processes of reference materials for '
                         'quality control. It applies to stable materials used '
                         'locally and those transported without significant '
                         'property changes. Laboratory staff preparing '
                         'in-house quality control materials should follow ISO '
                         'Guides 34 and 35 for transportation-based supply '
                         'chains. The preparation of quality control materials '
                         'requires assessments for homogeneity, stability, and '
                         'limited characterization. It aims to demonstrate '
          

Create a CSV summary of the curated resources.

In [6]:
# write one CSV row per curated resource; the file is overwritten on every run
with open(SUMMARY_CSV, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # os.walk yields directory entries in arbitrary, file-system dependent
    # order; sorting makes the row order of the CSV file reproducible
    for root, dirs, files in os.walk(RESOURCES_DIR):
        dirs.sort()  # in-place sort fixes the order in which subdirectories are visited
        for name in sorted(files):
            if not name.endswith(".yaml"):
                continue
            print(name)

            # load curated resource
            resource = motbxschema.MotbxResource(os.path.join(root, name))
            # NOTE: validation against the schema (resource.validate(schema))
            # is deliberately skipped in this first version

            # write to CSV file
            row = resource.flatten(fieldnames)
            writer.writerow(row)  # TODO: fix 403 error

ID0001.yaml
ID0002.yaml
ID0003.yaml
ID0004.yaml
ID0005.yaml
ID0006.yaml
ID0007.yaml
ID0008.yaml
ID0009.yaml
ID0010.yaml
ID0011.yaml
ID0012.yaml
ID0013.yaml
ID0014.yaml
ID0015.yaml
ID0016.yaml
ID0017.yaml
ID0018.yaml
ID0019.yaml
ID0020.yaml
ID0021.yaml
ID0022.yaml
ID0023.yaml
ID0024.yaml
ID0025.yaml
ID0026.yaml
ID0027.yaml
ID0028.yaml
ID0029.yaml
ID0030.yaml
ID0031.yaml
ID0032.yaml
ID0033.yaml
ID0034.yaml
ID0035.yaml
ID0036.yaml
ID0037.yaml
ID0038.yaml
ID0039.yaml
ID0040.yaml
ID0041.yaml
ID0042.yaml
ID0043.yaml
ID0044.yaml
ID0045.yaml
ID0046.yaml
ID0047.yaml
ID0048.yaml
ID0049.yaml
ID0050.yaml
ID0051.yaml
ID0052.yaml
ID0053.yaml
ID0054.yaml
ID0055.yaml
ID0056.yaml
ID0057.yaml
ID0058.yaml
ID0059.yaml
ID0060.yaml
ID0061.yaml
ID0062.yaml
ID0063.yaml
ID0064.yaml
ID0065.yaml
ID0066.yaml
ID0067.yaml
ID0068.yaml
ID0069.yaml
ID0070.yaml
ID0071.yaml
ID0072.yaml
ID0073.yaml
ID0074.yaml
ID0075.yaml
ID0076.yaml
ID0077.yaml
ID0078.yaml
ID0079.yaml
ID0080.yaml
ID0081.yaml
