# Summarise MOTBX resources

Convert MOTBX resources (YAML files) to a CSV file.

In [158]:
from pathlib import Path
import os
import csv
from motbxtools import motbxschema
import json
import yaml
from collections import  defaultdict as ddict
import pprint
# pretty-printer for ad-hoc inspection of nested structures
pp = pprint.PrettyPrinter(indent=2)

# the relative paths below are derived from the current working directory,
# so warn when the notebook is started from somewhere else
CWD = Path.cwd()
if CWD.name != "notebooks":
    print("Make sure to run this notebook from the 'notebooks' directory.")

# home directory of this repository
MOTBX_DIR = CWD.parent

# MOTBX version information, read from its YAML file
with (MOTBX_DIR / "resources/MOTBX_version.yaml").open("r") as version_file:
    MOTBX_VERSION = yaml.safe_load(version_file)

# path to the directory where the resource YAML files are saved
RESOURCES_DIR = MOTBX_DIR / "resources/curated"

# path to the JSON schema file defining the structure of MOTBX resources
SCHEMA_JSON = MOTBX_DIR / "schema/motbxschema.json"

# path to the YAML file with the tag/keyword mapping
TAG_MAPPING = MOTBX_DIR / "tags_keywords.yaml"

In [159]:
# build the schema object from the JSON schema file
schema = motbxschema.MotbxSchema(SCHEMA_JSON)

# show the loaded schema as indented JSON
schema_text = json.dumps(schema.schema, indent=2)
print(schema_text)

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "MOTBX resource",
  "description": "Schema for resources of the EATRIS Multi-omics Toolbox (MOTBX)",
  "type": "object",
  "properties": {
    "resourceID": {
      "type": "string"
    },
    "resourceCategory": {
      "type": "string",
      "enum": [
        "Epigenomics",
        "External Quality Assessment",
        "Genomics",
        "Internal Quality Control",
        "Metabolomics",
        "Omics data management and analysis",
        "Proteomics",
        "Transcriptomics"
      ]
    },
    "resourceSubcategory": {
      "type": "string",
      "minLength": 4,
      "maxLength": 30
    },
    "resourceTitle": {
      "type": "string",
      "minLength": 15,
      "maxLength": 160
    },
    "resourceDescription": {
      "type": "string",
      "minLength": 50,
      "maxLength": 2500
    },
    "resourceUrl": {
      "type": "string",
      "format": "uri",
      "pattern": "^https://|.pdf$"
    },

In [160]:
# CSV column names are the property names declared in the schema
# (iterating a dict yields its keys in insertion order)
fieldnames = list(schema.schema["properties"])
print(fieldnames)

['resourceID', 'resourceCategory', 'resourceSubcategory', 'resourceTitle', 'resourceDescription', 'resourceUrl', 'resourceTags']


In [161]:
# read the tag/keyword hierarchy from its YAML file
with TAG_MAPPING.open("r") as mapping_file:
    tag_mapping = yaml.safe_load(mapping_file)

# last expression of the cell: display the loaded structure
tag_mapping

[{'translational research data': [{'omics': [{'synonyms': ['type of omics',
       'omics type',
       'experimental data']},
     'multi-omics (tag)',
     {'genomics (tag)': [{'synonyms': ['genome', 'genomics DNA', 'genotype']},
       'whole genome',
       {'exome': [{'synonyms': ['whole exome']}]},
       'copy number variation',
       {'somatic genome variations': [{'synonyms': ['somatic mutations',
           'somatic mutation']}]}]},
     {'epigenomics (tag)': [{'synonyms': ['epigenetic modifications',
         'epigenome']},
       {'CpG methylation': [{'synonyms': ['methylation', 'methylome']}]}]},
     {'transcriptomics (tag)': [{'synonyms': ['transcriptome',
         'gene expression',
         'RNA']},
       {'mRNA': [{'synonyms': ['messenger RNA']}]},
       {'miRNA': [{'synonyms': ['micro RNA', 'microRNA']}]}]},
     {'proteomics (tag)': [{'synonyms': ['protein levels',
         'proteins',
         'proteome']}]},
     {'metabolomics (tag)': [{'synonyms': ['metabolit

In [162]:
def unnest(nested, pre=None):
    """Flatten a nested dict/list structure into root-to-leaf paths.

    Every dict key met on the way down is appended to the path; lists only
    group siblings and add nothing to the path. Each leaf (any value that is
    neither a dict nor a list) ends one path.

    Args:
        nested: A dict, a list, or a leaf value. Dicts and lists may be
            nested inside each other to any depth.
        pre: Optional sequence of path elements to prepend to every yielded
            path. It is copied, never modified.

    Yields:
        list: One list per leaf, ``pre`` + the dict keys leading to the
        leaf + the leaf itself. Empty dicts and empty lists yield nothing.
    """
    prefix = list(pre) if pre else []
    if isinstance(nested, dict):
        for key, value in nested.items():
            # A key extends the path of everything found below it.
            yield from unnest(value, prefix + [key])
    elif isinstance(nested, list):
        for value in nested:
            # List items share the prefix of the list itself. The previous
            # implementation referenced an unbound ``key`` here when an item
            # was a plain value, which raised UnboundLocalError.
            yield from unnest(value, prefix)
    else:
        yield prefix + [nested]

# Walk every root-to-leaf path of the tag hierarchy and index it:
#   tags          - every path element whose name ends in "(tag)"
#   term2parents  - term -> all elements that precede it on any path
#   term2synonyms - term -> the synonyms listed directly under it
tags = set()
term2parents = ddict(set)
term2synonyms = ddict(set)
for path in unnest(tag_mapping):
    path_tags = {term for term in path if term.endswith("(tag)")}
    # report paths that run through more than one tag
    if len(path_tags) > 1:
        print(path_tags)
    tags.update(path_tags)
    if "synonyms" in path:
        # path shape is [..., term, "synonyms", synonym]
        term2synonyms[path[-3]].add(path[-1])
    for position, term in enumerate(path):
        # "synonyms" and the synonym after it are not terms of the hierarchy
        if term == "synonyms":
            break
        term2parents[term].update(path[:position])

# reverse lookup: synonym -> every term it is a synonym of
synonym2terms = ddict(set)
for term, synonyms in term2synonyms.items():
    for synonym in synonyms:
        synonym2terms[synonym].add(term)

# keywords of a term = its parents plus the synonyms of those parents
term2keywords = ddict(set)
for term, parents in term2parents.items():
    keywords = set(parents)
    for parent in parents:
        keywords |= term2synonyms[parent]
    term2keywords[term] |= keywords

# last expression of the cell: number of tags and a keyword count
# NOTE(review): the origin of the constant 62 is not documented in this
# notebook - confirm what it accounts for.
len(tags), len(term2keywords) + sum(len(synonyms) for synonyms in term2synonyms.values()) + 62

(41, 344)

In [163]:
# Walk the curated resources and split each resource's old tags into
# new-style tags and keywords, using the indices built from the tag mapping.
for root, dirs, files in os.walk(RESOURCES_DIR):
    for name in files:
        if not name.endswith(".yaml"):
            continue
        print(name)

        # load the resource; schema validation is deliberately skipped in
        # this first version
        resource = motbxschema.MotbxResource(os.path.join(root, name))
        resource_tags = set()
        resource_keywords = set()
        for old_tag in resource.resource["resourceTags"]:
            # NOTE(review): the lookups below are case-sensitive, while the
            # tag mapping contains mixed-case terms (e.g. 'mRNA', 'miRNA',
            # 'CpG methylation'); lower-casing here may prevent such terms
            # from ever matching - confirm the intended behaviour.
            term = old_tag.lower()
            tag_key = f'{term} (tag)'

            if tag_key in tags:
                # the old tag is also a new tag; its keywords come along
                resource_tags.add(term)
                resource_keywords.update(term2keywords[tag_key])
                continue

            if term in term2parents:
                # known term of the hierarchy that is not itself a tag
                resource_keywords.add(term)
                resource_keywords.update(term2keywords[term])
                continue

            if term not in synonym2terms:
                # unknown term: report it and keep it as a plain keyword
                print(term, "!")
                resource_keywords.add(term)
                continue

            # the old tag is a synonym of one or more known terms
            resource_keywords.add(term)
            for target in synonym2terms[term]:
                if target in tags:
                    # strip the trailing " (tag)" marker from the tag name
                    resource_tags.add(target[:-6])
                    resource_keywords.update(term2synonyms[target])
                else:
                    resource_keywords.add(target)

ID0001.yaml
ID0002.yaml
documentation !
ID0003.yaml
ID0004.yaml
miRNA-seq !
nucleic acid quantitation !
ID0005.yaml
nucleic acid quantitation !
ID0006.yaml
WGS !
ID0007.yaml
ID0008.yaml
reference dataset !
ID0009.yaml
methyl-seq kit !
ID0010.yaml
reference dataset !
ID0011.yaml
EMQN !
certification !
external quality assessment (EQA) !
quality !
ID0012.yaml
EQIPD !
certification !
external quality assessment (EQA) !
quality !
toolbox !
ID0013.yaml
ECCQ !
certification !
external quality assessment (EQA) !
quality !
ID0014.yaml
external quality assessment (EQA) !
proficiency testing programme !
ID0015.yaml
WGS !
workflow !
ID0016.yaml
workflow !
ID0017.yaml
SRiC !
publication !
ID0018.yaml
toolkit !
ID0019.yaml
publication !
ID0020.yaml
data processing !
library preparation !
publication !
ID0021.yaml
publication !
ID0022.yaml
ID0023.yaml
publication !
ID0024.yaml
EATRIS-Plus project !
ID0025.yaml
Hi-seq !
data quality !
publication !
ID0026.yaml
BWAmeth !
NFcore !
alignment !
bismark !