# Change tags and add keywords

Keywords are used only for search functionality, whereas tags are visible to the user.

In [20]:
from pathlib import Path
import os
import csv
from motbxtools import motbxschema
import json
import yaml
from collections import  defaultdict as ddict
import pprint
pp = pprint.PrettyPrinter(indent = 2)

# All paths below are derived from the current working directory, so the
# notebook is expected to be started from the "notebooks" directory.
# Only a warning is printed otherwise; execution continues.
CWD = Path.cwd()
if CWD.name != "notebooks":
    print("Make sure to run this notebook from the 'notebooks' directory.")

# home directory of this repository (parent of the working directory)
MOTBX_DIR = CWD.parent

# parsed contents of the MOTBX version YAML file
with open(MOTBX_DIR / "resources" / "MOTBX_version.yaml") as f:
    MOTBX_VERSION = yaml.safe_load(f)

# directory where the curated resource YAML files are saved
RESOURCES_DIR = MOTBX_DIR / "resources" / "curated"

# JSON Schema file defining the structure of MOTBX resources
SCHEMA_JSON = MOTBX_DIR / "schema" / "motbxschema.json"

# YAML file holding the tag/keyword mapping
TAG_MAPPING = MOTBX_DIR / "schema" / "tags_keywords.yaml"

In [21]:
# build the schema object from the JSON Schema file
schema = motbxschema.MotbxSchema(SCHEMA_JSON)

# show the loaded schema as indented JSON
print(json.dumps(schema.schema, indent=2))

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "MOTBX resource",
  "description": "Schema for resources of the EATRIS Multi-omics Toolbox (MOTBX)",
  "type": "object",
  "properties": {
    "resourceID": {
      "type": "string"
    },
    "resourceCategory": {
      "type": "string",
      "enum": [
        "Epigenomics",
        "External Quality Assessment",
        "Genomics",
        "Internal Quality Control",
        "Metabolomics",
        "Omics data management and analysis",
        "Proteomics",
        "Transcriptomics"
      ]
    },
    "resourceSubcategory": {
      "type": "string",
      "minLength": 4,
      "maxLength": 30
    },
    "resourceTitle": {
      "type": "string",
      "minLength": 15,
      "maxLength": 160
    },
    "resourceDescription": {
      "type": "string",
      "minLength": 50,
      "maxLength": 2500
    },
    "resourceUrl": {
      "type": "string",
      "format": "uri",
      "pattern": "^https://|.pdf$"
    },

In [22]:
# CSV column names: the property names declared in the schema
fieldnames = [*schema.schema["properties"]]
print(fieldnames)

['resourceID', 'resourceCategory', 'resourceSubcategory', 'resourceTitle', 'resourceDescription', 'resourceUrl', 'resourceTags', 'resourceKeywords']


In [23]:
# parse the tag/keyword mapping from its YAML file
with open(TAG_MAPPING) as file:
    tag_mapping = yaml.safe_load(file)

# bare expression: shown as the cell output when run in a notebook
tag_mapping

[{'translational research data': [{'omics': [{'synonyms': ['type of omics',
       'omics type',
       'experimental data']},
     'multi-omics (tag)',
     {'genomics (tag)': [{'synonyms': ['genome', 'genomics DNA', 'genotype']},
       {'whole genome': [{'synonyms': ['WGS']}]},
       {'exome': [{'synonyms': ['whole exome', 'whole-exome sequencing']}]},
       'copy number variation',
       {'somatic genome variations': [{'synonyms': ['somatic mutations',
           'somatic mutation']}]}]},
     {'epigenomics (tag)': [{'synonyms': ['epigenetic modifications',
         'epigenome']},
       {'CpG methylation': [{'synonyms': ['methylation', 'methylome']}]}]},
     {'transcriptomics (tag)': [{'synonyms': ['transcriptome',
         'gene expression',
         'RNA']},
       {'mRNA-seq': [{'synonyms': ['mRNA',
           'mRNA sequencing',
           'mRNA Seq',
           'messenger RNA']}]},
       {'miRNA-seq': [{'synonyms': ['miRNA',
           'miRNA sequencing',
           'miRN

In [24]:
def unnest(nested, pre=None):
    """Flatten a nested dict/list structure into root-to-leaf paths.

    Dictionaries contribute their keys to the path; lists contribute
    nothing themselves, each of their items simply continues the current
    path. Any other value is treated as a leaf and terminates the path.

    Args:
        nested: A dict, a list, or a leaf value (anything else).
        pre: Optional sequence of path elements to prepend to every
            yielded path. It is copied, never modified.

    Yields:
        list: One list per leaf, made of ``pre``, the dict keys passed on
        the way down, and finally the leaf value itself. Empty dicts and
        empty lists yield nothing.
    """
    path = list(pre) if pre else []
    if isinstance(nested, dict):
        for key, value in nested.items():
            yield from unnest(value, path + [key])
    elif isinstance(nested, list):
        for item in nested:
            # A scalar item ends the path right here. The previous
            # implementation referenced an unbound ``key`` in this case
            # and raised UnboundLocalError.
            yield from unnest(item, path)
    else:
        yield path + [nested]

# Build the lookup tables from every root-to-leaf path of the mapping:
#   tags          - all terms carrying the "(tag)" suffix
#   term2parents  - term -> all ancestors on any path leading to it
#   term2synonyms - term -> synonyms listed under its "synonyms" entry
tags = set()
term2parents = ddict(set)
term2synonyms = ddict(set)
for path in unnest(tag_mapping):
    path_tags = {term for term in path if term.endswith("(tag)")}
    if len(path_tags) > 1:
        # several tagged terms on one path: print them for inspection
        print(path_tags)
    tags |= path_tags
    if "synonyms" in path:
        # assumes synonym paths end in [..., term, "synonyms", synonym]
        term2synonyms[path[-3]].add(path[-1])
    for idx, term in enumerate(path):
        if term == "synonyms":
            continue
        # "synonyms" is only a structural marker in the mapping; keep it
        # out of the ancestors so it cannot end up as a keyword.
        term2parents[term] |= set(path[:idx]) - {"synonyms"}

# reverse lookup: synonym -> terms it is a synonym of
synonym2terms = ddict(set)
for term, synonyms in term2synonyms.items():
    for synonym in synonyms:
        synonym2terms[synonym].add(term)

# term (without the " (tag)" suffix) -> ancestors plus the ancestors' synonyms
term2keywords = ddict(set)
for term, parents in term2parents.items():
    keywords = {parent.replace(" (tag)", "") for parent in parents}
    for parent in parents:
        # term2synonyms is keyed by the raw name (including " (tag)"), so
        # the lookup must happen before the suffix is stripped; .get avoids
        # inserting empty entries into the defaultdict.
        keywords |= term2synonyms.get(parent, set())
    term2keywords[term.replace(" (tag)", "")] |= keywords

# displayed as cell output: number of tags, number of terms plus synonyms
len(tags), len(term2keywords) + sum(len(syns) for syns in term2synonyms.values())

(43, 543)

In [25]:
# Re-derive resourceTags and resourceKeywords for every curated resource.
# Each old tag is classified against the mapping:
#   * it is a tag          -> kept as tag, the tag's synonyms become keywords
#   * it is a known term   -> becomes a keyword together with the keywords
#                             of the term; tagged ancestors become tags
#   * it is a synonym      -> becomes a keyword; tagged terms it belongs to
#                             become tags, other terms become keywords
#   * anything else        -> kept as a keyword
for root, dirs, files in os.walk(RESOURCES_DIR):
    for name in files:
        if not name.endswith(".yaml"):
            continue
        print(name)
        # load the resource from its YAML file
        resource = motbxschema.MotbxResource(os.path.join(root, name))
        resource_tags = set()
        resource_keywords = set()
        for old_tag in resource.resource["resourceTags"]:
            tagged = f"{old_tag} (tag)"
            if tagged in tags:
                # add old tag to new tags
                resource_tags.add(old_tag)
                # Add synonyms defined for this tag as keywords.
                # term2keywords has no keys ending in " (tag)", so the
                # previous term2keywords[tagged] lookup was always empty;
                # the synonyms live in term2synonyms under the tagged name.
                resource_keywords |= term2synonyms[tagged]
            elif old_tag in term2parents:
                # NOTE(review): synonym leaves are also keys of
                # term2parents, so they appear to be handled here rather
                # than in the synonym2terms branch below - confirm intent.
                resource_keywords.add(old_tag)
                resource_keywords |= term2keywords[old_tag]
                resource_tags.update(
                    parent.replace(" (tag)", "")
                    for parent in term2parents[old_tag]
                    if parent.endswith("(tag)")
                )
            elif old_tag in synonym2terms:
                resource_keywords.add(old_tag)
                for term in synonym2terms[old_tag]:
                    if term in tags:
                        # drop the trailing " (tag)" (6 characters)
                        resource_tags.add(term[:-6])
                        resource_keywords |= term2synonyms[term]
                    else:
                        resource_keywords.add(term)
            else:
                resource_keywords.add(old_tag)
        resource.resource["resourceTags"] = sorted(resource_tags)
        resource.resource["resourceKeywords"] = sorted(resource_keywords)
        try:
            resource.validate(schema)
        except Exception as error:
            # report the problem and carry on with the next resource
            print(error)
        # saving is disabled, so nothing is written back to disk
        #resource.save()
        #break

ID0001.yaml


ID0002.yaml
ID0003.yaml
ID0004.yaml
ID0005.yaml
ID0006.yaml
ID0007.yaml

ID0008.yaml
ID0009.yaml
ID0010.yaml
HTTPSConnectionPool(host='chinese-quartet.org', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1002)')))
ID0011.yaml
ID0012.yaml
ID0013.yaml
ID0014.yaml
ID0015.yaml

ID0016.yaml

ID0017.yaml
ID0018.yaml
ID0019.yaml
ID0020.yaml
ID0021.yaml

ID0022.yaml
ID0023.yaml
ID0024.yaml
ID0025.yaml
ID0026.yaml
ID0027.yaml
ID0028.yaml
ID0029.yaml

ID0030.yaml
ID0031.yaml

ID0032.yaml
ID0033.yaml
ID0034.yaml
ID0035.yaml
ID0036.yaml
ID0037.yaml
ID0038.yaml
ID0039.yaml
ID0040.yaml

ID0041.yaml
ID0042.yaml
ID0043.yaml

ID0044.yaml
ID0045.yaml

ID0046.yaml

ID0047.yaml

ID0048.yaml

ID0049.yaml
ID0050.yaml

ID0051.yaml

ID0052.yaml
ID0053.yaml
ID0054.yaml
ID0055.yaml
ID0056.yaml
ID0057.yaml
ID0058.yaml
ID0059.yaml
ID0060.yaml
ID0061.yaml
ID0062.y