# Summarise MOTBX resources

Convert the MOTBX resources (YAML files) into a summary CSV file.

In [1]:
import os
import yaml
from pathlib import Path
from motbxtools import motbxschema
import pprint
pp = pprint.PrettyPrinter(indent=2)

# The relative paths below are resolved against the parent of the current
# working directory, so the notebook has to be started from 'notebooks'.
CWD = Path.cwd()
if CWD.name != "notebooks":
    print("Make sure to run this notebook from the 'notebooks' directory.")

MOTBX_DIR = CWD.parent  # home directory of this repository

# Load the version metadata; the 'latest' and 'previous' entries are used
# further down to build the output file names.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of falling back to the locale default.
with open(MOTBX_DIR.joinpath("resources/MOTBX_version.yaml"), "r",
          encoding="utf-8") as version_file:
    MOTBX_VERSION = yaml.safe_load(version_file)

# path to directory where the resource YAML files are saved
RESOURCES_DIR = MOTBX_DIR.joinpath("resources/curated")

# path to directory where summaries, changelog and validation report are saved
# mkdir(exist_ok=True) creates the directory only when it is missing, without
# the gap between a separate existence check and the creation call.
SUMMARY_DIR = MOTBX_DIR.joinpath("resources/summary")
SUMMARY_DIR.mkdir(exist_ok=True)

# summary CSV files, named after the latest MOTBX version
SUMMARY_CSV_LATEST = SUMMARY_DIR.joinpath(
    f"MOTBX_{MOTBX_VERSION['latest']}.csv")
SUMMARY_CSV_LATEST_EXCL_INVALID = SUMMARY_DIR.joinpath(
    f"MOTBX_{MOTBX_VERSION['latest']}_excl_invalid.csv")
# summary CSV file named after the first entry of the 'previous' versions
SUMMARY_CSV_PREVIOUS = SUMMARY_DIR.joinpath(
    f"MOTBX_{MOTBX_VERSION['previous'][0]}.csv")
CHANGELOG_CSV = SUMMARY_DIR.joinpath(
    f"changelog_{MOTBX_VERSION['latest']}.csv")
# path to validation report
# - file that contains info about failed validations of MOTBX resources
VALIDATION_REPORT = SUMMARY_DIR.joinpath(
    f"validation_report_{MOTBX_VERSION['latest']}.txt")

# path to JSON SCHEMA file defining structure of MOTBX resources
SCHEMA_JSON = MOTBX_DIR.joinpath("schema/motbxschema.json")

# test fixtures: resource directory and the summary CSV written for them
TEST_RESOURCES_DIR = MOTBX_DIR.joinpath("tests/resources_pass")
TEST_SUMMARY_CSV = MOTBX_DIR.joinpath("tests/resources.csv")

Create a summary CSV file of the test resource YAML files.

In [2]:
# Build a collection from the test resources and the JSON schema, then write
# its summary to the test CSV path.
# NOTE(review): validate=True presumably checks each resource against
# SCHEMA_JSON - confirm in motbxtools.motbxschema.
test_collection = motbxschema.MotbxCollection(
    TEST_RESOURCES_DIR,
    SCHEMA_JSON,
)
test_collection.summarise(
    TEST_SUMMARY_CSV,
    validate=True,
)

Loading MOTBX resources | test2.yaml

Create a summary of the curated resources.

In [2]:
# Collection built from the curated resources directory and the JSON schema;
# the summarise() and get_info() cells below all operate on this object.
motbx_collection = motbxschema.MotbxCollection(
    RESOURCES_DIR,
    SCHEMA_JSON,
)

In [None]:
# Full summary for the latest version: invalid resources are kept
# (exclude_invalid=False), and the changelog and validation report paths are
# passed along.
# NOTE(review): old_summary_csv_path presumably serves as the baseline for the
# changelog - confirm in motbxtools.motbxschema.
motbx_collection.summarise(
    SUMMARY_CSV_LATEST,
    validate=True,
    exclude_invalid=False,
    old_summary_csv_path=SUMMARY_CSV_PREVIOUS,
    changelog_path=CHANGELOG_CSV,
    validationlog_path=VALIDATION_REPORT,
)

In [3]:
# Second summary written to the "_excl_invalid" CSV path with
# exclude_invalid=True; no changelog or validation report path is passed here.
motbx_collection.summarise(
    SUMMARY_CSV_LATEST_EXCL_INVALID,
    validate=True,
    exclude_invalid=True,
    old_summary_csv_path=SUMMARY_CSV_PREVIOUS,
)

HTTPSConnectionPool(host='chinese-quartet.org', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1002)')))
Resource: ID0010.yaml
URL: https://chinese-quartet.org/#/dashboard
-------------------------------------------------------------------------------
Loading MOTBX resources | ID0024.yaml
Resource: ID0024.yaml
URL: SampleAnalysisProtocol/UU_MethSeq/MOTBX_InternalProtocol_MethylSeq_20230706/pdf
-------------------------------------------------------------------------------
Loading MOTBX resources | ID0032.yaml
Resource: ID0032.yaml
URL: SampleAnalysisProtocols/IRYCIS_qRT_PCR/MOTBX_InternalProtocol_PlasmaPreparation_20230630.pdf
-------------------------------------------------------------------------------
Loading MOTBX resources | ID0033.yaml
Resource: ID0033.yaml
URL: SampleAnalysisProtocolsIRYCIS_qRT_PCR/MOTBX_InternalProtocol_SerumP

In [6]:
# Print the tags and the category/subcategory combinations reported by the
# collection.
info = motbx_collection.get_info()
tags = sorted(info["resourceTags"])

TAGS_PER_LINE = 5  # number of tags printed on each output row

print(len(tags), "tags:")
for start in range(0, len(tags), TAGS_PER_LINE):
    # A slice that runs past the end of the list is simply truncated, so the
    # final, shorter row needs no special handling.
    print(", ".join(tags[start:start + TAGS_PER_LINE]))

print()
print("categories and subcategories:")
# Each entry is joined with tabs, so it is expected to be a sequence of
# strings (category first, subcategory second).
for category_pair in sorted(info[("resourceCategory", "resourceSubcategory")]):
    print("\t\t".join(category_pair))

32 tags:
DNA, EATRIS-Plus project, ISO standard, PCR, array
biological fluid, biomarker, data standard, database, epigenomics
genomics, guideline, internal quality control (IQC), mRNA, mass spectrometry
metabolomics, metadata standard, miRNA, minimum information standard, multi-omics
phenotypic data, preclinical study, proficiency testing, proteomics, protocol
reference data set, reference material, registry, scientific publication, sequencing
software application, transcriptomics

categories and subcategories:
Data analysis		Computing platforms
Data analysis		Software applications and workflows
Data management and stewardship		Data and metadata standards
Data management and stewardship		Databases and catalogues
Data management and stewardship		Guidelines and best practices
Epigenomics		Guidelines and best practices
Epigenomics		Laboratory protocols and methods
Genomics		Guidelines and best practices
Metabolomics		Guidelines and best practices
Metabolomics		Laboratory protocols and met