# Add first version of MOTBX content to this repository

This notebook must be run from the `notebooks` directory.

## Define file paths

Change file paths if necessary.

In [69]:
from pathlib import Path
import os
import pprint
pp = pprint.PrettyPrinter(indent=2, width=80, compact=True)


CWD = Path.cwd()
if CWD.name != "notebooks":
    print("Make sure to run this notebook from the 'notebooks' directory.")

MOTBX_DIR = CWD.parent # home directory of this repository
# path to Excel file containing legacy resource descriptions
MOTBX_LEGACY_XLSX = MOTBX_DIR.joinpath(
    "resources/legacy/MOTBX_resources_for_website - Copy 2023-07-28.xlsx")
# name of sheet in Excel file containing MOTBX resource descriptions
MOTBX_LEGACY_SHEET_NAME = "website_content_for_JB"
# path to directory where resources YAML file are to be saved
RESOURCES_DIR = MOTBX_DIR.joinpath("resources/curated")
# create new directory to save resource files if it doesn't exist yet
if not os.path.exists(RESOURCES_DIR):
    os.mkdir(RESOURCES_DIR)
# path file listing resources that fail validation with MOTBX resource schema
MOTBX_LEGACY_ISSUES = MOTBX_DIR.joinpath(
    "resources/legacy/NOTES_on_MOTBX_resources_for_website.csv")
# path to JSON SCHEMA file defining structure of MOTBX resources
SCHEMA_JSON = MOTBX_DIR.joinpath("schema/motbxschema.json")

## Read input file

Load sheet from Excel file and print information about colummns and field values.

In [70]:
import pandas as pd

content_df = pd.read_excel(
    MOTBX_LEGACY_XLSX, sheet_name=MOTBX_LEGACY_SHEET_NAME, header=0)
content_df.head()

Unnamed: 0,Category,Category_description,Sub-category,Resource_title,Resource_description,Link_to_resource,Tags,Doc_type,Format,OLD_Resource_description
0,Internal Quality Control,,Guidelines and best practices,ISO Guide 80:2014: Guidance for in-house prepa...,ISO Guide 80:2014 guidance for the in-house pr...,https://www.iso.org/standard/44313.html,"ISO standard, guidelines, quality control mate...",website/external,Website,Guidelines for the in-house preparation of qua...
1,Internal Quality Control,,Guidelines and best practices,ISO Guide 31:2015: Reference material - conten...,ISO Guide 31:2015 guideline assists reference ...,https://www.iso.org/standard/52468.html,"ISO standard, documentation, guidelines, refer...",Reference material,Website,"Contents of certificates, labels and accompany..."
2,Internal Quality Control,,Reference material,ERCC RNA Spike-In Mix: ThermoFisher Scientific...,Variation in RNA expression data can be attrib...,https://www.thermofisher.com/order/catalog/pro...,"reference material, RNA-seq, NGS, sequencing, ...",Reference material,Website,ERCC RNA Spike-In Mix used as RNA control for ...
3,Internal Quality Control,,Reference material,Universal Human Reference RNA: ThermoFisher Sc...,Universal Human Reference RNA is an RNA extrac...,https://www.thermofisher.com/order/catalog/pro...,"reference material, RNA-seq, NGS, sequencing, ...",Reference material,Website,Universal Human Reference RNA used as a positi...
4,Internal Quality Control,,Reference material,Total Human Brain Reference RNA: ThermoFisher ...,Human Brain Total RNA is a total RNA sample ex...,https://www.thermofisher.com/order/catalog/pro...,"reference material, RNA-seq, NGS, sequencing, ...",Reference material,Website,Total Human brain RNA used as positive control...


### Explore file content

In [71]:
# print column names
pp.pprint(list(content_df.columns))

[ 'Category', 'Category_description', 'Sub-category', 'Resource_title',
  'Resource_description', 'Link_to_resource', 'Tags', 'Doc_type', 'Format',
  'OLD_Resource_description']


In [72]:
# print category and corresponding subcategory names
set([(i["Category"], i["Sub-category"]) for idx, i in content_df.iterrows()])

{('Epigenomics', 'Guidelines and best practices'),
 ('Epigenomics', 'Protocols'),
 ('Epigenomics', 'Raw data processing'),
 ('Epigenomics', 'Technical and quality checks'),
 ('External Quality Assessment', 'Proficiency Testing programme'),
 ('External Quality Assessment', 'Quality certification'),
 ('Genomics', 'Guidelines and best practices'),
 ('Genomics', 'Raw data processing'),
 ('Internal Quality Control', 'Guidelines and best practices'),
 ('Internal Quality Control', 'Reference material'),
 ('Metabolomics', 'Clinical use cases'),
 ('Metabolomics', 'Guidelines and best practices'),
 ('Metabolomics', 'Protocols'),
 ('Metabolomics', 'Raw data processing'),
 ('Omics data management and analysis', 'FAIR omics data'),
 ('Omics data management and analysis', 'Multi-omics analysis'),
 ('Proteomics', 'Guidelines and best practices'),
 ('Proteomics', 'Protocols'),
 ('Proteomics', 'Raw data processing'),
 ('Proteomics', 'Reference material'),
 ('Proteomics', 'Samples processing'),
 ('Prote

In [73]:
from IPython.display import Markdown

# print length of category names, etc.
len_cat = set([len(i) for i in content_df["Category"]])
len_sub = set([len(i) for i in content_df["Sub-category"]])
len_title = set([len(i) for i in content_df["Resource_title"]])
len_desc = set([len(i) for i in content_df["Resource_description"]])

Markdown("""
Length of category names ranges from %i to %i characters.

Length of sub-category names ranges from %i to %i characters.

Length of titles ranges from %i to %i characters.

Length of resource descriptions ranges from %i to %i characters."""%(
    min(len_cat), max(len_cat), min(len_sub), max(len_sub),
    min(len_title), max(len_title), min(len_desc), max(len_desc)))


Length of category names ranges from 8 to 34 characters.

Length of sub-category names ranges from 4 to 29 characters.

Length of titles ranges from 16 to 158 characters.

Length of resource descriptions ranges from 59 to 2480 characters.

In [74]:
# print tags
pp.pprint(sorted(set([j.strip() for i in content_df["Tags"]
    for j in i.split(",")])))

[ 'BWAmeth', 'CpG methylation', 'DNAseq', 'EATRIS-Plus project', 'ECCQ',
  'EM-seq', 'EM-seq NGS', 'EMQN', 'EQIPD', 'FAIR principles', 'HR-MS analysis',
  'Hi-seq', 'ISO standard', 'MIQE', 'MZmine', 'NFcore', 'NGS', 'RNA', 'RNA-seq',
  'RT-PCR', 'RT-qPCR', 'Retrotranscription', 'SRiC', 'WGS', 'alignment',
  'analysis platform', 'analysis workflows', 'biofluids', 'bioinformatics',
  'biomarker', 'biomarkers', 'bismark', 'cDNA synthesis', 'certification',
  'clinical guidelines', 'clinical validation', 'data analysis',
  'data archives', 'data processing', 'data quality', 'data search',
  'data standards', 'documentation', 'epigenomics', 'epigneomics',
  'external quality assessment (EQA)', 'extraction', 'genomics',
  'good practices', 'guidelines', 'in-house', 'internal quality control (IQC)',
  'isolation', 'library preparation', 'lipidomics', 'mRNA', 'mapping',
  'mass spectrometry', 'metabolites', 'metabolomics', 'metadata',
  'metadata standard', 'metadata standards', 'methyl-seq', 

In [75]:
# Doc_type names
sorted(set([i for i in content_df["Doc_type"] if isinstance(i, str)]))

['Article',
 'Article/Guideline',
 'Certification',
 'Deliverable/report',
 'Guideline',
 'Reference material',
 'protocol/SOP',
 'source code',
 'source code and documentation',
 'template/Case study',
 'website/external']

In [76]:
# Format names
sorted(set([i for i in content_df["Format"] if isinstance(i, str)]))

['Git repository, Jupyter notebook, and ISA files',
 'GitHub repository',
 'PDF',
 'Website',
 'Word/Pdf file',
 'link']

## Create output

Create YAML files from data frame rows.

Load MOTBX resource schema (JSON SCHEMA file).

In [77]:
from motbxtools import motbxschema
import jsonschema
import yaml

schema = motbxschema.MotbxSchema(SCHEMA_JSON)

Create resources, validate them with schema, and write to YAML file.

In [82]:
resources_to_check = []
yaml_count = -1

for idx, row in content_df.iterrows():
    row = row.str.strip()
    resource_id = f"ID{idx+1:04n}"
    resource = {
        "resourceID": resource_id,
        "resourceCategory": row["Category"],
        "resourceSubcategory": row["Sub-category"],
        "resourceTitle": row["Resource_title"],
        "resourceDescription": row["Resource_description"],
        "resourceUrl": row["Link_to_resource"],#.replace("http://", "https://"),
        "resourceTags": sorted([i.strip() for i in row["Tags"].split(",")]),
    }
    if resource["resourceUrl"] != row["Link_to_resource"]:
        resources_to_check.append(
            (resource_id, row["Resource_title"],
            f"NOTE: Use https instead of http in {row['Link_to_resource']}"))
    # validate against JSON schema
    try:
        jsonschema.validate(resource, schema.schema)
    except Exception as e:
        # print("--------------------\n", resource_id, row["Resource_title"])
        # print(e)
        resources_to_check.append((resource_id, row["Resource_title"], str(e)))
        continue
    # save to file
    yaml_path = RESOURCES_DIR.joinpath(f"{resource_id}.yaml")
    with open(yaml_path, "w") as fp:
        yaml.dump(resource, fp)
        yaml_count += 1

print(idx, print(yaml_count))

80
80 None


In [83]:
import csv

# write warnings and errors about resources to file
out_file = MOTBX_LEGACY_ISSUES
with open(out_file, "w") as fp:
    writer = csv.writer(fp, delimiter='\t', quotechar='"')
    for row in resources_to_check:
        writer.writerow(["-"*80])
        writer.writerow(row)