# Summarise MOTBX resources

Convert the MOTBX resources (YAML files) into a summary CSV file and write a changelog CSV that compares the latest version with the previous one.

In [34]:
from pathlib import Path
import os
import csv
from motbxtools import motbxschema
import yaml
from collections import defaultdict as ddict
import pprint
# pretty-printer kept available for ad-hoc inspection of rows while debugging
pp = pprint.PrettyPrinter(indent=2)

# every path below is derived from the current working directory, so the
# notebook has to be started from the 'notebooks' directory
CWD = Path.cwd()
if CWD.name != "notebooks":
    print("Make sure to run this notebook from the 'notebooks' directory.")

MOTBX_DIR = CWD.parent  # home directory of this repository

# version metadata; the 'latest' and 'previous' entries are used below.
# Decode explicitly as UTF-8 instead of relying on the platform default.
with open(MOTBX_DIR.joinpath("resources/MOTBX_version.yaml"), "r",
          encoding="utf-8") as f:
    MOTBX_VERSION = yaml.safe_load(f)

# path to the directory where the resource YAML files are saved
RESOURCES_DIR = MOTBX_DIR.joinpath("resources/curated")

# directory that receives the summary and changelog CSV files; created on
# demand without a separate existence check
SUMMARY_DIR = MOTBX_DIR.joinpath("resources/summary")
SUMMARY_DIR.mkdir(parents=True, exist_ok=True)

# summary of the latest version (written by this notebook)
SUMMARY_CSV_LATEST = SUMMARY_DIR.joinpath(
    f"MOTBX_{MOTBX_VERSION['latest']}.csv")
# summary of the first entry listed under 'previous' (read by this notebook)
SUMMARY_CSV_PREVIOUS = SUMMARY_DIR.joinpath(
    f"MOTBX_{MOTBX_VERSION['previous'][0]}.csv")
# changelog comparing the latest summary with the previous one
CHANGELOG_CSV = SUMMARY_DIR.joinpath(
    f"changelog_{MOTBX_VERSION['latest']}.csv")

# path to JSON SCHEMA file defining structure of MOTBX resources
SCHEMA_JSON = MOTBX_DIR.joinpath("schema/motbxschema.json")

# test fixtures and the CSV summary generated from them
TEST_RESOURCES_DIR = MOTBX_DIR.joinpath("tests/resources_pass")
TEST_SUMMARY_CSV = MOTBX_DIR.joinpath("tests/resources.csv")

In [35]:
# load the JSON schema from the file at SCHEMA_JSON; the resulting object is
# used below to derive the CSV columns and to validate the test resources
schema = motbxschema.MotbxSchema(SCHEMA_JSON)

# print schema formatted as JSON (debugging aid; note that `json` is not
# among the imports at the top of this notebook and must be imported first)
# print(json.dumps(schema.schema, indent = 2))

In [36]:
# CSV columns are the property names of the schema, kept in the order in
# which the schema lists them
fieldnames = [property_name for property_name in schema.schema["properties"]]
print(fieldnames)

['resourceID', 'resourceCategory', 'resourceSubcategory', 'resourceTitle', 'resourceDescription', 'resourceUrl', 'resourceTags', 'resourceKeywords']


Create a CSV summary of the test resources (YAML files).

In [37]:
# Write one CSV row per test resource to TEST_SUMMARY_CSV; validate() is
# called on every resource before its row is written.
with open(TEST_SUMMARY_CSV, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # os.walk yields directory entries in arbitrary order; sort them so the
    # row order of the generated CSV is the same on every platform
    for root, dirs, files in os.walk(TEST_RESOURCES_DIR):
        dirs.sort()  # in place, which also fixes the order subdirectories are visited in
        for name in sorted(files):
            if not name.endswith(".yaml"):
                continue
            print(name)

            # load test resource and validate
            resource = motbxschema.MotbxResource(os.path.join(root, name))
            resource.validate(schema)

            # write to CSV file, one column per schema property
            row = resource.flatten(fieldnames)
            writer.writerow(row)

test1.yaml


test2.yaml


In [38]:
# Load the summary CSV of the previous version into a dict that maps each
# resourceID to its row (a dict of column name -> string value).
#
# The file is opened the same way this notebook writes its summaries
# (UTF-8, newline=""), so that non-ASCII text and newlines embedded in quoted
# fields are read back unchanged; otherwise the changelog comparison further
# down could report differences that are not real.
# NOTE(review): assumes the previous summary was produced by this notebook
# and is therefore UTF-8 encoded -- confirm for hand-edited files.
previous_version = {}
with open(SUMMARY_CSV_PREVIOUS, "r", newline="", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # "\r" keeps the progress output on a single, overwritten line
        print(row["resourceID"], end="\r")
        previous_version[row["resourceID"]] = row

ID0081

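Create a CSV summary of the curated resources, together with a changelog that lists new resources and updated fields relative to the previous version.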

In [40]:
# Write the summary CSV of the curated resources and, alongside it, a
# changelog with one row per resource stating whether the resource is new and
# which fields differ from the previous version's summary.

# sentinel for a field that is absent from one of the two compared rows
_MISSING = object()

with (open(SUMMARY_CSV_LATEST, "w", newline="", encoding="utf-8")
      as csvfile,
      open(CHANGELOG_CSV, "w", newline="", encoding="utf-8")
      as changelogfile):
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    changelog = csv.DictWriter(changelogfile, fieldnames=[
        "resourceID", "New resource (yes/no)", "Updated field(s)"])
    changelog.writeheader()

    # os.walk yields directory entries in arbitrary order; sort them so the
    # row order of both generated files is the same on every platform
    for root, dirs, files in os.walk(RESOURCES_DIR):
        dirs.sort()  # in place, which also fixes the order subdirectories are visited in
        for name in sorted(files):
            if not name.endswith(".yaml"):
                continue
            # "\r" keeps the progress output on a single, overwritten line
            print(name, end="\r")

            # load curated resource (validation is intentionally skipped)
            resource = motbxschema.MotbxResource(os.path.join(root, name))
            #resource.validate(schema) don't validate in first version

            row = resource.flatten(fieldnames)

            # a resource without a row in the previous summary is new; it is
            # compared against an empty row, so all of its fields are listed
            row_old = previous_version.get(row["resourceID"])
            new_resource = row_old is None
            if new_resource:
                row_old = {}

            # Compare field by field in a fixed order (schema order first,
            # then any column that exists in only one of the two rows).
            # A set-based comparison would list the names in an order that
            # varies between runs and would fail on unhashable values.
            candidate_fields = dict.fromkeys([*fieldnames, *row, *row_old])
            changed_fields = [
                field for field in candidate_fields
                if row.get(field, _MISSING) != row_old.get(field, _MISSING)
            ]

            writer.writerow(row)
            changelog.writerow({
                "resourceID": row["resourceID"],
                "New resource (yes/no)": "yes" if new_resource else "no",
                "Updated field(s)": ", ".join(changed_fields)
            })


ID0081.yaml