# Explore first version of MOTBX content

In [None]:
from pathlib import Path
import os
import pprint
# Pretty-printer used by the exploration cells further down.
pp = pprint.PrettyPrinter(indent=2, width=80, compact=True)


# All paths are derived from the current working directory: its parent is
# taken to be the repository root, so the notebook has to be started from the
# 'notebooks' directory for the paths to resolve correctly.
CWD = Path.cwd()
if CWD.name != "notebooks":
    print("Make sure to run this notebook from the 'notebooks' directory.")
MOTBX_DIR = CWD.parent
MOTBX_LEGACY_XLSX = MOTBX_DIR.joinpath(
    "resources/legacy/MOTBX_resources_for_website.xlsx")
RESOURCES_DIR = MOTBX_DIR.joinpath("resources/curated")
# Create the output directory together with any missing parents; exist_ok
# avoids the separate (race-prone) existence check.
RESOURCES_DIR.mkdir(parents=True, exist_ok=True)
SCHEMA_JSON = MOTBX_DIR.joinpath("schema/motbxschema.json")


Load sheet from Excel file and print information about columns and field values.

In [None]:
import pandas as pd

# Read the legacy worksheet into a data frame; the first row of the sheet
# supplies the column names.
content_df = pd.read_excel(
    io=MOTBX_LEGACY_XLSX,
    sheet_name="website_content_for_JB",
    header=0,
)
# Preview the first rows.
content_df.head()

In [None]:
# Names of all columns found in the sheet
content_df.columns.tolist()

In [None]:
# Distinct values of the "Category" column, sorted
sorted({category for category in content_df["Category"]})

In [None]:
# Sub-category names: print the sorted name lengths, then show the distinct
# names themselves
pp.pprint(sorted(map(len, content_df["Sub-category"])))
sorted({subcategory for subcategory in content_df["Sub-category"]})

In [None]:
# Character counts of all resource titles, shortest first
pp.pprint(sorted(len(title) for title in content_df["Resource_title"]))

In [None]:
# Character counts of all resource descriptions, shortest first
pp.pprint(sorted(map(len, content_df["Resource_description"])))

In [None]:
# Distinct tags over all rows; each "Tags" cell is split on commas and the
# individual tags are stripped of surrounding whitespace
pp.pprint(sorted({
    tag.strip()
    for cell in content_df["Tags"]
    for tag in cell.split(",")
}))

In [None]:
# Distinct document types; cells that are not strings are skipped
sorted({doc_type for doc_type in content_df["Doc_type"]
        if isinstance(doc_type, str)})

In [None]:
# Distinct resource formats; cells that are not strings are skipped
sorted({fmt for fmt in content_df["Format"] if isinstance(fmt, str)})

Create YAML files from data frame rows.

In [None]:
from motbxtools import motbxschema
import jsonschema
import yaml

# Build the project's MotbxSchema object from the path of the schema JSON file
# (presumably this loads the file -- see motbxtools). Its `schema` attribute is
# what each resource is validated against when the YAML files are created.
schema = motbxschema.MotbxSchema(SCHEMA_JSON)

In [None]:
# Turn every row of the legacy sheet into a resource record, validate it
# against the MOTBX JSON schema and write each valid record to
# RESOURCES_DIR/<resourceID>.yaml. Everything that needs manual follow-up is
# collected in `resources_to_check` as (resource ID, title, message) tuples.
resources_to_check = []
for idx, row in content_df.iterrows():
    # Remove surrounding whitespace from the string cells of this row.
    row = row.str.strip()
    # 1-based ID, zero-padded to four digits. The plain "d" presentation type
    # is used instead of the locale-aware "n" so that the ID can never pick up
    # locale-specific grouping separators.
    resource_id = f"ID{idx + 1:04d}"
    legacy_url = row["Link_to_resource"]
    # Upgrade the URL scheme only; an "http://" that appears later in the URL
    # (e.g. inside a query string) is left untouched.
    if legacy_url.startswith("http://"):
        resource_url = "https://" + legacy_url[len("http://"):]
    else:
        resource_url = legacy_url
    resource = {
        "resourceID": resource_id,
        "resourceCategory": row["Category"],
        "resourceSubcategory": row["Sub-category"],
        "resourceTitle": row["Resource_title"],
        "resourceDescription": row["Resource_description"],
        "resourceUrl": resource_url,
        "resourceTags": sorted(tag.strip() for tag in row["Tags"].split(",")),
    }
    if resource_url != legacy_url:
        # The link was rewritten, so flag it for a manual check.
        resources_to_check.append(
            (resource_id, row["Resource_title"],
             f"NOTE: Use https instead of http in {legacy_url}"))
    # Validate against the JSON schema. Only validation failures of the
    # resource itself are recorded; a broken schema or a programming error is
    # allowed to surface instead of being logged once per row.
    try:
        jsonschema.validate(resource, schema.schema)
    except jsonschema.ValidationError as err:
        resources_to_check.append(
            (resource_id, row["Resource_title"], str(err)))
        continue
    # Save the valid resource to its own YAML file.
    yaml_path = RESOURCES_DIR.joinpath(f"{resource_id}.yaml")
    with open(yaml_path, "w", encoding="utf-8") as fp:
        yaml.dump(resource, fp)

In [None]:
import csv

# Write the collected warnings and errors about resources to a tab-separated
# file; every entry is preceded by a separator row of 80 dashes.
out_file = MOTBX_DIR.joinpath(
    "resources/legacy/NOTES_on_MOTBX_resources_for_website.csv")
# newline="" hands line-ending control to the csv module, as its documentation
# requires; without it, newlines embedded in the messages and the row
# terminators get translated on some platforms. The encoding is fixed so the
# output does not depend on the locale.
with open(out_file, "w", newline="", encoding="utf-8") as fp:
    writer = csv.writer(fp, delimiter='\t', quotechar='"')
    for note in resources_to_check:
        writer.writerow(["-" * 80])
        writer.writerow(note)