# Creating CIM v2 documents from CMIP6 spreadsheet

## Setup

In [1]:
import glob
import inspect
import os
import uuid

import pyesdoc
import pyesdoc.ontologies.cim as cim
import xlrd

ES-DOC :: pyesdoc :: INFO > Loading pyesdoc config from: /Users/macg/dev/esdoc/ops/config/pyesdoc.conf


In [2]:
# Path to test folder.
_HOME = os.path.join(os.path.expanduser("~"), "esdoc-test")

# Path to workbook being converted to CIM v2 documents.
_WORKBOOK_FPATH = os.path.join(_HOME, "CMIP6Experiments.xlsx")

# Workbook being converted to CIM v2 documents.
_WORKBOOK = None

# Name of relevant worksheets to be found within workbook.
_WS_ENSEMBLE_REQUIREMENT = "EnsembleRequirement"
_WS_EXPERIMENT = "experiment"
_WS_FORCING_CONSTRAINT = "ForcingConstraint"
_WS_PARTY = "party"
_WS_REFERENCES = "references"
_WS_REQUIREMENT = "requirement"
_WS_TEMPORAL_CONSTRAINT = "TemporalConstraint"
_WS_URL = "url"

# Default document project code.
_DOC_PROJECT = 'CMIP6-TEST'

# Default document source.
_DOC_SOURCE = 'test-script'

# Default document author.
_DOC_AUTHOR = pyesdoc.create(cim.v2.Party,
                             source=_DOC_SOURCE,
                             uid='253825f3-fbc8-43fb-b1f6-cc575dc693eb')
_DOC_AUTHOR.email = u"charlotte.pascoe@stfc.ac.uk"
_DOC_AUTHOR.name = u"Charlotte Pascoe"

## Helper functions to extract data from workbook

In [3]:
def _get_workbook():
    """Returns pointer to workbook object.
    
    """
    global _WORKBOOK
    
    if _WORKBOOK is None:
        _WORKBOOK = xlrd.open_workbook(_WORKBOOK_FPATH)

    return _WORKBOOK


def _get_ws(ws_name):
    """Returns pointer to a named worksheet.
    
    """
    return _get_workbook().sheet_by_name(ws_name)


def _get_ws_rows(ws_name):
    """Returns collection of rows within a named worksheet.
    
    """
    return enumerate(_get_ws(ws_name).get_rows())


def _get_ws_data(ws_name):
    """Returns collection of rows within a named worksheet that correspond to actual data.
    
    """
    for idx, row in _get_ws_rows(ws_name):
        if idx > 0 and len(row[0].value) > 0:
            yield row

            
def _get_ws_col_map(ws_name):
    """Returns map of column index to column names - supports situation when user reorders columns.
    
    """
    for idx, row in _get_ws_rows(ws_name):
        if idx == 0:
            return {col.value: col_idx for col_idx, col in enumerate(row)} 


def _get_ws_document(row, col_map, doc_type, doc_mappings):
    """Returns a cim document from a spreadsheet row.
    
    """
    # Create document.
    doc = pyesdoc.create(doc_type,
                         project=_DOC_PROJECT,
                         source=_DOC_SOURCE,
                         author=_DOC_AUTHOR)
    
    # Apply attribute mappings.
    for mapping in doc_mappings:
        # Unpack mapping info.
        cell_value_convertor = None
        if isinstance(mapping, tuple):
            mapping, cell_value_convertor = mapping
        mapping = mapping.split(":")
        doc_attr = mapping[0]
        col_name = mapping[0] if len(mapping) == 1 else mapping[1]
        
        # Get cell value.
        cell_value = row[col_map[col_name]].value
        if cell_value_convertor:
            cell_value = cell_value_convertor(cell_value)
                    
        # Set document attribute.
        setattr(doc, doc_attr, cell_value)

    return doc


def _get_ws_documents(ws_name, doc_type, doc_mappings):
    """Returns set of cim documents within a spreadsheet."""
    result = list()
    col_map = _get_ws_col_map(ws_name)
    for row in _get_ws_data(ws_name):
        result.append(_get_ws_document(row, col_map, doc_type, doc_mappings))

    return result

## Declare cell value convertors

In [4]:
def _convert_to_bool(value):
    """Converts a cell value to a boolean."""
    return unicode(value).lower() in [u'true', u't', u'yes', u'y', u"1"]

def _convert_to_time_period(value):
    """Converts a cell value to a cim.v2.TimePeriod instance"""
    d = cim.v2.TimePeriod()
    d.length = int(value.split(" ")[0])
    d.units = value.split(" ")[1]
    d.date_type = u'unused'

    return d

## Set CIM v2 document collections

In [5]:
_urls = _get_ws_documents(_WS_URL, cim.v2.OnlineResource, [
    ("description"),
    ("name"),
    ("linkage"),
    ("protocol")
])

_citations = _get_ws_documents(_WS_REFERENCES, cim.v2.Citation, [
    ("abstract"),
    ("citation_str"),
    ("context"),
    ("doi"),
    ("title"),
    ("url")
])

_parties = _get_ws_documents(_WS_PARTY, cim.v2.Party, [
    ("address"),
    ("email"),
    ("name"),
    ("organisation", _convert_to_bool),
    ("url")
]) 


In [6]:
_temporal_constraints = _get_ws_documents(_WS_TEMPORAL_CONSTRAINT, cim.v2.TemporalConstraint, [
    ("canonical_name"),
    ("conformance_is_requested:conformance_requested", _convert_to_bool),
    ("duration:required_duration", _convert_to_time_period),
    ("name"),
    ("references"),
])

_forcing_constraints = _get_ws_documents(_WS_FORCING_CONSTRAINT, cim.v2.ForcingConstraint, [
    ("canonical_name"),
    ("conformance_is_requested:conformance_requested", _convert_to_bool),        
    ("forcing_type"),
    ("name"),
    ("references"),
])


## Set inter-document references

### Set urls

In [7]:
def _get_url(name):
    """Returns matching URL.
    
    """
    if name is None or len(name.strip()) == 0:
        return None
    for url in _urls:
        if url.name.lower() == name.lower():
            return url

In [8]:
# Set party urls.
for party in [p for p in _parties]:
    party.url = _get_url(party.url)

# # Set reference urls.
for citation in [c for c in _citations if c.url]:
    citation.url = _get_url(citation.url)

### Set citations

In [9]:
def _get_citation(name):
    """Returns matching citation.
    
    """
    if name is None or len(name.strip()) == 0:
        return None
    for citation in _citations:
        if citation.citation_str.lower() == name.lower():
            return citation

In [10]:
# Forcing constraint citations.
for fc in [i for i in _forcing_constraints if i.references and not isinstance(i.references, list)]:
    fc.references = [_get_citation(fc.references)]

# Temporal constraint citations.
for tc in [i for i in _temporal_constraints if i.references and not isinstance(i.references, list)]:
    tc.references = [_get_citation(tc.references)]


## Display document validation errors

In [11]:
def _validate(target):
    """Displays document validation errors."""
    # Invoke document factory functions.
    if inspect.isfunction(target):
        target = target()

    # Set document collection.
    try:
        iter(target)
    except TypeError:
        docs = [target]
    else:
        docs = target
    
    # Validate document collection.
    for doc in docs:
        for err_idx, err in enumerate(pyesdoc.validate(doc)):
            if err_idx == 0:
                print("Document errors: {} :: {} :: v{}".format(doc.type_key, doc.meta.id, doc.meta.version))
            print("\t{}".format(err))    

In [12]:
_validate(_urls)

In [13]:
_validate(_citations)

In [14]:
_validate(_parties)

Document errors: cim.2.shared.Party :: b4f62bd5-5a0f-484b-a5a3-814e83618242 :: v0
	doc.address --> is of invalid type (actual = <type 'str'>, expected=<type 'unicode'>)
	doc.email --> is of invalid type (actual = <type 'str'>, expected=<type 'unicode'>)


In [15]:
_validate(_forcing_constraints)

Document errors: cim.2.activity.ForcingConstraint :: 47d4a938-252c-44bb-8cd9-f5043a42b0df :: v0
	doc.category --> is null
	doc.code --> is null
	doc.references --> is of invalid type (actual = <type 'str'>, expected=<class 'pyesdoc.ontologies.cim.v2.typeset_for_shared_package.Citation'>)
Document errors: cim.2.activity.ForcingConstraint :: 0a5c9282-7308-4c36-b411-d975fd13ab7a :: v0
	doc.category --> is null
	doc.code --> is null
Document errors: cim.2.activity.ForcingConstraint :: 104822ec-04a2-4bbc-bbe3-542fbab0b9ab :: v0
	doc.category --> is null
	doc.code --> is null
Document errors: cim.2.activity.ForcingConstraint :: eb1aafc0-376c-45f7-8295-8320415b776e :: v0
	doc.category --> is null
	doc.code --> is null
Document errors: cim.2.activity.ForcingConstraint :: 5d15fafd-bbf3-4289-b57d-93a67d918644 :: v0
	doc.category --> is null
	doc.code --> is null
Document errors: cim.2.activity.ForcingConstraint :: f16742d3-f067-47d4-8ddd-86936b19d816 :: v0
	doc.category --> is null
	doc.code -->

In [16]:
_validate(_temporal_constraints)

Document errors: cim.2.activity.TemporalConstraint :: 16bda489-8605-462b-b307-a4c984c69644 :: v0
	doc.references --> is of invalid type (actual = <type 'str'>, expected=<class 'pyesdoc.ontologies.cim.v2.typeset_for_shared_package.Citation'>)
Document errors: cim.2.activity.TemporalConstraint :: 25f57eb2-d8f9-492e-8646-434605facb47 :: v0
	doc.references --> is of invalid type (actual = <type 'str'>, expected=<class 'pyesdoc.ontologies.cim.v2.typeset_for_shared_package.Citation'>)
Document errors: cim.2.activity.TemporalConstraint :: 95e29c9e-0e63-4d02-adc9-7f51a6f27876 :: v0
	doc.references --> is of invalid type (actual = <type 'str'>, expected=<class 'pyesdoc.ontologies.cim.v2.typeset_for_shared_package.Citation'>)
Document errors: cim.2.activity.TemporalConstraint :: 8ae0547c-9519-4f41-9598-1a857e59990a :: v0
	doc.references --> is of invalid type (actual = <type 'str'>, expected=<class 'pyesdoc.ontologies.cim.v2.typeset_for_shared_package.Citation'>)
Document errors: cim.2.activity.

## Save CIM documents to file system 

In [17]:
# Set I/O directory.
pyesdoc.set_option("output_dir", _HOME)

In [18]:
# Build collection of documents to be written to file system.
docs = sorted(_urls + _citations + _parties + _forcing_constraints + _temporal_constraints)

In [19]:
# Write document set to file system.
for doc in sorted(docs):
    print pyesdoc.write(doc)

AttributeError: 'Citation' object has no attribute 'meta'

In [None]:
# Read from file system
docs = map(pyesdoc.read, glob.glob(os.path.join(_HOME, "*.json")))
for doc in sorted(docs):
    print doc

In [None]:
# Clean up file system.
for fpath in glob.glob(os.path.join(_HOME, "*.json")):
    os.remove(fpath)