In [19]:
# Script to build Markdown pages that provide term metadata for complex vocabularies
# Steve Baskauf 2020-06-28 CC0
# This script merges static Markdown header and footer documents with term information tables (in Markdown) generated from data in the rs.tdwg.org repo from the TDWG Github site

import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures
import pandas as pd

# -----------------
# Configuration section
# -----------------

# !!!! NOTE !!!!
# There is not currently an example of a complex vocabulary that has the column headers
# used in the sample files. This script was originally tested with the Audubon Core files,
# which have headers that differ from the samples; in this copy the active lines use the
# ABCD (abcd2) column headers instead. So throughout the code, there are groups of lines
# where the variants for the default (sample) headers and the Audubon Core headers are
# commented out and the ABCD variant is not. To build a page using files with different
# headers, you will need to change which line in each group is uncommented.

# This is the base URL for raw files from the branch of the repo that has been pushed to GitHub
#githubBaseUri = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/practice/'
# The active setting is a relative path rather than a URL. It is prepended to the paths of the
# term-lists, versions, and term metadata CSV files that are read below.
githubBaseUri = '../../'

# Static Markdown files whose contents are placed before and after the generated text.
headerFileName = 'termlist-header.md'
footerFileName = 'termlist-footer.md'
# NOTE(review): outFileName is assigned again in the final output cell, so the value set here
# is not the path that actually gets written. Confirm which of the two paths is intended.
outFileName = '../docs/abcd.md'

# This is a Python list of the database names of the term lists to be included in the document.
# Each name is matched against the "database" column of term-lists.csv and is also used to build
# the paths of the per-list CSV files.
#termLists = ['audubon', 'exif-for-ac', 'xmp-for-ac', 'dwc-for-ac', 'dc-for-ac', 'dcterms-for-ac']
termLists = ['abcd2']

# NOTE! There may be problems unless every term list is of the same vocabulary type since the number of columns will differ
# However, there probably aren't any circumstances where mixed types will be used to generate the same page.
# Types 2 and 3 add the controlled_value_string column; type 3 also adds skos_broader.
vocab_type = 1 # 1 is simple vocabulary, 2 is simple controlled vocabulary, 3 is c.v. with broader hierarchy

# Terms in large vocabularies like Darwin and Audubon Cores may be organized into categories using tdwgutility_organizedInClass
# If so, those categories can be used to group terms in the generated term list document.
# When True, the tdwgutility_organizedInClass column is read from the term metadata and used to filter terms per category.
organized_in_categories = True

# If organized in categories, the display_order list must contain the IRIs that are values of tdwgutility_organizedInClass
# If not organized into categories, the value is irrelevant. There just needs to be one item in the list.
# Category values in the order their sections are generated. Each value is compared for equality
# with the tdwgutility_organizedInClass value of every term to select the terms of that section.
# NOTE(review): 'ObsercationUnit' looks like a misspelling of 'ObservationUnit'. Check it against
# the values actually present in the data before changing it, since a value that matches no term
# produces an empty section.
display_order = ['DataSet', 'DataSet-Owner', 'DataSet-Legal', 'Unit', 'Unit-Owner', 'Unit-Legal', 'Unit-Contact', 'Identification', 'Identification-Agent', 'SpecimenUnit', 'SpecimenUnit-Owner', 'SpecimenUnit-Acquisition', 'SpecimenUnit-Preparation', 'SpecimenUnit-NomenclaturalTypeDesignation', 'ObsercationUnit', 'CultureCollectionUnit', 'MycologicalUnit', 'HerbariumUnit', 'BotanicalGardenUnit', 'PlantGeneticResourcesUnit', 'ZoologicalUnit', 'PalaeontologicalUnit', 'MultiMediaObject', 'MultiMediaObject-Legal', 'Gathering', 'Gathering-Agent', 'Gathering-SiteCoordinates', 'Gathering-MeasurementOrFact', 'Gathering-Multimedia', 'Gathering-Synecology', 'MeasurementOrFact', 'Sequence']
#display_order = ['http://rs.tdwg.org/abcd2/terms/DataSet', 'http://rs.tdwg.org/abcd2/terms/DataSet-Owner', 'http://rs.tdwg.org/abcd2/terms/DataSet-Legal', 'http://rs.tdwg.org/abcd2/terms/Unit', 'http://rs.tdwg.org/abcd2/terms/Unit-Owner', 'http://rs.tdwg.org/abcd2/terms/Unit-Legal', 'http://rs.tdwg.org/abcd2/terms/Unit-Contact', 'http://rs.tdwg.org/abcd2/terms/Identification', 'http://rs.tdwg.org/abcd2/terms/Identification-Agent', 'http://rs.tdwg.org/abcd2/terms/SpecimenUnit', 'http://rs.tdwg.org/abcd2/terms/SpecimenUnit-Owner', 'http://rs.tdwg.org/abcd2/terms/SpecimenUnit-Acquisition', 'http://rs.tdwg.org/abcd2/terms/SpecimenUnit-Preparation', 'http://rs.tdwg.org/abcd2/terms/SpecimenUnit-NomenclaturalTypeDesignation', 'http://rs.tdwg.org/abcd2/terms/ObsercationUnit', 'http://rs.tdwg.org/abcd2/terms/CultureCollectionUnit', 'http://rs.tdwg.org/abcd2/terms/MycologicalUnit', 'http://rs.tdwg.org/abcd2/terms/HerbariumUnit', 'http://rs.tdwg.org/abcd2/terms/BotanicalGardenUnit', 'http://rs.tdwg.org/abcd2/terms/PlantGeneticResourcesUnit', 'http://rs.tdwg.org/abcd2/terms/ZoologicalUnit', 'http://rs.tdwg.org/abcd2/terms/PalaeontologicalUnit', 'http://rs.tdwg.org/abcd2/terms/MultiMediaObject', 'http://rs.tdwg.org/abcd2/terms/MultiMediaObject-Legal', 'http://rs.tdwg.org/abcd2/terms/Gathering', 'http://rs.tdwg.org/abcd2/terms/Gathering-Agent', 'http://rs.tdwg.org/abcd2/terms/Gathering-SiteCoordinates', 'http://rs.tdwg.org/abcd2/terms/Gathering-MeasurementOrFact', 'http://rs.tdwg.org/abcd2/terms/Gathering-Multimedia', 'http://rs.tdwg.org/abcd2/terms/Gathering-Synecology', 'http://rs.tdwg.org/abcd2/terms/MeasurementOrFact', 'http://rs.tdwg.org/abcd2/terms/Sequence']
# Section labels shown in the generated page, one per entry of display_order and in the same order.
# These are human-readable headings only; they are not compared with any value in the data.
display_label = ['DataSet', 'Dataset - Owner', 'Dataset - Legal', 'Unit', 'Unit - Owner', 'Unit - Legal', 'Unit - Contact', 'Identification', 'Identification - Agent', 'Specimen Unit', 'Specimen Unit - Owner', 'Specimen Unit - Acquisition', 'Specimen Unit - Preparation', 'Specimen Unit - Nomenclatural Type Designation', 'Observation Unit', 'Culture Collection Unit', 'Mycological Unit', 'Herbarium Unit', 'Botanical Garden Unit', 'Plant Genetic Resources Unit', 'Zoological Unit', 'Palaeontological Unit', 'Multimedia Object', 'Multimedia Object - Legal', 'Gathering', 'Gathering - Agent', 'Gathering - Site Coordinates', 'Gathering - Measurement Or Fact', 'Gathering - Multimedia', 'Gathering-Synecology', 'Measurement Or Fact', 'Sequence']
# Comments about each category, inserted directly after the category's section heading in the
# vocabulary section. An empty string inserts nothing.
display_comments = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
# Fragment identifiers for the category sections, one per entry of display_order.
display_id = ['DataSet', 'DataSet-Owner', 'DataSet-Legal', 'Unit', 'Unit-Owner', 'Unit-Legal', 'Unit-Contact', 'Identification', 'Identification-Agent', 'SpecimenUnit', 'SpecimenUnit-Owner', 'SpecimenUnit-Acquisition', 'SpecimenUnit-Preparation', 'SpecimenUnit-NomenclaturalTypeDesignation', 'ObsercationUnit', 'CultureCollectionUnit', 'MycologicalUnit', 'HerbariumUnit', 'BotanicalGardenUnit', 'PlantGeneticResourcesUnit', 'ZoologicalUnit', 'PalaeontologicalUnit', 'MultiMediaObject', 'MultiMediaObject-Legal', 'Gathering', 'Gathering-Agent', 'Gathering-SiteCoordinates', 'Gathering-MeasurementOrFact', 'Gathering-Multimedia', 'Gathering-Synecology', 'MeasurementOrFact', 'Sequence']

#display_order = ['']
#display_label = ['Vocabulary'] # these are the section labels for the categories in the page
#display_comments = [''] # these are the comments about the category to be appended following the section labels
#display_id = ['Vocabulary'] # these are the fragment identifiers for the associated sections for the categories


# ---------------
# Function definitions
# ---------------

# replace URL with link
#
def createLinks(text):
    """Wrap every http(s) URL found in ``text`` in an HTML anchor element.

    A URL runs from ``http://`` or ``https://`` up to, but not including, the next
    whitespace character, comma, semicolon, closing parenthesis, double quote, or ``<``.
    When the matched text ends with a period, that period is treated as sentence
    punctuation: it is left out of the link and written after the closing tag.

    Returns the text with the URLs replaced; text without URLs is returned unchanged.
    """
    def repl(match):
        url = match.group(1)
        if url.endswith('.'):
            url = url[:-1]
            return '<a href="' + url + '">' + url + '</a>.'
        return '<a href="' + url + '">' + url + '</a>'

    # Raw string so that \s and \) reach the regex engine as written instead of being
    # treated as (invalid) string escape sequences.
    pattern = r'(https?://[^\s,;\)"<]*)'
    return re.sub(pattern, repl, text)

# 2021-08-05 Add code to convert backticks to code tags copied from the DwC QRG build script written by S. Van Hoey
def convert_code(text_with_backticks):
    """Replace each back-quoted span in the text with an HTML code element.

    The text between a pair of backticks becomes the content of
    <code>...</code>; the backticks themselves are dropped.
    """
    def _as_code_element(found):
        return '<code>' + found.group(1) + '</code>'

    backquoted_span = re.compile(r'`([^`]*)`')
    return backquoted_span.sub(_as_code_element, text_with_backticks)

def convert_link(text_with_urls):
    """Takes all links in a text field and converts it to the html tagged
    version of the link

    A link starts with http:// or https:// and continues over the characters
    allowed by the pattern below. The match is shortened as needed so that it
    does not end with a closing parenthesis, a period, or a comma.
    """
    def _handle_matched(inputstring):
        """quick hack version of url handling on the current prime versions data"""
        url = inputstring.group()
        return "<a href=\"{}\">{}</a>".format(url, url)

    # Raw string with the same pattern as before; the plain literal relied on invalid
    # escape sequences such as \w and \d being passed through unchanged.
    regx = r"(http[s]?://[\w\d:#@%/;$()~_?\+-;=\\.&]*)(?<![\)\.,])"
    return re.sub(regx, _handle_matched, text_with_urls)

# Hack the code taken from the Darwin Core terms.tmpl template to insert the HTML necessary to make the
# semicolon-separated lists of examples into an HTML list.
# {% set examples = term.examples.split("; ") %}
# {% if examples | length == 1 %}{{ examples | first }}{% else %}<ul class="list-group list-group-flush">{% for example in examples %}<li class="list-group-item">{{ example }}</li>{% endfor %}</ul>{% endif %}
def convert_examples(text_with_list_of_examples: str) -> str:
    """Turn a '; '-separated string of examples into an HTML unordered list.

    A string that contains no '; ' separator is returned as it is, without
    any list markup.
    """
    separate_examples = text_with_list_of_examples.split('; ')
    if len(separate_examples) == 1:
        return separate_examples[0]

    list_items = ''.join(
        '  <li class="list-group-item">' + one_example + '</li>\n'
        for one_example in separate_examples
    )
    return '<ul class="list-group list-group-flush">\n' + list_items + '</ul>'


In [8]:
# Build one dictionary per requested term list holding its database name, preferred
# namespace prefix, preferred namespace URI, and list IRI.
term_lists_info = []

frame = pd.read_csv(githubBaseUri + 'term-lists/term-lists.csv', na_filter=False)
for termList in termLists:
    term_list_dict = {'database': termList}
    # Copy the namespace details from the rows whose "database" value is this term list.
    # If several rows match, the values of the last one are kept. If none match, the
    # dictionary contains only the database name.
    for index, row in frame[frame['database'] == termList].iterrows():
        term_list_dict['pref_ns_prefix'] = row['vann_preferredNamespacePrefix']
        term_list_dict['pref_ns_uri'] = row['vann_preferredNamespaceUri']
        term_list_dict['list_iri'] = row['list']
    term_lists_info.append(term_list_dict)
print(term_lists_info)

[{'database': 'abcd2', 'pref_ns_prefix': 'abcd2', 'pref_ns_uri': 'http://rs.tdwg.org/abcd2/terms/', 'list_iri': 'http://rs.tdwg.org/abcd2/terms/'}]


In [28]:
# Create column list
# The names here must be in the same order as the values placed in row_list below.
column_list = ['pref_ns_prefix', 'pref_ns_uri', 'term_localName', 'label', 'definition', 'usage', 'examples', 'type', 'tdwgutility_required', 'tdwgutil_repeatable', 'tdwgutil_xpath', 'deprecated_xpath', 'parent', 'term_modified', 'term_deprecated']
#column_list = ['pref_ns_prefix', 'pref_ns_uri', 'term_localName', 'label', 'rdfs_comment', 'skos_scopeNote', 'dcterms_description', 'examples', 'term_modified', 'term_deprecated', 'rdf_type']
#column_list = ['pref_ns_prefix', 'pref_ns_uri', 'term_localName', 'label', 'definition', 'usage', 'notes', 'examples', 'term_modified', 'term_deprecated', 'type']
if vocab_type == 2:
    column_list += ['controlled_value_string']
elif vocab_type == 3:
    column_list += ['controlled_value_string', 'skos_broader']
if organized_in_categories:
    column_list.append('tdwgutility_organizedInClass')
column_list.append('version_iri')

print('Retrieving metadata about terms from all namespaces from GitHub')
# Create list of lists metadata table
table_list = []
for term_list in term_lists_info:
    # retrieve versions metadata for term list
    versions_url = githubBaseUri + term_list['database'] + '-versions/' + term_list['database'] + '-versions.csv'
    versions_df = pd.read_csv(versions_url, na_filter=False)

    # Map each term local name to the IRI of its recommended version. This is built once per
    # term list so the versions table is not rescanned for every term. When a local name has
    # several recommended rows, the last one wins.
    recommended_versions = versions_df[versions_df['version_status'] == 'recommended']
    version_by_local_name = dict(zip(recommended_versions['term_localName'], recommended_versions['version']))

    # retrieve current term metadata for term list
    data_url = githubBaseUri + term_list['database'] + '/' + term_list['database'] + '.csv'
    frame = pd.read_csv(data_url, na_filter=False)
    for index, row in frame.iterrows():
        row_list = [term_list['pref_ns_prefix'], term_list['pref_ns_uri'], row['term_localName'], row['label'], row['definition'], row['usage'], row['examples'], row['type'], row['tdwgutility_required'], row['tdwgutil_repeatable'], row['tdwgutil_xpath'], row['deprecated_xpath'], row['parent'], row['term_modified'], row['term_deprecated']]
        #row_list = [term_list['pref_ns_prefix'], term_list['pref_ns_uri'], row['term_localName'], row['label'], row['rdfs_comment'], row['skos_scopeNote'], row['dcterms_description'], row['examples'], row['term_modified'], row['term_deprecated'], row['rdf_type']]
        #row_list = [term_list['pref_ns_prefix'], term_list['pref_ns_uri'], row['term_localName'], row['label'], row['definition'], row['usage'], row['notes'], row['examples'], row['term_modified'], row['term_deprecated'], row['type']]
        if vocab_type == 2:
            row_list += [row['controlled_value_string']]
        elif vocab_type == 3:
            # The broader concept is stored as a CURIE built from the list's namespace prefix.
            if row['skos_broader'] == '':
                row_list += [row['controlled_value_string'], '']
            else:
                row_list += [row['controlled_value_string'], term_list['pref_ns_prefix'] + ':' + row['skos_broader']]
        if organized_in_categories:
            row_list.append(row['tdwgutility_organizedInClass'])

        # Borrowed terms really don't have implemented versions. They may be lacking values for version_status.
        # In their case, their version IRI will be omitted.
        version_iri = version_by_local_name.get(row['term_localName'], '')
        # NOTE: the current hack for non-TDWG terms without a version is to append # to the end of the term IRI
        # endswith() also copes with an empty version value, which indexing the last character would not.
        if version_iri.endswith('#'):
            version_iri = ''
        row_list.append(version_iri)

        table_list.append(row_list)

# Turn list of lists into dataframe
terms_df = pd.DataFrame(table_list, columns = column_list)

# Sorting by label uses the default string ordering, so it is case sensitive.
terms_sorted_by_label = terms_df.sort_values(by='label')
# Ordering by the lowercased local name makes this sort case insensitive
terms_sorted_by_localname = terms_df.iloc[terms_df.term_localName.str.lower().argsort()]
terms_sorted_by_label

Retrieving metadata about terms from all namespaces from GitHub


Unnamed: 0,pref_ns_prefix,pref_ns_uri,term_localName,label,definition,usage,examples,type,tdwgutility_required,tdwgutil_repeatable,tdwgutil_xpath,deprecated_xpath,parent,term_modified,term_deprecated,tdwgutility_organizedInClass,version_iri
917,abcd2,http://rs.tdwg.org/abcd2/terms/,Gathering-Project-Contact-Organisation-Abbrevi...,Abbreviation,Label abbreviation representing the organisati...,,,Property,False,False,/DataSets/DataSet/Units/Unit/Gathering/Project...,,Gathering-Project-Contact-Organisation-Represe...,,,Gathering,http://rs.tdwg.org/abcd2/terms/version/Gatheri...
365,abcd2,http://rs.tdwg.org/abcd2/terms/,Identifier-Organisation-Abbreviation,Abbreviation,Label abbreviation representing the organisati...,,,Property,False,False,/DataSets/DataSet/Units/Unit/Identifications/I...,,Identifier-Organisation-Representation,,,Identification-Agent,http://rs.tdwg.org/abcd2/terms/version/Identif...
413,abcd2,http://rs.tdwg.org/abcd2/terms/,SpecimenUnit-Owner-Organisation-Abbreviation,Abbreviation,Label abbreviation representing the organisati...,,,Property,False,False,/DataSets/DataSet/Units/Unit/SpecimenUnit/Owne...,,SpecimenUnit-Owner-Organisation-Representation,,,SpecimenUnit-Owner,http://rs.tdwg.org/abcd2/terms/version/Specime...
54,abcd2,http://rs.tdwg.org/abcd2/terms/,DataSet-Owner-Organisation-Abbreviation,Abbreviation,Label abbreviation representing the organisati...,,,Property,False,False,/DataSets/DataSet/Metadata/Owners/Owner/Organi...,,DataSet-Owner-Organisation-Representation,,,DataSet-Owner,http://rs.tdwg.org/abcd2/terms/version/DataSet...
1372,abcd2,http://rs.tdwg.org/abcd2/terms/,SequencingAgent-Organisation-Abbreviation,Abbreviation,Label abbreviation representing the organisati...,,,Property,False,False,/DataSets/DataSet/Units/Unit/Sequences/Sequenc...,,SequencingAgent-Organisation-Representation,,,Sequence,http://rs.tdwg.org/abcd2/terms/version/Sequenc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876,abcd2,http://rs.tdwg.org/abcd2/terms/,GatheringAgent-@sequence,sequence,A number indicating the sequential position of...,,2,Property,False,False,/DataSets/DataSet/Units/Unit/Gathering/Agents/...,,GatheringAgent,,,Gathering-Agent,http://rs.tdwg.org/abcd2/terms/version/Gatheri...
506,abcd2,http://rs.tdwg.org/abcd2/terms/,SpecimenUnit-Preparation-@sequence,sequence,A number indicating the sequential position of...,,,Property,False,False,/DataSets/DataSet/Units/Unit/SpecimenUnit/Prep...,,SpecimenUnit-Preparation,,,SpecimenUnit-Preparation,http://rs.tdwg.org/abcd2/terms/version/Specime...
559,abcd2,http://rs.tdwg.org/abcd2/terms/,SpecimenUnit-Preservation-@sequence,sequence,A number indicating the sequential position of...,,,Property,False,False,/DataSets/DataSet/Units/Unit/SpecimenUnit/Pres...,,SpecimenUnit-Preservation,,,SpecimenUnit,http://rs.tdwg.org/abcd2/terms/version/Specime...
967,abcd2,http://rs.tdwg.org/abcd2/terms/,Gathering-NamedArea-@sequence,sequence,A number indicating the sequence of several na...,,,Property,False,False,/DataSets/DataSet/Units/Unit/Gathering/NamedAr...,,Gathering-NamedArea,,,Gathering,http://rs.tdwg.org/abcd2/terms/version/Gatheri...


In [30]:
# Display the combined terms table, in the order in which the rows were read, for inspection.
terms_df

Unnamed: 0,pref_ns_prefix,pref_ns_uri,term_localName,label,definition,usage,examples,type,tdwgutility_required,tdwgutil_repeatable,tdwgutil_xpath,deprecated_xpath,parent,term_modified,term_deprecated,tdwgutility_organizedInClass,version_iri
0,abcd2,http://rs.tdwg.org/abcd2/terms/,DataSets,DataSets,The root element of the schema. A container el...,,,Class,True,False,/DataSets,,,,,DataSet,http://rs.tdwg.org/abcd2/terms/version/DataSet...
1,abcd2,http://rs.tdwg.org/abcd2/terms/,DataSet,DataSet,A container element for one to many unit data ...,,,Class,True,True,/DataSets/DataSet,,DataSets,,,DataSet,http://rs.tdwg.org/abcd2/terms/version/DataSet...
2,abcd2,http://rs.tdwg.org/abcd2/terms/,DatasetGUID,DatasetGUID,A globally unique identifier (GUID) for the en...,,,Property,False,False,/DataSets/DataSet/DatasetGUID,,DataSet,,,DataSet,http://rs.tdwg.org/abcd2/terms/version/Dataset...
3,abcd2,http://rs.tdwg.org/abcd2/terms/,DatasetID,DatasetID,Code or Identifier of the dataset (unique with...,,,Property,True,False,/DataSets/DataSet/DatasetID,,DataSet,,,DataSet,http://rs.tdwg.org/abcd2/terms/version/Dataset...
4,abcd2,http://rs.tdwg.org/abcd2/terms/,TechnicalContacts,TechnicalContacts,Container element for the technical contacts r...,,,Class,False,False,/DataSets/DataSet/TechnicalContacts,,DataSet,,,DataSet,http://rs.tdwg.org/abcd2/terms/version/Technic...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1413,abcd2,http://rs.tdwg.org/abcd2/terms/,AnnotationText-@language,language,The language of the text in the element carryi...,The value UND (undefined) should be used if th...,,Property,False,False,/DataSets/DataSet/Units/Unit/EAnnotations/Anno...,,AnnotationText,,,Unit,http://rs.tdwg.org/abcd2/terms/version/Annotat...
1414,abcd2,http://rs.tdwg.org/abcd2/terms/,Annotator,Annotator,The author of the electronic annotation text.,,,Property,False,False,/DataSets/DataSet/Units/Unit/EAnnotations/Anno...,,Annotation,,,Unit,http://rs.tdwg.org/abcd2/terms/version/Annotat...
1415,abcd2,http://rs.tdwg.org/abcd2/terms/,AnnotationDate,AnnotationDate,The ISO date of submission of the electronic a...,,,Property,False,False,/DataSets/DataSet/Units/Unit/EAnnotations/Anno...,,Annotation,,,Unit,http://rs.tdwg.org/abcd2/terms/version/Annotat...
1416,abcd2,http://rs.tdwg.org/abcd2/terms/,UnitExtension,UnitExtension,The extension is temporary and serves only to ...,,,Class,False,False,/DataSets/DataSet/Units/Unit/UnitExtension,,Unit,,,Unit,http://rs.tdwg.org/abcd2/terms/version/UnitExt...


Run the following cell to generate an index sorted alphabetically by lowercase term local name. Omit this index if the terms have opaque local names.

In [31]:
# Display the category of each term, in case-insensitive local name order, for inspection.
# This column is only added to the table when organized_in_categories is True.
terms_sorted_by_localname['tdwgutility_organizedInClass']

1348              Unit
1411              Unit
1415              Unit
1412              Unit
1413              Unit
             ...      
134               Unit
758     ZoologicalUnit
760     ZoologicalUnit
761     ZoologicalUnit
759     ZoologicalUnit
Name: tdwgutility_organizedInClass, Length: 1418, dtype: object

In [32]:
# generate the index of terms grouped by category and sorted alphabetically by lowercase term local name

# Build the Markdown index of term names: one bold heading per category followed by the
# terms of that category as links to their anchors, separated by " |" at the line ends.
text = '### 3.1 Index By Term Name\n\n'
text += '(See also [3.2 Index By Label](#32-index-by-label))\n\n'
for category, category_value in enumerate(display_order):
    text += '**' + display_label[category] + '**\n'
    text += '\n'
    if organized_in_categories:
        filtered_table = terms_sorted_by_localname[terms_sorted_by_localname['tdwgutility_organizedInClass'] == category_value]
    else:
        filtered_table = terms_sorted_by_localname

    # The row index is not used here, so the table is read as it is. This avoids resetting
    # the index in place on a filtered slice or on the shared sorted table.
    entries = []
    for row_index, row in filtered_table.iterrows():
        curie = row['pref_ns_prefix'] + ":" + row['term_localName']
        curie_anchor = curie.replace(':', '_')
        entries.append('[' + curie + '](#' + curie_anchor + ')')
    # Every entry except the last is followed by " |"; the last is followed by a space and a
    # blank line. A category without terms adds nothing after its heading.
    if entries:
        text += ' |\n'.join(entries) + ' \n\n'

index_by_name = text

print(index_by_name)

### 3.1 Index By Term Name

(See also [3.2 Index By Label](#32-index-by-label))

**DataSet**

[abcd2:ContentContact](#abcd2_ContentContact) |
[abcd2:ContentContact-@preferred](#abcd2_ContentContact-@preferred) |
[abcd2:ContentContact-Address](#abcd2_ContentContact-Address) |
[abcd2:ContentContact-Email](#abcd2_ContentContact-Email) |
[abcd2:ContentContact-Name](#abcd2_ContentContact-Name) |
[abcd2:ContentContact-Phone](#abcd2_ContentContact-Phone) |
[abcd2:ContentContacts](#abcd2_ContentContacts) |
[abcd2:DataSet](#abcd2_DataSet) |
[abcd2:DataSet-Contributors](#abcd2_DataSet-Contributors) |
[abcd2:DataSet-Coverage](#abcd2_DataSet-Coverage) |
[abcd2:DataSet-Creators](#abcd2_DataSet-Creators) |
[abcd2:DataSet-DateIssued](#abcd2_DataSet-DateIssued) |
[abcd2:DataSet-DateModified](#abcd2_DataSet-DateModified) |
[abcd2:DataSet-Description](#abcd2_DataSet-Description) |
[abcd2:DataSet-Description-Details](#abcd2_DataSet-Description-Details) |
[abcd2:DataSet-Description-URI](#abcd2_DataSet-Des

The following cell is an alternative to the previous one. It separates classes from property terms and lists the classes first (hacked from the Darwin Core List of Terms).

In [35]:
# Build the Markdown index of term names with all classes listed first, followed by the
# remaining terms grouped by category.
print('Generating term index by CURIE')
text = '### 3.1 Index By Term Name\n\n'
text += '(See also [3.2 Index By Label](#32-index-by-label))\n\n'

text += '**Classes**\n'
text += '\n'
class_entries = []
for row_index, row in terms_sorted_by_localname.iterrows():
#    if row['rdf_type'] == 'http://www.w3.org/2000/01/rdf-schema#Class':
    if row['type'] == 'Class':
        curie = row['pref_ns_prefix'] + ":" + row['term_localName']
        curie_anchor = curie.replace(':', '_')
        class_entries.append('[' + curie + '](#' + curie_anchor + ')')
# Every entry except the last is followed by " |"; the last is followed by a space and a
# blank line. Nothing is added when there are no classes.
if class_entries:
    text += ' |\n'.join(class_entries) + ' \n\n'

for category, category_value in enumerate(display_order):
    text += '**' + display_label[category] + '**\n'
    text += '\n'
    if organized_in_categories:
        filtered_table = terms_sorted_by_localname[terms_sorted_by_localname['tdwgutility_organizedInClass'] == category_value]
    else:
        filtered_table = terms_sorted_by_localname

    # The row index is not used, so the filtered table is read without resetting its index.
    entries = []
    for row_index, row in filtered_table.iterrows():
        if row['type'] != 'Class':
#        if row['rdf_type'] != 'http://www.w3.org/2000/01/rdf-schema#Class':
            curie = row['pref_ns_prefix'] + ":" + row['term_localName']
            curie_anchor = curie.replace(':', '_')
            entries.append('[' + curie + '](#' + curie_anchor + ')')
    if entries:
        text += ' |\n'.join(entries) + ' \n\n'

index_by_name = text


Generating term index by CURIE


Run the following cell to generate an index by term label

In [36]:
# Build the Markdown index of term labels. Each label links to the anchor of the term.
text = '\n\n'

# Comment out the following two lines if there is no index by local names
text = '### 3.2 Index By Label\n\n'
text += '(See also [3.1 Index By Term Name](#31-index-by-term-name))\n\n'
for category, category_value in enumerate(display_order):
    if organized_in_categories:
        text += '**' + display_label[category] + '**\n'
        text += '\n'
        filtered_table = terms_sorted_by_label[terms_sorted_by_label['tdwgutility_organizedInClass'] == category_value]
    else:
        filtered_table = terms_sorted_by_label

    entries = []
    previous_label = None
    for row_index, row in filtered_table.iterrows():
        # The table is sorted by label, so equal labels are adjacent. Only the first row of
        # each run of equal labels gets an entry, which prevents duplicate labels in the index.
        if row['label'] != previous_label:
            curie_anchor = row['pref_ns_prefix'] + "_" + row['term_localName']
            entries.append('[' + row['label'] + '](#' + curie_anchor + ')')
        previous_label = row['label']
    # Every entry except the last is followed by " |"; the last is followed by a space and a
    # blank line. Nothing is added when there are no entries.
    if entries:
        text += ' |\n'.join(entries) + ' \n\n'

index_by_label = text

print(index_by_label)

### 3.2 Index By Label

(See also [3.1 Index By Term Name](#31-index-by-term-name))

**DataSet**

[Address](#abcd2_TechnicalContact-Address) |
[ContentContact](#abcd2_ContentContact) |
[ContentContacts](#abcd2_ContentContacts) |
[Contributors](#abcd2_DataSet-Contributors) |
[Coverage](#abcd2_DataSet-Coverage) |
[Creators](#abcd2_DataSet-Creators) |
[DataSet](#abcd2_DataSet) |
[DataSetExtension](#abcd2_DataSetExtension) |
[DataSets](#abcd2_DataSets) |
[DatasetGUID](#abcd2_DatasetGUID) |
[DatasetID](#abcd2_DatasetID) |
[DateCreated](#abcd2_DataSet-RevisionData-DateCreated) |
[DateIssued](#abcd2_DataSet-DateIssued) |
[DateModified](#abcd2_DataSet-DateModified) |
[Description](#abcd2_DataSet-Description) |
[Details](#abcd2_DataSet-Description-Details) |
[Email](#abcd2_ContentContact-Email) |
[GeoEcologicalTerm](#abcd2_DataSet-GeoecologicalTerm) |
[GeoecologicalTerms](#abcd2_DataSet-GeoecologicalTerms) |
[IconURI](#abcd2_DataSet-IconURI) |
[InformationWithheld](#abcd2_DataSet-InformationWit

In [40]:
decisions_df = pd.read_csv('https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/decisions/decisions-links.csv', na_filter=False)

# Group the decision local names by the IRI of the resource they affect, keeping file order,
# so that the decisions table is scanned once instead of once per term.
decisions_by_resource = {}
for drow_index, drow in decisions_df.iterrows():
    decisions_by_resource.setdefault(drow['linked_affected_resource'], []).append(drow['decision_localName'])


def _table_row(heading, content):
    """Return the HTML of one table row with a heading cell and a content cell."""
    return '\t\t<tr>\n\t\t\t<td>' + heading + '</td>\n\t\t\t<td>' + content + '</td>\n\t\t</tr>\n'


# generate a table for each term, with terms grouped by category

# generate the Markdown for the terms table
# The pieces are collected in a list and joined once at the end.
pieces = ['## 4 Vocabulary\n']
for category, category_value in enumerate(display_order):
    if organized_in_categories:
        pieces.append('### 4.' + str(category + 1) + ' ' + display_label[category] + '\n')
        pieces.append('\n')
        pieces.append(display_comments[category]) # insert the comments for the category, if any.
        filtered_table = terms_sorted_by_localname[terms_sorted_by_localname['tdwgutility_organizedInClass'] == category_value]
    else:
        filtered_table = terms_sorted_by_localname

    # The row index is not used, so the filtered table is read without resetting its index.
    for row_index, row in filtered_table.iterrows():
        curie = row['pref_ns_prefix'] + ":" + row['term_localName']
        curieAnchor = curie.replace(':', '_')
        uri = row['pref_ns_uri'] + row['term_localName']

        pieces.append('<table>\n')
        pieces.append('\t<thead>\n')
        pieces.append('\t\t<tr>\n')
        pieces.append('\t\t\t<th colspan="2"><a id="' + curieAnchor + '"></a>Term Name  ' + curie + '</th>\n')
        pieces.append('\t\t</tr>\n')
        pieces.append('\t</thead>\n')
        pieces.append('\t<tbody>\n')

        pieces.append(_table_row('Term IRI', '<a href="' + uri + '">' + uri + '</a>'))
        pieces.append(_table_row('Modified', row['term_modified']))

        if row['version_iri'] != '':
            pieces.append(_table_row('Term version IRI', '<a href="' + row['version_iri'] + '">' + row['version_iri'] + '</a>'))

        pieces.append(_table_row('Label', row['label']))

        if row['term_deprecated'] != '':
            pieces.append(_table_row('', '<strong>This term is deprecated and should no longer be used.</strong>'))

        #pieces.append(_table_row('Definition', row['rdfs_comment']))
        pieces.append(_table_row('Definition', row['definition']))

        #if row['skos_scopeNote'] != '':
        if row['usage'] != '':
            #pieces.append(_table_row('Usage', convert_link(convert_code(row['skos_scopeNote']))))
            pieces.append(_table_row('Usage', createLinks(row['usage'])))

        # Notes are only written when the table has a separate "notes" column. Previously this
        # row repeated the usage text, so the same content appeared under both headings.
        #if row['dcterms_description'] != '':
        if 'notes' in row.index and row['notes'] != '':
            #pieces.append(_table_row('Notes', convert_link(convert_code(row['dcterms_description']))))
            pieces.append(_table_row('Notes', createLinks(row['notes'])))

        if row['examples'] != '':
            pieces.append(_table_row('Examples', convert_examples(convert_link(convert_code(row['examples'])))))

        if vocab_type in (2, 3) and row['controlled_value_string'] != '': # controlled vocabulary
            pieces.append(_table_row('Controlled value', row['controlled_value_string']))

        if vocab_type == 3 and row['skos_broader'] != '': # controlled vocabulary with skos:broader relationships
            broader_anchor = row['skos_broader'].replace(':', '_')
            pieces.append(_table_row('Has broader concept', '<a href="#' + broader_anchor + '">' + row['skos_broader'] + '</a>'))

        # With the short type names used in this data ('Property', 'Class', 'Concept'), the
        # displayed text is the stored value itself, so no translation is needed. The variant
        # for files that store full rdf_type IRIs has to map them to these short names:
        #if row['rdf_type'] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property': display 'Property'
        #elif row['rdf_type'] == 'http://www.w3.org/2000/01/rdf-schema#Class': display 'Class'
        #elif row['rdf_type'] == 'http://www.w3.org/2004/02/skos/core#Concept': display 'Concept'
        #else: display row['rdf_type'] # this should rarely happen
        pieces.append(_table_row('Type', row['type']))

        # Look up decisions related to this term
        for decision_local_name in decisions_by_resource.get(uri, []):
            decision_iri = 'http://rs.tdwg.org/decisions/' + decision_local_name
            pieces.append(_table_row('Executive Committee decision', '<a href="' + decision_iri + '">' + decision_iri + '</a>'))

        pieces.append('\t</tbody>\n')
        pieces.append('</table>\n')
        pieces.append('\n')
    pieces.append('\n')
text = ''.join(pieces)
term_table = text

print(term_table)

## 4 Vocabulary
### 4.1 DataSet

<table>
	<thead>
		<tr>
			<th colspan="2"><a id="abcd2_ContentContact"></a>Term Name  abcd2:ContentContact</th>
		</tr>
	</thead>
	<tbody>
		<tr>
			<td>Term IRI</td>
			<td><a href="http://rs.tdwg.org/abcd2/terms/ContentContact">http://rs.tdwg.org/abcd2/terms/ContentContact</a></td>
		</tr>
		<tr>
			<td>Modified</td>
			<td></td>
		</tr>
		<tr>
			<td>Term version IRI</td>
			<td><a href="http://rs.tdwg.org/abcd2/terms/version/ContentContact-2024-11-13">http://rs.tdwg.org/abcd2/terms/version/ContentContact-2024-11-13</a></td>
		</tr>
		<tr>
			<td>Label</td>
			<td>ContentContact</td>
		</tr>
		<tr>
			<td>Definition</td>
			<td>An administrative contact usually representing the agent acting as the original supplier or custodian of the dataset.</td>
		</tr>
		<tr>
			<td>Type</td>
			<td>Class</td>
		</tr>
	</tbody>
</table>

<table>
	<thead>
		<tr>
			<th colspan="2"><a id="abcd2_ContentContact-@preferred"></a>Term Name  abcd2:ContentContact-@prefer

Modify to display the indices that you want

In [41]:
# Assemble the generated part of the page. The active line includes both indices followed by
# the term tables; the commented-out alternative leaves out the index by term name.
#text = index_by_label + term_table
text = index_by_name + index_by_label + term_table

In [43]:
# read in header and footer, merge with terms table, and output

# Read the static header and footer. The context managers close the files even if reading fails.
with open(headerFileName, 'rt', encoding='utf-8') as headerObject:
    header = headerObject.read()

with open(footerFileName, 'rt', encoding='utf-8') as footerObject:
    footer = footerObject.read()

# NOTE(review): this assignment replaces the outFileName chosen in the configuration section.
# Confirm which path is intended and keep only one of the two assignments.
outFileName = "../abcd.md"
output = header + text + footer
with open(outFileName, 'wt', encoding='utf-8') as outputObject:
    outputObject.write(output)

print('done')

done
