In [1]:
import pickle
import hashlib
import re
import csv
from pathlib import Path
from itertools import islice

import rdflib

First, we import some default namespaces from `rdflib` and define additional ones. For DiGA, we define a provisional namespace under w3id.org, but we will have to revise this eventually.

In [2]:
from rdflib.namespace import RDF, DC, DCTERMS, SKOS, FOAF
# Namespaces that are not among the ones imported from rdflib.namespace.
# SKOS-XL: labels as resources of their own (skosxl:Label, skosxl:literalForm, ...)
skosxl = rdflib.Namespace('http://www.w3.org/2008/05/skos-xl#')
# ISO 25964 extension of SKOS (provides broaderGeneric, used for the hierarchy below)
skos_thes = rdflib.Namespace('http://purl.org/iso25964/skos-thes#')
bibo = rdflib.Namespace('http://purl.org/ontology/bibo/')

# Provisional DiGA namespaces: one for the vocabulary terms, one for sources
diga_terms = rdflib.Namespace('https://w3id.org/diga/terms/')
diga_source = rdflib.Namespace('https://w3id.org/diga/source/')

We set up our graph and bind a few namespaces for nicer serialization.

Now we create a new graph for the actual vocabulary.

In [3]:
# Fresh, empty graph that will hold the whole vocabulary.
g = rdflib.Graph()

# Register a readable prefix for every namespace we use, so that the
# serialized output shows e.g. `skos:Concept` instead of the full URI.
for prefix, namespace in [
    ('dc', DC),
    ('dct', DCTERMS),
    ('foaf', FOAF),
    ('skos', SKOS),
    ('skosxl', skosxl),
    ('skos-thes', skos_thes),
    ('diga_terms', diga_terms),
    ('diga_source', diga_source),
]:
    g.bind(prefix, namespace)

Our vocabulary is also a `skos:ConceptScheme`.

In [4]:
# URI of the concept scheme itself: the terms namespace without the trailing slash.
diga_terms_cs = rdflib.URIRef('https://w3id.org/diga/terms')
# Declare it as a skos:ConceptScheme; concepts are linked to it further below.
g.add((diga_terms_cs, RDF.type, SKOS.ConceptScheme))

<Graph identifier=Ndc1c9e78f1834db89817c94cc9050d8f (<class 'rdflib.graph.Graph'>)>

We need to assign URIs to each concept. Small, closed vocabularies like DC use human-readable URIs that express the meaning, like `dc:title`. Other, large ones like AAT use numerical IDs, e.g. `aat:300423650` (double-pointed pick). AAT uses its database IDs for this. Since we don’t have such IDs, we need to generate them. Instead of just generating random IDs, we generate an ID from the label in the tree. Compared to completely random identifiers, this allows for a certain reproducibility (multiple runs will produce the same IDs). We convert the hexadecimal hash to a decimal number simply because it looks less arcane.

In [5]:
def generate_id(label):
    """Derive a reproducible numeric identifier from a label.

    The label is UTF-8 encoded and hashed with SHA-1.  The first four bytes of
    the digest (i.e. the first eight hex digits) are read as an unsigned
    big-endian integer, which is returned as a decimal string.

    Args:
        label: The text to derive the identifier from.

    Returns:
        The identifier as a string of decimal digits.
    """
    digest = hashlib.sha1(label.encode('utf-8')).digest()
    # Four bytes, big-endian == the first eight hex digits parsed as base 16
    return str(int.from_bytes(digest[:4], 'big'))

generate_id('DiGA')

'2443743546'

In [6]:
# The root of the hierarchy: a single top concept that all categories hang below.
top_concept_label = 'figures'

# Generate IDs for concept and label
# The "(figure 0)" prefix follows the same "(figure <row number>)" key pattern that is
# used for the CSV entries further below.
ident = generate_id(f"(figure 0) {top_concept_label}")
ident_en = generate_id(f"(figure 0) {top_concept_label} (en)")
top_concept = diga_terms[ident]
top_concept_en = diga_terms[ident_en]

# Add the concept
g.add((top_concept, RDF.type, SKOS.Concept))
g.add((top_concept, DC.identifier, rdflib.Literal(ident)))
g.add((top_concept, SKOS.topConceptOf, diga_terms_cs))

# Add the label
# The label is a resource of its own (skosxl:Label) carrying the English literal.
g.add((top_concept_en, RDF.type, skosxl.Label))
g.add((top_concept_en, skosxl.literalForm, rdflib.Literal(top_concept_label, lang='en')))

# Attach the label resource to the concept as its preferred label
g.add((top_concept, skosxl.prefLabel, top_concept_en))

<Graph identifier=Ndc1c9e78f1834db89817c94cc9050d8f (<class 'rdflib.graph.Graph'>)>

The category hierarchy

In [7]:
# Category tree as nested (label, children) tuples; leaves have an empty child list.
# The labels are hashed into the concept IDs by add_skos(), and the CSV import looks
# categories up via generate_id(<lower-cased column name>), so a leaf label has to
# equal the lower-cased CSV column header exactly.
# NOTE(review): 'literay persons' looks like a typo for 'literary persons'; correcting
# it would change the generated ID of that concept -- confirm before changing.
categories= [
  ('historical persons', []),
  ('literay persons', [
    ('human beings', []),
    ('supra-human beings', [
      ('buddha', []),
      ('bodhisattva', []),
      ('deities and spirits', []),
    ]),
  ]),
  ('classes of persons', [
    ('professions and roles', []),
    ('generic mythological figures', []),
    ('fantastic animals and terioanthropomorphic figures', []),
    ('animals', []),
  ]),
]

In [8]:
def add_skos(terms, parent):
    """Recursively add a tree of terms to the graph ``g`` as SKOS concepts.

    Every term becomes a ``skos:Concept`` in the DiGA concept scheme and is
    linked to ``parent`` via ``skos-thes:broaderGeneric``.  A label may hold
    several comma-separated names: the first one becomes the preferred label,
    all others become alternative labels.  Labels are stored lower-cased as
    English SKOS-XL label resources.

    Args:
        terms: Iterable of ``(label, children)`` pairs, where ``children`` is
            again an iterable of such pairs (empty for leaves).
        parent: The graph node the terms are attached to.
    """
    for label, children in terms:
        # First name is the preferred one, the rest are alternatives
        preflabel, *alt_labels = re.split(r',\s*', label)

        # The concept ID is derived from the complete, unsplit label
        ident = generate_id(label)
        term = diga_terms[ident]

        # The concept itself, its scheme membership and the link to its parent
        g.add((term, RDF.type, SKOS.Concept))
        g.add((term, DC.identifier, rdflib.Literal(ident)))
        g.add((term, SKOS.inScheme, diga_terms_cs))
        g.add((term, skos_thes.broaderGeneric, parent))

        # Preferred label first, then the alternative labels: they only differ
        # in the property that connects them to the concept
        labelled = [(skosxl.prefLabel, preflabel)]
        labelled += [(skosxl.altLabel, name) for name in alt_labels]
        for predicate, name in labelled:
            label_node = diga_terms[generate_id(f'{name} (en)')]
            g.add((label_node, RDF.type, skosxl.Label))
            g.add((label_node, skosxl.literalForm, rdflib.Literal(name.lower(), lang='en')))
            g.add((term, predicate, label_node))

        # Descend into the subtree, with this concept as the new parent
        add_skos(children, term)

# Add the whole category tree to the graph, directly below the top concept
add_skos(categories, top_concept)
# Show
print(g.serialize(format='turtle'))

@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix diga_terms: <https://w3id.org/diga/terms/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix skos-thes: <http://purl.org/iso25964/skos-thes#> .
@prefix skosxl: <http://www.w3.org/2008/05/skos-xl#> .

diga_terms:1597736316 a skos:Concept ;
    dc:identifier "1597736316" ;
    skos-thes:broaderGeneric diga_terms:390192693 ;
    skos:inScheme <https://w3id.org/diga/terms> ;
    skosxl:prefLabel diga_terms:4036793313 .

diga_terms:2091634236 a skos:Concept ;
    dc:identifier "2091634236" ;
    skos-thes:broaderGeneric diga_terms:390192693 ;
    skos:inScheme <https://w3id.org/diga/terms> ;
    skosxl:prefLabel diga_terms:2996133551 .

diga_terms:2335217424 a skos:Concept ;
    dc:identifier "2335217424" ;
    skos-thes:broaderGeneric diga_terms:2286175997 ;
    skos:inScheme <https://w3id.org/diga/terms> ;
    skosxl:prefLabel diga_terms:3135576354 .

diga_terms:2498849496 a skos:Concept ;
    dc:identifier "249884949

In [9]:
# Read only the first data row of the CSV; it serves as the example entry below.
with Path('Identificators_Figures.csv').open(newline='') as csvfile:
    # Skip first 4 rows
    # (the line after them is what DictReader takes as the header row)
    reader = csv.DictReader(islice(csvfile, 4, None))
    for i, row in enumerate(reader, start=1):
        # Stop right after the first row: `i` (== 1) and `row` stay available afterwards
        break

row

{'index': 'Agni',
 'historical persons': '',
 'Human beings': '',
 'Buddha': '',
 'Bodhisattva': '',
 'Deities and Spirits': 'Agni',
 'professions and roles': '',
 'generic mythological figures': '',
 'Fantastic animals and terioanthropomorphic figures': '',
 'animals': ''}

As a demonstration, we pick an example node in the tree and show the resulting RDF representation.

When calculating an identifier for an entry, the label itself is not enough, because labels are not unique. We therefore take the row number from the OCR into account.

In [10]:
# Get label and category
entity = row.pop('index')
rev = {v: k for k, v in row.items()}
category = rev[entity].lower()

f'{entity}: {category}'

'Agni: deities and spirits'

In [11]:
# Generate IDs for concept and labels
labels = re.split(r',\s*', entity)
pref_label = labels.pop(0)
ident = generate_id(f"(figure {i}) {entity}")
ident_en = generate_id(f"(figure {i}) {pref_label} (en)")
term = diga_terms[ident]
term_en = diga_terms[ident_en]
category_term = diga_terms[generate_id(category)]
# Check the category is already defined
assert (category_term, None, None) in g

# Add the concept
g.add((term, RDF.type, SKOS.Concept))
g.add((term, DC.identifier, rdflib.Literal(ident)))
g.add((term, SKOS.inScheme, diga_terms_cs))

# Link to the parent
g.add((term, skos_thes.broaderGeneric, category_term))

# Add the labels
g.add((term_en, RDF.type, skosxl.Label))
g.add((term_en, skosxl.literalForm, rdflib.Literal(pref_label.lower(), lang='en')))
g.add((term, skosxl.prefLabel, term_en))

# Add alt labels
if labels:
    for alt_label in labels:
        # Generate IDs for alt labels
        ident_en = generate_id(f"(figure {i}) {alt_label} (en)")
        term_en = diga_terms[ident_en]
        # Add the label
        g.add((term_en, RDF.type, skosxl.Label))
        g.add((term_en, skosxl.literalForm, rdflib.Literal(alt_label.lower(), lang='en')))
        g.add((term, skosxl.altLabel, term_en))

# Show
print(g.serialize(format='turtle'))

@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix diga_terms: <https://w3id.org/diga/terms/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix skos-thes: <http://purl.org/iso25964/skos-thes#> .
@prefix skosxl: <http://www.w3.org/2008/05/skos-xl#> .

diga_terms:1597736316 a skos:Concept ;
    dc:identifier "1597736316" ;
    skos-thes:broaderGeneric diga_terms:390192693 ;
    skos:inScheme <https://w3id.org/diga/terms> ;
    skosxl:prefLabel diga_terms:4036793313 .

diga_terms:2091634236 a skos:Concept ;
    dc:identifier "2091634236" ;
    skos-thes:broaderGeneric diga_terms:390192693 ;
    skos:inScheme <https://w3id.org/diga/terms> ;
    skosxl:prefLabel diga_terms:2996133551 .

diga_terms:2335217424 a skos:Concept ;
    dc:identifier "2335217424" ;
    skos-thes:broaderGeneric diga_terms:2286175997 ;
    skos:inScheme <https://w3id.org/diga/terms> ;
    skosxl:prefLabel diga_terms:3135576354 .

diga_terms:2396006312 a skos:Concept ;
    dc:identifier "239600631

Now we can do the same thing (plus some more sophisticated handling of preferred and alternative labels) for the whole tree.

To avoid duplicate identifiers, we collect them and check that no generated ID is already taken.

In [12]:
# Every ID generated in this cell, mapped to the key it was derived from.
# The IDs are truncated hashes, so two different keys may end up with the same
# ID; the registry detects that instead of silently merging two resources.
seen_ids = {}


def _register_id(key):
    """Return the ID for ``key`` and make sure no other key produced it before.

    Args:
        key: The text the ID is derived from.

    Returns:
        The generated ID (a string of decimal digits).

    Raises:
        ValueError: If a different key already led to the same ID.
    """
    new_id = generate_id(key)
    # setdefault() stores the key on first sight, otherwise returns the stored one
    first_key = seen_ids.setdefault(new_id, key)
    if first_key != key:
        raise ValueError(f'ID {new_id} generated for "{key}" is already taken by "{first_key}"')
    return new_id


with Path('Identificators_Figures.csv').open(newline='') as csvfile:
    # Skip first 4 rows
    reader = csv.DictReader(islice(csvfile, 4, None))
    for i, row in enumerate(reader, start=1):
        # Get label and category
        entity = row.pop('index')
        if not entity or not entity.strip():
            # Without this guard the empty name would match any empty category
            # column and create a concept with an empty label.
            print(f'ERROR: Row {i} has no entity. Skipping …')
            continue
        # Reverse lookup: the column that repeats the entity's name is its category
        rev = {v: k for k, v in row.items()}
        try:
            category = rev[entity].lower()
        except KeyError:
            print(f'ERROR: Errorneous entity "{entity}". Skipping …')
            continue

        # Remove plural forms
        # (only after the lookup above, which needs the unmodified cell value)
        entity = re.sub(r'\s*\(\w+?\)', '', entity)

        # Generate IDs for concept and labels
        labels = re.split(r',\s*', entity)
        pref_label = labels.pop(0)
        ident = _register_id(f"(figure {i}) {entity}")
        ident_en = _register_id(f"(figure {i}) {pref_label} (en)")
        term = diga_terms[ident]
        term_en = diga_terms[ident_en]
        category_term = diga_terms[generate_id(category)]
        # Check the category is already defined
        assert (category_term, None, None) in g, f'Unknown category "{category}"'

        # Add the concept
        g.add((term, RDF.type, SKOS.Concept))
        g.add((term, DC.identifier, rdflib.Literal(ident)))
        g.add((term, SKOS.inScheme, diga_terms_cs))

        # Link to the parent
        g.add((term, skos_thes.broaderGeneric, category_term))

        # Add the labels
        g.add((term_en, RDF.type, skosxl.Label))
        g.add((term_en, skosxl.literalForm, rdflib.Literal(pref_label.lower(), lang='en')))
        g.add((term, skosxl.prefLabel, term_en))

        # Add alt labels (the loop simply does nothing if there are none)
        for alt_label in labels:
            # Generate IDs for alt labels
            ident_en = _register_id(f"(figure {i}) {alt_label} (en)")
            term_en = diga_terms[ident_en]
            # Add the label
            g.add((term_en, RDF.type, skosxl.Label))
            g.add((term_en, skosxl.literalForm, rdflib.Literal(alt_label.lower(), lang='en')))
            g.add((term, skosxl.altLabel, term_en))

Save the graph to a file.

In [13]:
# Write the finished vocabulary as Turtle; the file is opened in binary mode
# and handed to rdflib as the serialization target.
output_path = Path('figures.ttl')
with output_path.open(mode='wb') as outfile:
    g.serialize(destination=outfile, format='turtle')

Done!