In [1]:
import pickle
import hashlib
import re
import csv
from pathlib import Path

import rdflib

First, we import some default namespaces from `rdflib` and define additional ones. For DiGA, we define a provisional namespace under w3id.org, but we have to revise this eventially.

In [2]:
from rdflib.namespace import RDF, DC, DCTERMS, SKOS, FOAF
skosxl = rdflib.Namespace('http://www.w3.org/2008/05/skos-xl#')
skos_thes = rdflib.Namespace('http://purl.org/iso25964/skos-thes#')
bibo = rdflib.Namespace('http://purl.org/ontology/bibo/')

diga_terms = rdflib.Namespace('https://w3id.org/diga/terms/')
diga_source = rdflib.Namespace('https://w3id.org/diga/source/')

We set up our graph and bind a few namespaces for nicer serialization.

Now we create a new graph for the actual vocabulary.

In [3]:
g = rdflib.Graph()
g.bind('dc', DC)
g.bind('dct', DCTERMS)
g.bind('foaf', FOAF)
g.bind('skos', SKOS)
g.bind('skosxl', skosxl)
g.bind('skos-thes', skos_thes)
g.bind('diga_terms', diga_terms)
g.bind('diga_source', diga_source)

Our vocabulary is also a `skos:ConceptScheme`.

In [4]:
diga_terms_cs = rdflib.URIRef('https://w3id.org/diga/terms')
g.add((diga_terms_cs, RDF.type, SKOS.ConceptScheme))

<Graph identifier=Nc678390f405e443c88dc2557ec198488 (<class 'rdflib.graph.Graph'>)>

We need to assign URIs to each concept. Small, closed vocabularies like DC use human-readable URIs that express the meaning, like `dc:title`. Other, large ones like AAT use numerical IDs, e.g. `aat:300423650` (double-pointed pick). AAT uses its database IDs for this. Since we don’t have IDs, we need to generate ones. Instead of just generating random IDs, we generate an ID from the label in the tree. Compared to completely random identifiers, this allows for a certain reproducibility (multiple runs will produce the same IDs). We convert hex codes to decimals simply because they look less arcane.

In [5]:
def generate_id(label):
    h = hashlib.sha1(label.encode('utf-8')).hexdigest()
    h_short = h[0:8]
    id_int = int(h_short, 16)
    return str(id_int)

generate_id('DiGA')

'2443743546'

In [6]:
top_concept_label = 'narratives'

# Generate IDs for concept and label
ident = generate_id(f"(0) {top_concept_label}")
ident_en = generate_id(f"(0) {top_concept_label} (en)")
top_concept = diga_terms[ident]
top_concept_en = diga_terms[ident_en]

# Add the concept
g.add((top_concept, RDF.type, SKOS.Concept))
g.add((top_concept, DC.identifier, rdflib.Literal(ident)))
g.add((top_concept, SKOS.topConceptOf, diga_terms_cs))

# Add the label
g.add((top_concept_en, RDF.type, skosxl.Label))
g.add((top_concept_en, skosxl.literalForm, rdflib.Literal(top_concept_label, lang='en')))

g.add((top_concept, skosxl.prefLabel, top_concept_en))

<Graph identifier=Nc678390f405e443c88dc2557ec198488 (<class 'rdflib.graph.Graph'>)>

In [7]:
with Path('Identificators_Narratives.csv').open(newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for i, row in enumerate(reader, start=1):
        break

row

{'Level': '1',
 'CONCEPT SOURCE': '',
 'PREF LABEL': 'Previous Births',
 'PREF LABEL SOURCE': '',
 '(sub-episodes)': '',
 'ALT LABEL 1': 'Jātakas, Past lives, previous birth stories',
 'ALT LABEL 1 SOURCE': '',
 'ALT LABEL 2': '',
 'ALT LABEL 2 SOURCE': '',
 'ALT LABEL 3': '',
 'ALT LABEL 3 SOURCE': '',
 'NOTES': 'ALTERNATIVE LABELS: NOTE',
 '': ''}

As a demonstration, we pick an example node in the tree and show the resulting RDF representation.

When calculating an identifier for an entry, the label itself is not enough, because they are not unique. We take row number from the OCR into account.

In [8]:
# Generate IDs for concept and labels
ident = generate_id(f"({i}) {row['PREF LABEL'] or row['(sub-episodes)']}")
ident_en = generate_id(f"({i}) {row['PREF LABEL'] or row['(sub-episodes)']} (en)")
term = diga_terms[ident]
term_en = diga_terms[ident_en]

# Add the concept
g.add((term, RDF.type, SKOS.Concept))
g.add((term, DC.identifier, rdflib.Literal(ident)))
g.add((term, SKOS.inScheme, diga_terms_cs))

# Link to the parent
g.add((term, skos_thes.broaderGeneric, top_concept))

# Add the source
if source := row['CONCEPT SOURCE']:
    g.add((term, DCTERMS.source, diga_source[source]))
    
# Add the labels
g.add((term_en, RDF.type, skosxl.Label))
g.add((term_en, skosxl.literalForm,
       rdflib.Literal((row['PREF LABEL'] or row['(sub-episodes)']).lower(), lang='en')))
g.add((term, skosxl.prefLabel, term_en))

# TODO: add alt labels
if alt_labels := row['ALT LABEL 1']:
    for alt_label in re.split(r',\s*', alt_labels):
        # Generate IDs for alt labels
        ident_en = generate_id(f"({i}) {alt_label} (en)")
        term_en = diga_terms[ident_en]
        # Add the label
        g.add((term_en, RDF.type, skosxl.Label))
        g.add((term_en, skosxl.literalForm, rdflib.Literal(alt_label.lower(), lang='en')))
        g.add((term, skosxl.altLabel, term_en))

# Show
print(g.serialize(format='turtle'))

@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix diga_terms: <https://w3id.org/diga/terms/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix skos-thes: <http://purl.org/iso25964/skos-thes#> .
@prefix skosxl: <http://www.w3.org/2008/05/skos-xl#> .

diga_terms:1115524268 a skos:Concept ;
    dc:identifier "1115524268" ;
    skos-thes:broaderGeneric diga_terms:2550564563 ;
    skos:inScheme <https://w3id.org/diga/terms> ;
    skosxl:altLabel diga_terms:1689164348,
        diga_terms:297365002,
        diga_terms:3397843173 ;
    skosxl:prefLabel diga_terms:467616027 .

diga_terms:1689164348 a skosxl:Label ;
    skosxl:literalForm "previous birth stories"@en .

diga_terms:2550564563 a skos:Concept ;
    dc:identifier "2550564563" ;
    skos:topConceptOf <https://w3id.org/diga/terms> ;
    skosxl:prefLabel diga_terms:3242875618 .

diga_terms:297365002 a skosxl:Label ;
    skosxl:literalForm "past lives"@en .

diga_terms:3242875618 a skosxl:Label ;
    skosxl:literalF

Now we can do the same thing (plus some more sophisiticated handling of preferred and alternative labels) for the whole tree.

To avoid duplicate identifiers, we collect them and check that no generated ID is already taken.

In [9]:
identifiers = {}

last_concepts = {0: top_concept}
last_level = 0

with Path('Identificators_Narratives.csv').open(encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for i, row in enumerate(reader, start=1):
        if not (row['PREF LABEL'] or row['(sub-episodes)']):
            # empty row
            continue
        
        # Generate IDs for concept and labels
        ident = generate_id(f"({i}) {row['PREF LABEL'] or row['(sub-episodes)']}")
        ident_en = generate_id(f"({i}) {row['PREF LABEL'] or row['(sub-episodes)']} (en)")
        term = diga_terms[ident]
        term_en = diga_terms[ident_en]

        # Add the concept
        g.add((term, RDF.type, SKOS.Concept))
        g.add((term, DC.identifier, rdflib.Literal(ident)))
        g.add((term, SKOS.inScheme, diga_terms_cs))

        # Find out level and parent
        is_sub_concept = bool(row['(sub-episodes)'])
        
        if current_level := row['Level']:
            last_level = current_level = int(current_level)
        else:
            current_level = last_level + 1
            if is_sub_concept:
                # Increase again by one
                current_level += 1
        last_concepts[current_level] = term
        
        # Link to the parent
        if is_sub_concept:
            g.add((term, skos_thes.broaderPartitive, last_concepts[current_level - 1]))
        else:
            g.add((term, skos_thes.broaderGeneric, last_concepts[current_level - 1]))

        # Add the source
        if source := row['CONCEPT SOURCE']:
            g.add((term, DCTERMS.source, diga_source[source]))

        # Add the labels
        g.add((term_en, RDF.type, skosxl.Label))
        g.add((term_en, skosxl.literalForm, rdflib.Literal(row['PREF LABEL'] or row['(sub-episodes)'], lang='en')))
        g.add((term, skosxl.prefLabel, term_en))

        # TODO: add alt labels
        if alt_labels := row['ALT LABEL 1']:
            for alt_label in re.split(r',\s*', alt_labels):
                # Generate IDs for alt labels
                ident_en = generate_id(f"({i}) {alt_label} (en)")
                term_en = diga_terms[ident_en]
                # Add the label
                g.add((term_en, RDF.type, skosxl.Label))
                g.add((term_en, skosxl.literalForm, rdflib.Literal(alt_label, lang='en')))
                g.add((term, skosxl.altLabel, term_en))

Save the graph to a file.

In [10]:
with open('narratives.ttl', 'wb') as outfile:
    g.serialize(destination=outfile, format='turtle')

Done!