In [1]:
import pickle
import hashlib

import rdflib

First, we import some default namespaces from `rdflib` and define additional ones. For DiGA, we define a provisional namespace under w3id.org, but we will have to revise this eventually.

In [2]:
from rdflib.namespace import RDF, DC, DCTERMS, SKOS

# Provisional DiGA namespaces under w3id.org: one for the vocabulary terms,
# one for the bibliographic sources the terms are taken from.
diga_terms = rdflib.Namespace('https://w3id.org/diga/terms/')
diga_source = rdflib.Namespace('https://w3id.org/diga/source/')

# Additional vocabularies declared by hand: BIBO (bibliographic description),
# the ISO 25964 SKOS extension (typed broader relations) and SKOS-XL (labels
# as resources).
bibo = rdflib.Namespace('http://purl.org/ontology/bibo/')
skos_thes = rdflib.Namespace('http://purl.org/iso25964/skos-thes#')
skosxl = rdflib.Namespace('http://www.w3.org/2008/05/skos-xl#')

We set up our graph and bind a few namespaces for nicer serialization.

In [3]:
# Graph that only holds the description of the bibliographic sources
g = rdflib.Graph()

# Register readable prefixes so the Turtle output uses them instead of full URIs
for prefix, namespace in (
    ('dct', DCTERMS),
    ('skos', SKOS),
    ('bibo', bibo),
    ('diga_source', diga_source),
):
    g.bind(prefix, namespace)

We follow [Getty AAT](https://www.getty.edu/research/tools/vocabularies/lod/aat_semantic_representation.pdf)’s mechanism for pointing to a source for concepts and labels. At this stage, all concepts and labels stem from the _Repertorio_, so this might appear redundant, but we’ll have additional terms soon, as well as translations in other languages for which we want to note the source.

Thus, we first define the repertorio as a `bibo:Document` that we can point to.

In [4]:
# Describe the Repertorio as a bibo:Document so that concepts and labels can
# later point to it via dct:source.
g.add((diga_source.repertorio, RDF.type, bibo.Document))
g.add((diga_source.repertorio, bibo.shortTitle, rdflib.Literal('Faccenna and Filigenzi. Repertorio. 2007')))
g.add((diga_source.repertorio, DCTERMS.title, rdflib.Literal('Repertorio terminologico per la schedatura delle sculture dell’arte gandharica')))
# The full bibliographic citation is kept as a plain note
g.add((diga_source.repertorio, SKOS.note, rdflib.Literal('Faccenna, Domenico, and Anna Filigenzi. 2007. Repertorio terminologico per la schedatura delle sculture dell’arte gandharica – Sulla base dei materiali provenienti dagli scavi della Missione Archeologica Italiana dell’IsIAO nello Swat, Pakistan. Rome: IsIAO.')))

# Graph.serialize() returns bytes in rdflib < 6 but str from rdflib 6 on, where
# an unconditional .decode() raises AttributeError. Decode only when needed so
# the cell runs on both.
turtle_text = g.serialize(format='turtle')
if isinstance(turtle_text, bytes):
    turtle_text = turtle_text.decode('utf8')
print(turtle_text)

@prefix bibo: <http://purl.org/ontology/bibo/> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix diga_source: <https://w3id.org/diga/source/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

diga_source:repertorio a bibo:Document ;
    dct:title "Repertorio terminologico per la schedatura delle sculture dell’arte gandharica" ;
    bibo:shortTitle "Faccenna and Filigenzi. Repertorio. 2007" ;
    skos:note "Faccenna, Domenico, and Anna Filigenzi. 2007. Repertorio terminologico per la schedatura delle sculture dell’arte gandharica – Sulla base dei materiali provenienti dagli scavi della Missione Archeologica Italiana dell’IsIAO nello Swat, Pakistan. Rome: IsIAO." .




We save the source vocabulary to a separate file that we can import later.

In [5]:
# Persist the source descriptions as Turtle; the file is opened in binary
# mode and the serializer writes straight into it.
with open('diga_sources.ttl', 'wb') as ttl_file:
    g.serialize(format='turtle', destination=ttl_file)

Now we create a new graph for the actual vocabulary.

In [6]:
# Fresh graph for the vocabulary itself; from here on `g` no longer refers to
# the sources graph created earlier.
g = rdflib.Graph()

# Register readable prefixes for every namespace used in the vocabulary
for prefix, namespace in (
    ('dc', DC),
    ('dct', DCTERMS),
    ('skos', SKOS),
    ('skosxl', skosxl),
    ('skos-thes', skos_thes),
    ('diga_terms', diga_terms),
    ('diga_source', diga_source),
):
    g.bind(prefix, namespace)

Our vocabulary is also a `skos:ConceptScheme`.

In [7]:
# The scheme URI is the terms namespace URI without its trailing slash
diga_terms_cs = rdflib.URIRef('https://w3id.org/diga/terms')
scheme_triple = (diga_terms_cs, RDF.type, SKOS.ConceptScheme)
g.add(scheme_triple)

We need to assign URIs to each concept. Small, closed vocabularies like DC use human-readable URIs that express the meaning, like `dc:title`. Other, large ones like AAT use numerical IDs, e.g. `aat:300423650` (double-pointed pick). AAT uses its database IDs for this. Since we don’t have IDs, we need to generate them. Instead of just generating random IDs, we generate an ID from the label in the tree. That allows for a certain reproducibility. We convert hex codes to decimals simply because they look less arcane.

In [8]:
def generate_id(label):
    """Derive a reproducible numeric ID from a label.

    The label is UTF-8 encoded and hashed with SHA-1; the first eight hex
    digits of the digest (32 bits) are converted to a base-10 number.

    :param label: text to derive the ID from
    :return: the ID as a string of decimal digits
    """
    digest = hashlib.sha1(label.encode('utf-8')).hexdigest()
    # Only the leading 8 hex digits are kept, rendered as a decimal number
    return str(int(digest[:8], 16))


generate_id('DiGA')

'2443743546'

Now we load the concept tree we saved during [OCR extraction](Extract_OCR.ipynb) and convert it to RDF.

In [9]:
# NOTE: unpickling can execute arbitrary code, so only load a
# repertorio_tree.pickle that was written by the OCR extraction notebook.
with open('repertorio_tree.pickle', 'rb') as tree_file:
    tree = pickle.load(tree_file)

As a demonstration, we pick the first non-root node in the tree and show the resulting RDF representation.

In [10]:
# Walk the tree depth-first and stop at the first entry that is not the root;
# `entry` keeps that entry's identifier for the next cell.
for entry in tree.expand_tree(mode=tree.DEPTH):
    if tree[entry].identifier != tree.root:
        # The root is a fake node, so the first real node is the one after it
        break

In [11]:
# Convert the single node selected in the previous cell, to show what the RDF
# for one concept looks like.
node = tree[entry]

# Generate IDs for concept and labels: the concept ID is derived from the
# number plus both labels, each label ID from the number plus that label only.
ident = generate_id(f"({node.data['number']}) {node.data['it']} | {node.data['en']}")
ident_it = generate_id(f"({node.data['number']}) {node.data['it']}")
ident_en = generate_id(f"({node.data['number']}) {node.data['en']}")
term = diga_terms[ident]
term_it = diga_terms[ident_it]
term_en = diga_terms[ident_en]

# Add the concept
g.add((term, RDF.type, SKOS.Concept))
g.add((term, DC.identifier, rdflib.Literal(ident)))
if node.predecessor(tree.identifier) == tree.root:
    # Top level concept
    g.add((term, SKOS.topConceptOf, diga_terms_cs))
else:
    g.add((term, SKOS.inScheme, diga_terms_cs))
g.add((term, DCTERMS.source, diga_source.repertorio))

# Add the labels as SKOS-XL label resources, each with its own source
g.add((term_it, RDF.type, skosxl.Label))
g.add((term_it, skosxl.literalForm, rdflib.Literal(node.data['it'], lang='it')))
g.add((term_it, DCTERMS.source, diga_source.repertorio))

g.add((term_en, RDF.type, skosxl.Label))
g.add((term_en, skosxl.literalForm, rdflib.Literal(node.data['en'], lang='en')))
g.add((term_en, DCTERMS.source, diga_source.repertorio))

g.add((term, skosxl.prefLabel, term_it))
g.add((term, skosxl.prefLabel, term_en))

# Show. Graph.serialize() returns bytes in rdflib < 6 but str from rdflib 6 on,
# where an unconditional .decode() raises AttributeError; decode only when
# needed so the cell runs on both.
turtle_text = g.serialize(format='turtle')
if isinstance(turtle_text, bytes):
    turtle_text = turtle_text.decode('utf8')
print(turtle_text)

@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix diga_source: <https://w3id.org/diga/source/> .
@prefix diga_terms: <https://w3id.org/diga/terms/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix skosxl: <http://www.w3.org/2008/05/skos-xl#> .

diga_terms:1772668966 a skos:Concept ;
    dc:identifier "1772668966" ;
    dct:source diga_source:repertorio ;
    skos:topConceptOf <https://w3id.org/diga/terms> ;
    skosxl:prefLabel diga_terms:2809229738,
        diga_terms:3983648794 .

<https://w3id.org/diga/terms> a skos:ConceptScheme .

diga_terms:2809229738 a skosxl:Label ;
    dct:source diga_source:repertorio ;
    skosxl:literalForm "strumenti per la lavorazione della pietra"@it .

diga_terms:3983648794 a skosxl:Label ;
    dct:source diga_source:repertorio ;
    skosxl:literalForm "tools for working stone"@en .




Now we can do the same thing for the whole tree.

In [12]:
# Convert every node of the tree into a skos:Concept with two SKOS-XL labels.
# The traversal is depth-first, so a parent is always handled before its
# children and its generated ID is available when the children are linked.
for entry in tree.expand_tree(mode=tree.DEPTH):
    node = tree[entry]
    if node.identifier == tree.root:
        # Ignore root, it’s a fake node
        continue

    if node.data['type'] == 'facet' and node.data['it'].startswith('['):
        # Facets in brackets (like “[front face]” or “[overview]”) are not concepts
        continue

    number = node.data['number']
    label_it = node.data['it']
    label_en = node.data['en']

    # Generate IDs for concept and labels
    # NOTE(review): concept and label IDs share one namespace and are 32-bit
    # hashes; identical Italian and English labels yield the same label URI.
    ident = generate_id(f"({number}) {label_it} | {label_en}")
    ident_it = generate_id(f"({number}) {label_it}")
    ident_en = generate_id(f"({number}) {label_en}")
    term = diga_terms[ident]
    term_it = diga_terms[ident_it]
    term_en = diga_terms[ident_en]
    # Write identifier to the node data for later use
    node.data['ident'] = ident

    # Add the concept
    g.add((term, RDF.type, SKOS.Concept))
    g.add((term, DC.identifier, rdflib.Literal(ident)))

    # Find the parent concept. Bracketed facets are skipped above and therefore
    # never get an 'ident'; looking one up directly would raise a KeyError for
    # their children. Climb to the closest ancestor that was emitted as a
    # concept instead, stopping at the root.
    # NOTE(review): a node whose ancestors are all skipped facets becomes a top
    # concept; confirm this is the intended modelling.
    parent_id = node.predecessor(tree.identifier)
    while parent_id != tree.root and 'ident' not in tree[parent_id].data:
        parent_id = tree[parent_id].predecessor(tree.identifier)

    if parent_id == tree.root:
        # Top level concept
        g.add((term, SKOS.topConceptOf, diga_terms_cs))
    else:
        g.add((term, SKOS.inScheme, diga_terms_cs))
        # Add broader* relation to parent node, depending on type
        if node.data['type'] == 'part':
            broader = skos_thes.broaderPartitive
        else:
            broader = skos_thes.broaderGeneric
        parent_term = diga_terms[tree[parent_id].data['ident']]
        g.add((term, broader, parent_term))

    g.add((term, DCTERMS.source, diga_source.repertorio))

    # Add the labels: one SKOS-XL label resource per language, each carrying
    # its own source and attached to the concept as a preferred label
    for label_term, lang in ((term_it, 'it'), (term_en, 'en')):
        g.add((label_term, RDF.type, skosxl.Label))
        g.add((label_term, skosxl.literalForm, rdflib.Literal(node.data[lang], lang=lang)))
        g.add((label_term, DCTERMS.source, diga_source.repertorio))
        g.add((term, skosxl.prefLabel, label_term))

Save the graph to a file.

In [13]:
# Persist the complete vocabulary graph as Turtle; the file is opened in
# binary mode and the serializer writes straight into it.
with open('repertorio.ttl', 'wb') as ttl_file:
    g.serialize(format='turtle', destination=ttl_file)

Done!