In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells read the lemma CSV from and
# write the Turtle file to paths starting with 'drive/MyDrive/', i.e. relative
# to the current working directory.
drive.mount('/content/drive')

In [None]:
!pip install rdflib # https://rdflib.readthedocs.io/en/stable/
!pip install SPARQLWrapper
!pip install oxrdflib

In [None]:
import pandas as pd
import logging
import rdflib
import oxrdflib
import pyoxigraph
import os
# basicConfig() does nothing when the root logger already has handlers, which
# is why the recorded outputs show the default 'INFO:__main__:...' layout
# instead of the format requested here. force=True replaces existing handlers
# so the timestamped format actually takes effect.
logging.basicConfig(format='%(asctime)s %(message)s', force=True)
# Notebook-wide logger; DEBUG level so the INFO progress messages are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Create a knowledge graph

## 1. Instantiate graph

For graphs with lots of nodes and edges, you should use a triple store: see https://rdflib.readthedocs.io/en/7.1.0/plugin_stores.html

In [None]:
from rdflib import Graph, Literal, URIRef

In [None]:
# Back the graph with the 'Oxigraph' store plugin rather than rdflib's default store.
g = Graph(store='Oxigraph')

## 2. Bind namespaces

By default, the `namespace` module defines many common namespaces, but others can be manually added!

In [None]:
from rdflib.namespace import RDF, RDFS, XSD, DCTERMS
from rdflib import Namespace

In [None]:
# Namespaces declared by hand. A Namespace builds term IRIs by appending the
# term name to this string, so every base IRI must end in '/' or '#';
# without the trailing slash SCHEMA.name would become 'https://schema.orgname'.
SCHEMA = Namespace('https://schema.org/')
WIKIENTITY = Namespace("http://www.wikidata.org/entity/")
WIKIPROP = Namespace("http://www.wikidata.org/prop/direct/")
WIKIBASE = Namespace("http://wikiba.se/ontology#")
# BIGDATA is declared here but is not bound to a prefix below.
BIGDATA = Namespace("http://www.bigdata.com/rdf#")
WORDNET = Namespace('https://globalwordnet.github.io/schemas/wn#')
LILA = Namespace('http://lila-erc.eu/ontologies/lila/')
ONTOLEX = Namespace('http://www.w3.org/ns/lemon/ontolex#')
LEXINFO = Namespace('http://www.lexinfo.net/ontology/2.0/lexinfo#')
LIME = Namespace("http://www.w3.org/ns/lemon/lime#")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")


# These bindings will act as prefixes when the graph is serialized
g.bind('rdf', RDF)
g.bind('rdfs', RDFS)
g.bind('xsd', XSD)
g.bind('dct', DCTERMS)
g.bind('schema', SCHEMA)
g.bind('wd', WIKIENTITY)
g.bind('wdt', WIKIPROP)
g.bind('wikibase', WIKIBASE)
g.bind('wn', WORDNET)
g.bind('lila', LILA)
g.bind('ontolex', ONTOLEX)
g.bind('lexinfo', LEXINFO)
g.bind('lime', LIME)
g.bind('skos', SKOS)

# Last expression of the cell: display the (still empty) graph as Turtle.
g.serialize(format='ttl')

## 3. Populate graph with LiLa

In [None]:
import urllib.error
import time
import rdflib.query
import socket
from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions
import re

def transform2dicts(results):
    """Flatten SPARQL JSON bindings into plain ``{variable: value}`` dicts.

    Args:
        results: iterable of binding rows, each mapping a variable name to a
            dict that has a ``'value'`` key.

    Returns:
        A list with one dict per row, mapping each variable name directly to
        the content of its ``'value'`` key.
    """
    return [
        {variable: binding[variable]['value'] for variable in binding}
        for binding in results
    ]

# Maximum number of attempts query() makes before giving up.
MAXRETRY = 5
def query(query: str, endpoint: str):
    """Run a SPARQL SELECT query against ``endpoint`` with retries.

    Args:
        query: the SPARQL query text.
        endpoint: URL of the SPARQL endpoint.

    Returns:
        The result bindings flattened by ``transform2dicts``: a list of
        ``{variable: value}`` dicts, one per result row.

    Raises:
        urllib.error.HTTPError, urllib.error.URLError,
        SPARQLExceptions.EndPointInternalError: re-raised when the last of
            the ``MAXRETRY`` attempts fails. Any other exception propagates
            immediately without a retry.
    """
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    n = 2  # back-off multiplier, grows by one after every failed attempt
    for i in range(MAXRETRY):
        try:
            bindings = sparql.queryAndConvert()['results']['bindings']
            return transform2dicts(bindings)
        # The exception types must be given as a tuple: `A or B or C` evaluates
        # to just `A`, so the other two types would never be caught.
        except (urllib.error.HTTPError,
                SPARQLExceptions.EndPointInternalError,
                urllib.error.URLError) as e:
            if i == MAXRETRY - 1:
                raise
            backoffS = n * 61
            n += 1
            # Report the real waiting time (122s, 183s, ...).
            logger.info('{}, waiting {}s'.format(e, backoffS))
            time.sleep(backoffS)

# SPARQL endpoint of the LiLa knowledge base; later cells pass it as the
# `endpoint` argument of query().
lilaEndpoint = 'https://lila-erc.eu/sparql/lila_knowledge_base/sparql'

In [None]:
# Load the lemma list; the population loop below reads the 'lemma' and 'pos'
# columns of every row. header=0: the first CSV line holds the column names.
df = pd.read_csv('drive/MyDrive/Colab Notebooks/rdflib-lab-lemmas.csv', header=0)
df.head()

In [None]:
# Query template filled with str.format(): the first '{}' receives the lemma's
# written representation and the second the part-of-speech URI. The doubled
# braces '{{' / '}}' are escapes that become the literal SPARQL braces.
# It selects the lemma and the entry of the LatinWordNet lexicon whose
# canonical form is that lemma.
lilaQuery = '''
    PREFIX lila: <http://lila-erc.eu/ontologies/lila/>
    PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
    PREFIX lime: <http://www.w3.org/ns/lemon/lime#>

    SELECT ?lemma ?entry
    WHERE {{
            ?lemma ontolex:writtenRep '{}'@la ;
            		  lila:hasPOS <{}> .
            ?entry ontolex:canonicalForm ?lemma .
            <http://lila-erc.eu/data/lexicalResources/LatinWordNet/Lexicon> lime:entry ?entry .
    }}

'''

In [None]:
# For every (lemma, pos) row of the CSV, look up the LiLa lemma and its
# LatinWordNet entry, then add both nodes and their link to the graph.
for idx, row in df.iterrows():
    logger.info('Querying LiLa for {}...'.format(row['lemma']))
    try:
        result = query(lilaQuery.format(row['lemma'], row['pos']), lilaEndpoint)
    # A tuple is required here: `A or B or C` evaluates to `A` alone, so
    # TimeoutError and RuntimeError would otherwise abort the whole loop.
    except (urllib.error.URLError, TimeoutError, RuntimeError) as e:  # if query fails
        logger.info('{}: {} occurred'.format(row['lemma'], e))
    else:
        for r in result:
            # The lemma node: an ontolex:Form with label, written form and POS.
            lemma = URIRef(r['lemma'])
            g.add((lemma, RDF.type, ONTOLEX.Form))
            g.add((lemma, RDFS.label, Literal(row['lemma'])))
            g.add((lemma, ONTOLEX.writtenRep, Literal(row['lemma'], lang='la')))
            g.add((lemma, LEXINFO.partOfSpeech, URIRef(row['pos'])))

            # The lexical entry node; it gets the same rdfs:label as the lemma.
            entry = URIRef(r['entry'])
            g.add((entry, RDF.type, ONTOLEX.LexicalEntry))
            g.add((entry, RDFS.label, Literal(row['lemma'])))

            g.add((entry, ONTOLEX.canonicalForm, lemma))

## 4. Navigate the graph


### Contains check

In [None]:
# Check if a triple is in the graph
# Membership test: `(s, p, o) in g` is True when that exact triple is in the graph.
lemma = URIRef('http://lila-erc.eu/data/id/lemma/87191')
if (lemma, ONTOLEX.writtenRep, Literal('acerbus', lang='la')) in g:
  print('This graph knows that the written representation of http://lila-erc.eu/data/id/lemma/87191 is a acerbus@la!')
else:
  print('This graph does not know anything about the written representation of http://lila-erc.eu/data/id/lemma/87191!')


# NOTE(review): this URI uses the https scheme, while the LiLa lemma URIs in
# the rest of this notebook use http. Since URIs are compared as written, this
# check would fail even if lemma 112077 is in the graph - confirm whether the
# negative outcome is intended.
lemma = URIRef('https://lila-erc.eu/data/id/lemma/112077')
if (lemma, RDF.type, ONTOLEX.Form) in g:
  print('This graph knows that https://lila-erc.eu/data/id/lemma/112077 is a Form')
else:
  print('This graph does not know anything about https://lila-erc.eu/data/id/lemma/112077!')

### Basic triple matching
`triples()` returns all the subjects, predicates and objects matching the pattern

In [None]:
# None is a wildcard: match every subject typed as ontolex:LexicalEntry.
entry_pattern = (None, RDF.type, ONTOLEX.LexicalEntry)
for s, p, o in g.triples(entry_pattern):
    print(f'{s} is a lexical entry')

Not very readable, huh...

`value()` returns a missing element of an incomplete triple

In [None]:
# Same pattern as before, but print each entry's rdfs:label instead of its URI.
entry_pattern = (None, RDF.type, ONTOLEX.LexicalEntry)
for s, p, o in g.triples(entry_pattern):
    label = g.value(subject=s, predicate=RDFS.label)
    print(f'{label} is a lexical entry')

That's better!

#### Exercise.
1. Write a SPARQL query to LiLa for retrieving the concepts of the given entry
2. While iterating over the graph, print the label of each entry
3. Add the concepts to the graph as nodes and link them to their respective entry

Tip: to get an idea of the graph pattern, explore LiLa starting from the entry page, e.g., http://lila-erc.eu/data/lexicalResources/LatinWordNet/id/LexicalEntry/l_87191

In [None]:
# Query template: '{}' receives the URI of a lexical entry; the doubled braces
# are str.format() escapes for the literal SPARQL braces. It selects every
# concept the entry evokes together with its definition and label.
lwnQuery = '''
    PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?concept ?definition ?label
    WHERE {{
            <{}> ontolex:evokes ?concept .
            ?concept skos:definition ?definition ;
                    rdfs:label ?label .
    }}
'''

# Iterate over all the ontolex:LexicalEntry in the graph.
# list() takes a snapshot first, because the loop body adds triples to the
# same graph that the subjects() generator is reading from.
for entry in list(g.subjects(predicate=RDF.type, object=ONTOLEX.LexicalEntry)):
    label = g.value(subject=entry, predicate=RDFS.label, object=None)
    logger.info('Querying LiLa for {} concepts...'.format(label))
    try:
        result = query(lwnQuery.format(entry), lilaEndpoint)
    # A tuple is required here: `A or B or C` evaluates to `A` alone, so
    # TimeoutError and RuntimeError would otherwise abort the whole loop.
    except (urllib.error.URLError, TimeoutError, RuntimeError) as e:  # if query fails
        logger.info('{}: {} occurred'.format(entry, e))
    else:
        for r in result:
            concept = URIRef(r['concept'])
            definition = r['definition']
            # Separate name so the entry's own label above is not overwritten.
            concept_label = r['label']
            g.add((concept, RDF.type, ONTOLEX.LexicalConcept))
            g.add((concept, RDFS.label, Literal(concept_label)))
            # The definition is always tagged as English; the language tag
            # returned by the endpoint is dropped by transform2dicts().
            g.add((concept, SKOS.definition, Literal(definition, lang='en')))
            g.add((entry, ONTOLEX.evokes, concept))

INFO:__main__:Querying LiLa for uoluntas concepts...
INFO:__main__:Querying LiLa for titulus concepts...
INFO:__main__:Querying LiLa for templum concepts...
INFO:__main__:Querying LiLa for simplex concepts...
INFO:__main__:Querying LiLa for sensus concepts...
INFO:__main__:Querying LiLa for senatus concepts...
INFO:__main__:Querying LiLa for scriptura concepts...
INFO:__main__:Querying LiLa for sapientia concepts...
INFO:__main__:Querying LiLa for sanctus concepts...
INFO:__main__:Querying LiLa for salus concepts...
INFO:__main__:Querying LiLa for sacramentum concepts...
INFO:__main__:Querying LiLa for regnum concepts...
INFO:__main__:Querying LiLa for potestas concepts...
INFO:__main__:Querying LiLa for pontifex concepts...
INFO:__main__:Querying LiLa for oportet concepts...
INFO:__main__:Querying LiLa for nobilitas concepts...
INFO:__main__:Querying LiLa for nepos concepts...
INFO:__main__:Querying LiLa for necessarius concepts...
INFO:__main__:Querying LiLa for ius concepts...
INFO:

Let's examine the graph

In [None]:
entry = 'imperator'
# The lemma (ontolex:Form) and its lexical entry were given the same
# rdfs:label, so g.value(predicate=RDFS.label, object=...) may return either
# node. Only the lexical entry carries ontolex:evokes links, so pick the
# labelled subject that is typed as ontolex:LexicalEntry.
uri = next(
    (
        candidate
        for candidate in g.subjects(predicate=RDFS.label, object=Literal(entry))
        if (candidate, RDF.type, ONTOLEX.LexicalEntry) in g
    ),
    None,
)
print('{} evokes...'.format(entry.upper()))
if uri is None:
    # subject=None would act as a wildcard in g.objects() and list the
    # concepts of every entry, so report the miss instead.
    print('No lexical entry labelled {!r} in the graph'.format(entry))
else:
    for concept in g.objects(subject=uri, predicate=ONTOLEX.evokes):
        definition = g.value(subject=concept, predicate=SKOS.definition, object=None)
        print(definition)

IMPERATOR evokes...
sovereign of the Roman Empire
sovereign of the Holy Roman Empire
a general officer of the highest rank
the male ruler of an empire
a person who rules or commands


## 5. Expand the graph with Wikidata

In [None]:
# Wikidata SPARQL endpoint, passed to query() by the lexeme-linking loop.
# NOTE(review): the name is misspelled ("Endopoint"); it is kept as is because
# the loop below references this exact identifier.
wikidataEndopoint = 'https://query.wikidata.org/sparql'

### Exercise.

Write a SPARQL query to Wikidata for retrieving the URI of the lexeme, given the LiLa lemma. Specify `SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}` at the end of the query.


`FILTER(regex(?lila,"{}"))`
`SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}`


In [None]:
# Query template: '{}' receives the part of the LiLa lemma URI after 'id/'
# (e.g. 'lemma/87191'); doubled braces are str.format() escapes.
# NOTE(review): regex() does substring matching, so 'lemma/8719' would also
# match a stored value 'lemma/87191' - confirm whether an exact comparison is
# wanted once the format of the P11033 values has been checked.
wdlexemeQuery = '''
    PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?lexeme
    WHERE {{
      ?lexeme a ontolex:LexicalEntry ;
          wdt:P11033 ?lila .
    FILTER(regex(?lila,"{}"))
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
    }}'''

# list() takes a snapshot first, because the loop body adds triples to the
# same graph that the subjects() generator is reading from.
for lemma in list(g.subjects(predicate=RDF.type, object=ONTOLEX.Form)):
    logger.info('Querying Wikidata for {} lexeme...'.format(lemma))
    try:
        result = query(wdlexemeQuery.format(lemma.split('id/')[1]), wikidataEndopoint)
    # A tuple is required here: `A or B or C` evaluates to `A` alone.
    except (urllib.error.HTTPError,
            SPARQLExceptions.EndPointInternalError,
            urllib.error.URLError) as e:
        logger.info('{}: {} occurred'.format(lemma, e))
    else:
        # Only link lexemes when this lemma's query succeeded. Outside the
        # try/else, a failed query would reuse the previous lemma's `result`
        # (or raise NameError on the first iteration).
        for r in result:
            lexeme = URIRef(r['lexeme'])
            g.add((lemma, RDFS.seeAlso, lexeme))

INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/130763 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/128926 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/127783 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/124840 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/124355 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/124309 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/127417 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/123349 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/hypolemma/39324 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/123276 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc.eu/data/id/lemma/123079 lexeme...
INFO:__main__:Querying Wikidata for http://lila-erc

In [None]:
# Print every node linked to this LiLa lemma via rdfs:seeAlso (the Wikidata
# lexemes added by the previous loop).
for lexeme in g.objects(subject=URIRef('http://lila-erc.eu/data/id/lemma/87191'), predicate=RDFS.seeAlso):
  print(lexeme)

## 6. Serialize the graph

In [None]:
# Write the graph as UTF-8 encoded Turtle ('ttl'). The destination path is
# relative, so it resolves against the current working directory.
g.serialize(destination='drive/MyDrive/Colab Notebooks/rdflib-lab-graph.ttl',format='ttl',encoding='utf-8')

In [None]:
# Query template: both '{}' placeholders receive the URI of the same lexical
# entry (once for ontolex:evokes, once for the lime:entry membership check);
# doubled braces are str.format() escapes.
lilaQueryConcepts = '''
    PREFIX lila: <http://lila-erc.eu/ontologies/lila/>
    PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
    PREFIX lime: <http://www.w3.org/ns/lemon/lime#>

    SELECT ?concept
    WHERE {{
            <{}> ontolex:evokes ?concept .
            <http://lila-erc.eu/data/lexicalResources/LatinWordNet/Lexicon> lime:entry <{}> . # Assicurati che questa parte sia corretta se stai usando l'URI dell'entry
    }}
'''

# NOTE: the graph was serialized to file in the cell before this one, so the
# triples added here exist only in memory unless serialization is run again.
# list() takes a snapshot first, because the loop body adds triples to the
# same graph that the subjects() generator is reading from.
for entry_uri in list(g.subjects(RDF.type, ONTOLEX.LexicalEntry)):
    # 1. Print the entry's label and URI (for debugging)
    entry_label = g.value(subject=entry_uri, predicate=RDFS.label)
    if entry_label:
        print(f'Processing entry for concepts: {entry_label} ({entry_uri})')

    # 2. Query LiLa for the concepts evoked by this entry
    logger.info('Querying LiLa for concepts of entry {}...'.format(entry_uri))
    try:
        # The entry URI is passed twice: once for ontolex:evokes and once for lime:entry
        result = query(lilaQueryConcepts.format(str(entry_uri), str(entry_uri)), lilaEndpoint)
        print(f"Query result for {entry_uri}: {result}")  # debug output
    # A tuple is required here: `A or B or C` evaluates to `A` alone, so
    # TimeoutError and RuntimeError would otherwise abort the whole loop.
    except (urllib.error.URLError, TimeoutError, RuntimeError) as e:
        logger.info('{}: {} occurred during concept query'.format(entry_uri, e))

    else:
        # 3. Add the concepts to the graph
        if result:  # only iterate when the query returned rows
            for r in result:
                concept = URIRef(r['concept'])
                # Same class as the earlier concept-loading cell uses;
                # OntoLex defines LexicalConcept, not Concept.
                g.add((concept, RDF.type, ONTOLEX.LexicalConcept))
                # Also give the concept an rdfs:label derived from the entry's label
                if entry_label:
                    g.add((concept, RDFS.label, Literal(f"Concept of {entry_label}")))
                g.add((entry_uri, ONTOLEX.evokes, concept))
                print(f"  Added concept: {concept} linked to {entry_uri}")  # debug output

        else:
            print(f"  No concepts found in LiLa for entry: {entry_uri}")