In [1]:
from rdflib import Graph, ConjunctiveGraph
from SPARQLWrapper import SPARQLWrapper, BASIC

import pandas as pd
import time
import re

# Optional step: download the data from the LOV website

In [2]:
import requests
import gzip
import shutil

# Location of the gzip-compressed N-Quads dump to download.
URL = "https://lov.linkeddata.es/lov.nq.gz"

# Stream the archive to disk in chunks instead of buffering the whole body in
# memory, and fail loudly on an HTTP error so that an error page is never
# saved under the name "lov.nq.gz".
with requests.get(URL, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open("lov.nq.gz", "wb") as f_archive:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f_archive.write(chunk)

# Decompress the archive into "lov.nq"; both handles are closed by the
# context manager even if the copy fails part-way.
with gzip.open("lov.nq.gz", "rb") as f_in, open("lov.nq", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

# Remove lines with syntax problems

In [3]:
# Matches a "<...>" token that contains no whitespace and does not start with
# "http". The search runs over the whole line, so such a token anywhere in the
# line (including inside a literal) causes the line to be dropped.
# A raw string is used so that "\s" reaches the regex engine without Python
# treating it as an (invalid) string escape.
malformed_iri = re.compile(r"<(?!http)[^\s]*>")

# Copy "lov.nq" to "lov_clean.nq", skipping every line that matches the
# pattern. The context manager closes both files even if reading or writing
# raises part-way through.
with open("lov.nq", "r", encoding="utf-8") as f, \
        open("lov_clean.nq", "w", encoding="utf-8") as f_write:
    for line in f:
        if not malformed_iri.search(line):
            f_write.write(line)

# Load the Data

In [4]:
# Load the cleaned dump into a ConjunctiveGraph; the queries below use
# GRAPH ?origin to recover which named graph asserts each statement.
g = ConjunctiveGraph()

# The serialization is stated explicitly instead of being guessed from the
# ".nq" suffix. The trailing ";" suppresses the cell output: the returned
# graph's repr embeds the absolute local path of the parsed file.
g.parse(source="lov_clean.nq", format="nquads");

<Graph identifier=file:///C:/Users/thiba/OneDrive/Documents/GitHub/CORGI_Catalog/0.%20Ontologies%20Data/LOV/lov_clean.nq (<class 'rdflib.graph.Graph'>)>

# Retrieve the data for each property

## Type & Origin

In [48]:
# Select every resource typed as owl:DatatypeProperty, owl:ObjectProperty or
# rdf:Property, together with the named graph that asserts the typing.
q = """
SELECT ?property ?type ?origin {
    VALUES ?type { <http://www.w3.org/2002/07/owl#DatatypeProperty> <http://www.w3.org/2002/07/owl#ObjectProperty> <http://www.w3.org/1999/02/22-rdf-syntax-ns#Property>}
    GRAPH ?origin {?property <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type}.
}
"""

# Maps each property term to a dict with its "type" and its "context" (the
# named-graph identifier as a string). If the query returns several rows for
# the same property, the row seen last replaces the earlier ones.
props = {
    row["property"]: {"type": row["type"], "context": str(row["origin"])}
    for row in g.query(q)
}
print(len(props))

43199


## Label

In [49]:
%%time

# Attach a "label" to each property, querying in batches of `step` terms so
# that the VALUES clause stays at a manageable size.
keys = list(props)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated list of <IRI> terms for this batch.
    values_clause = "<" + "> <".join(keys[start:start + step]) + ">"

    # A label is either a direct rdfs:label or the value of any predicate
    # declared as rdfs:subPropertyOf rdfs:label.
    q = """
    SELECT ?property ?label {
        VALUES ?property { """ + values_clause + """ }
        {?property <http://www.w3.org/2000/01/rdf-schema#label> ?label}
        UNION 
        {?property ?props ?label.
         ?props <http://www.w3.org/2000/01/rdf-schema#subPropertyOf> <http://www.w3.org/2000/01/rdf-schema#label>}
    }
    """

    # When a property has several labels, the one returned last is kept.
    for row in g.query(q):
        props[row["property"]]["label"] = row["label"]

Wall time: 45.3 s


## Description

In [50]:
%%time

# Predicates whose value is stored as a property's "description". They are
# queried in this order, so a value found through a later predicate
# overwrites one found through an earlier predicate.
relation_descriptions = [
    "<http://purl.org/dc/elements/1.1/description>",
    "<http://purl.org/dc/terms/description>",
    "<http://www.w3.org/2000/01/rdf-schema#comment>",
    "<http://www.w3.org/2000/01/rdf-schema#description>",
]

keys = list(props)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated list of <IRI> terms for this batch.
    values_clause = "<" + "> <".join(keys[start:start + step]) + ">"

    for relation_description in relation_descriptions:

        q = """
        SELECT ?property ?description {
            VALUES ?property { """ + values_clause + """ }
            ?property """ + relation_description + """  ?description.
        }
        """

        for row in g.query(q):
            props[row["property"]]["description"] = row["description"]

Wall time: 18.3 s


## Domain

In [51]:
%%time

# Collect the rdfs:domain values of each property, in batches of `step` terms.
keys = list(props)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated list of <IRI> terms for this batch.
    values_clause = "<" + "> <".join(keys[start:start + step]) + ">"

    q = """
    SELECT ?property ?domain {
        VALUES ?property { """ + values_clause + """ }
        ?property <http://www.w3.org/2000/01/rdf-schema#domain>  ?domain.
    }
    """

    # A property may declare several domains: gather them as strings in a
    # set, created on first use.
    for row in g.query(q):
        props[row["property"]].setdefault("domain", set()).add(str(row["domain"]))

Wall time: 7.02 s


## Range

In [52]:
%%time

# Collect the rdfs:range values of each property, in batches of `step` terms.
keys = list(props)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated list of <IRI> terms for this batch.
    values_clause = "<" + "> <".join(keys[start:start + step]) + ">"

    q = """
    SELECT ?property ?range {
        VALUES ?property { """ + values_clause + """ }
        ?property <http://www.w3.org/2000/01/rdf-schema#range>  ?range.
    }
    """

    # A property may declare several ranges: gather them as strings in a
    # set, created on first use.
    for row in g.query(q):
        props[row["property"]].setdefault("range", set()).add(str(row["range"]))

Wall time: 5.85 s


## Remove rows whose identifier does not start with "http" (bad data)

In [53]:
# One row per property; the property term is the index and the collected
# attributes (type, context, label, ...) are the columns.
props_df = pd.DataFrame.from_dict(props, orient="index")

# Keep only the rows whose identifier starts with "http".
is_http = props_df.index.map(lambda term: term.startswith("http"))
props_df = props_df[is_http]

## Final Property Data

In [54]:
# Show the filtered property table as the cell output.
props_df

Unnamed: 0,type,context,label,description,domain,range
http://purl.org/dc/terms/description,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://www.ebu.ch/metadata/ontologies/ebucore/...,description,An account of the resource.\nDescription may i...,{http://linkeddata.finki.ukim.mk/lod/ontology/...,"{http://www.w3.org/2001/XMLSchema#string, http..."
http://purl.org/dc/terms/title,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://www.ebu.ch/metadata/ontologies/ebucore/...,title,Describes the title of an entity (e.g. idea ti...,"{Ncfa4ec87238941d2a40d424c419f2d3a, http://www...","{http://www.w3.org/2001/XMLSchema#string, http..."
http://purl.org/dc/terms/modified,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://www.ebu.ch/metadata/ontologies/ebucore/...,modified,The dcterms:modified property fully represents...,{https://w3id.org/nno/ontology#NeuralNetwork},"{http://www.w3.org/2001/XMLSchema#dateTime, ht..."
http://purl.org/vocab/vann/preferredNamespaceUri,http://www.w3.org/2002/07/owl#DatatypeProperty,http://vivoweb.org/ontology/core,vann:preferredNamespaceUri,The preferred namespace URI to use when using ...,,
http://purl.org/vocab/vann/preferredNamespacePrefix,http://www.w3.org/2002/07/owl#DatatypeProperty,http://kdo.render-project.eu/kdo#,vann:preferredNamespacePrefix,The preferred namespace prefix to use when usi...,,
...,...,...,...,...,...,...
http://ndl.go.jp/dcndl/terms/seriesAlternativeTranscription,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://ndl.go.jp/dcndl/terms/,Series Alternative Transcription,Series Alternativeの読み又は翻字形,,
http://ndl.go.jp/dcndl/terms/holdingIssues,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://ndl.go.jp/dcndl/terms/,Holding Issues,所蔵する逐次刊行物の巻次・年月次,,
http://ndl.go.jp/dcndl/terms/cataloguingStatus,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://ndl.go.jp/dcndl/terms/,Cataloguing Status,書誌レコード作成のステータス,,
http://ndl.go.jp/dcndl/terms/holdingAgent,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://ndl.go.jp/dcndl/terms/,Holding Agent,当該情報資源の保有者,,


In [55]:
# Write the property table to a CSV file in the working directory.
# NOTE(review): the "domain" and "range" columns hold Python sets, so they are
# presumably written as their string representation — confirm that this is
# what the consumers of the CSV expect.
props_df.to_csv("all_props_from_LOV.csv")

In [56]:
# Summarise the property table: row count and non-null count per column.
props_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42658 entries, http://purl.org/dc/terms/description to http://ndl.go.jp/dcndl/terms/record
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   type         42658 non-null  object
 1   context      42658 non-null  object
 2   label        38330 non-null  object
 3   description  18768 non-null  object
 4   domain       30180 non-null  object
 5   range        30500 non-null  object
dtypes: object(6)
memory usage: 2.3+ MB


# Retrieve the data for each class

In [57]:
%%time

# Maps each class term to a dict holding its "context": the identifier (as a
# string) of the named graph that types it as a class. If a class is typed in
# several graphs, the row returned last is the one kept.
classes = {}

# Select every resource typed as rdfs:Class or owl:Class together with the
# named graph asserting it. Full IRIs are used, as in the property query, so
# that the query is self-contained and does not depend on the "rdfs" and
# "owl" prefixes happening to be bound on the graph.
q = """
SELECT ?class ?origin {
    VALUES ?type { <http://www.w3.org/2000/01/rdf-schema#Class> <http://www.w3.org/2002/07/owl#Class> }
    GRAPH ?origin {?class <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type}.
}
"""

for r in g.query(q):
    classes[r["class"]] = {"context": str(r["origin"])}
print(len(classes))

48875
Wall time: 1min 8s


## Label

In [58]:
%%time

# Attach a "label" to each class, querying in batches of `step` terms so that
# the VALUES clause stays at a manageable size.
keys = list(classes)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated list of <IRI> terms for this batch.
    values_clause = "<" + "> <".join(keys[start:start + step]) + ">"

    # A label is either a direct rdfs:label or the value of any predicate
    # declared as rdfs:subPropertyOf rdfs:label.
    q = """
    SELECT ?class ?label {
        VALUES ?class { """ + values_clause + """ }
        {?class <http://www.w3.org/2000/01/rdf-schema#label> ?label}
        UNION 
        {?class ?props ?label.
         ?props <http://www.w3.org/2000/01/rdf-schema#subPropertyOf> <http://www.w3.org/2000/01/rdf-schema#label>}
    }
    """

    # When a class has several labels, the one returned last is kept.
    for row in g.query(q):
        classes[row["class"]]["label"] = row["label"]

Wall time: 48.9 s


## Description

In [59]:
%%time

# Predicates whose value is stored as a class's "description". They are
# queried in this order, so a value found through a later predicate
# overwrites one found through an earlier predicate.
relation_descriptions = [
    "<http://purl.org/dc/elements/1.1/description>",
    "<http://purl.org/dc/terms/description>",
    "<http://www.w3.org/2000/01/rdf-schema#comment>",
    "<http://www.w3.org/2000/01/rdf-schema#description>",
]

keys = list(classes)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated list of <IRI> terms for this batch.
    values_clause = "<" + "> <".join(keys[start:start + step]) + ">"

    for relation_description in relation_descriptions:

        q = """
        SELECT ?class ?description {
            VALUES ?class { """ + values_clause + """ }
            ?class """ + relation_description + """  ?description.
        }
        """

        for row in g.query(q):
            classes[row["class"]]["description"] = row["description"]

Wall time: 20.5 s


## Remove rows whose identifier does not start with "http" (bad data)

In [60]:
# One row per class; the class term is the index and the collected attributes
# (context, label, description) are the columns.
classes_df = pd.DataFrame.from_dict(classes, orient="index")

# Keep only the rows whose identifier starts with "http".
is_http = classes_df.index.map(lambda term: term.startswith("http"))
classes_df = classes_df[is_http]

## Final Class Data

In [61]:
# Show the filtered class table as the cell output.
classes_df

Unnamed: 0,context,label,description
http://www.w3.org/2004/02/skos/core#Concept,http://purl.org/spar/fabio,concept,The class skos:Concept is the class of SKOS co...
http://www.w3.org/ns/person#Person,http://www.w3.org/ns/person,Officer,"An individual person who may be dead or alive,..."
http://data.lirmm.fr/ontologies/oan/PositionArticle,http://data.lirmm.fr/ontologies/oan,Position de l'article,"La classe ""PositionArticle"" indique dans quel ..."
http://data.lirmm.fr/ontologies/oan/Seance,http://data.lirmm.fr/ontologies/oan,Séance,"La classe ""Seance"" représente les différents r..."
http://data.lirmm.fr/ontologies/oan/Amendement,http://data.lirmm.fr/ontologies/oan,Amendement,"La classe ""Amendement"" sert à représenter la n..."
...,...,...,...
http://open.vocab.org/terms/NotAKillerGorilla,http://open.vocab.org/terms,Non Killer-Gorillas,Indicates that something is definitely not a K...
http://open.vocab.org/terms/Course,http://open.vocab.org/terms,Course,
http://open.vocab.org/terms/DeletedEntry,http://open.vocab.org/terms,Deleted Entries,A construct representing a deleted entry in an...
http://open.vocab.org/terms/SummerOlympicGames,http://open.vocab.org/terms,Summer Olympic Games,


In [62]:
# Summarise the class table: row count and non-null count per column.
classes_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36452 entries, http://www.w3.org/2004/02/skos/core#Concept to http://open.vocab.org/terms/UnivCourse
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   context      36452 non-null  object
 1   label        30235 non-null  object
 2   description  17917 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [63]:
# Write the class table to a CSV file in the working directory.
classes_df.to_csv("all_classes_from_LOV.csv")