In [1]:
from rdflib import Graph, ConjunctiveGraph
from SPARQLWrapper import SPARQLWrapper, BASIC

import pandas as pd
import time
import re

# Optional step: download the data from the LOV website

In [2]:
import requests
import gzip
import shutil

# Location of the gzipped N-Quads dump on the LOV website.
URL = "https://lov.linkeddata.es/lov.nq.gz"

# Fail fast on an HTTP error so an error page is never saved as the archive;
# the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(URL, timeout=60)
response.raise_for_status()

# The context manager guarantees the archive is flushed and closed before it
# is re-opened for decompression below.
with open("lov.nq.gz", "wb") as f_archive:
    f_archive.write(response.content)

# Decompress lov.nq.gz into lov.nq.
with gzip.open('lov.nq.gz', 'rb') as f_in, open('lov.nq', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Remove lines that contain syntax problems

In [3]:
# Matches any <...> token (no whitespace inside) whose content does not start
# with "http". Written as a raw string so "\s" is not an invalid string escape,
# and compiled once instead of on every line.
bad_iri = re.compile(r"<(?!http)[^\s]*>")

# Copy lov.nq to lov_clean.nq, dropping every line that contains such a token.
# The `with` statement closes both files even if reading or writing raises.
with open("lov.nq", "r", encoding="utf-8") as f, \
        open("lov_clean.nq", "w", encoding="utf-8") as f_write:
    for line in f:
        if not bad_iri.search(line):
            f_write.write(line)

# Load the Data

In [4]:
# Graph object `g` that every later cell queries.
g=ConjunctiveGraph()
# Parse the cleaned dump produced by the previous step.
# NOTE(review): no explicit format is passed; presumably rdflib infers N-Quads
# from the ".nq" extension — confirm.
# NOTE(review): as the last expression of the cell, the returned graph's repr is
# displayed, and the stored output contains an absolute local path.
g.parse(source="lov_clean.nq")

<Graph identifier=file:///C:/Users/thiba/OneDrive/Documents/GitHub/CORGI_Catalog/0.%20Ontologies%20Data/LOV/lov_clean.nq (<class 'rdflib.graph.Graph'>)>

# Retrieve the data for each property

## Type 

In [5]:
# Select every resource explicitly typed as owl:DatatypeProperty,
# owl:ObjectProperty or rdf:Property, together with that type.
q = """
SELECT ?property ?type {
    VALUES ?type { <http://www.w3.org/2002/07/owl#DatatypeProperty> <http://www.w3.org/2002/07/owl#ObjectProperty> <http://www.w3.org/1999/02/22-rdf-syntax-ns#Property>}
    ?property <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type.
}
"""

# props maps each property identifier to a dict of its collected attributes.
props = {}

for row in g.query(q):
    prop = row["property"]
    # A property returned with several types keeps only the row iterated last.
    # NOTE(review): the recorded outputs show "context" values of the form
    # "<namespace>#context" — confirm this is the intended notion of context.
    props[prop] = {
        "type": row["type"],
        "context": str(g.context_id(prop)),
    }
print(len(props))

43199


## Label

In [6]:
%%time

# Query the labels in batches of `step` property IRIs per SPARQL request.
keys = list(props)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated <IRI> terms for this batch, injected into VALUES.
    iri_terms = "<" + "> <".join(keys[start:start + step]) + ">"

    # A label is either a direct rdfs:label or the value of any predicate
    # declared rdfs:subPropertyOf rdfs:label.
    q = """
    SELECT ?property ?label {
        VALUES ?property { """ + iri_terms + """ }
        {?property <http://www.w3.org/2000/01/rdf-schema#label> ?label}
        UNION 
        {?property ?props ?label.
         ?props <http://www.w3.org/2000/01/rdf-schema#subPropertyOf> <http://www.w3.org/2000/01/rdf-schema#label>}
    }
    """

    # When a property has several labels, the row iterated last is kept.
    for row in g.query(q):
        props[row["property"]]["label"] = row["label"]

Wall time: 50 s


## Description

In [7]:
%%time

# Predicates treated as carrying a textual description; each one is queried
# separately, in this order.
relation_descriptions = [
    "<http://purl.org/dc/elements/1.1/description>",
    "<http://purl.org/dc/terms/description>",
    "<http://www.w3.org/2000/01/rdf-schema#comment>",
    "<http://www.w3.org/2000/01/rdf-schema#description>",
]

keys = list(props)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated <IRI> terms for this batch, injected into VALUES.
    iri_terms = "<" + "> <".join(keys[start:start + step]) + ">"

    for relation_description in relation_descriptions:
        q = """
        SELECT ?property ?description {
            VALUES ?property { """ + iri_terms + """ }
            ?property """ + relation_description + """  ?description.
        }
        """

        # Each match overwrites the previous value, so for a given property
        # the last predicate in the list that yields a value wins.
        for row in g.query(q):
            props[row["property"]]["description"] = row["description"]

Wall time: 19.6 s


## Domain

In [8]:
%%time

# Query rdfs:domain in batches of `step` property IRIs per SPARQL request.
keys = list(props)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated <IRI> terms for this batch, injected into VALUES.
    iri_terms = "<" + "> <".join(keys[start:start + step]) + ">"

    q = """
    SELECT ?property ?domain {
        VALUES ?property { """ + iri_terms + """ }
        ?property <http://www.w3.org/2000/01/rdf-schema#domain>  ?domain.
    }
    """

    # A property may declare several domains: accumulate them, as strings,
    # in a set that is created on first use.
    for row in g.query(q):
        props[row["property"]].setdefault("domain", set()).add(str(row["domain"]))

Wall time: 7.22 s


## Range

In [9]:
%%time

# Query rdfs:range in batches of `step` property IRIs per SPARQL request.
keys = list(props)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated <IRI> terms for this batch, injected into VALUES.
    iri_terms = "<" + "> <".join(keys[start:start + step]) + ">"

    q = """
    SELECT ?property ?range {
        VALUES ?property { """ + iri_terms + """ }
        ?property <http://www.w3.org/2000/01/rdf-schema#range>  ?range.
    }
    """

    # A property may declare several ranges: accumulate them, as strings,
    # in a set that is created on first use.
    for row in g.query(q):
        props[row["property"]].setdefault("range", set()).add(str(row["range"]))

Wall time: 6.55 s


## Remove rows whose identifier does not start with "http" (bad data)

In [20]:
# One row per property identifier, one column per collected attribute.
props_df = pd.DataFrame.from_dict(props, orient="index")

# Keep only the rows whose identifier starts with "http".
has_http_iri = props_df.index.map(lambda iri: iri.startswith("http"))
props_df = props_df[has_http_iri]

## Final Property Data

In [21]:
# Display the filtered property table (the stored output shows only the first and last rows).
props_df

Unnamed: 0,type,context,range,label,description,domain
http://aims.fao.org/aos/geopolitical.owl#HDIUnit,http://www.w3.org/2002/07/owl#DatatypeProperty,http://aims.fao.org/aos/geopolitical.owl#context,{http://www.w3.org/2001/XMLSchema#string},,,
http://aims.fao.org/aos/geopolitical.owl#HDITotal,http://www.w3.org/2002/07/owl#DatatypeProperty,http://aims.fao.org/aos/geopolitical.owl#context,{http://www.w3.org/2001/XMLSchema#float},,,
http://aims.fao.org/aos/geopolitical.owl#landAreaUnit,http://www.w3.org/2002/07/owl#DatatypeProperty,http://aims.fao.org/aos/geopolitical.owl#context,{http://www.w3.org/2001/XMLSchema#string},land area unit,,
http://aims.fao.org/aos/geopolitical.owl#nameListFR,http://www.w3.org/2002/07/owl#DatatypeProperty,http://aims.fao.org/aos/geopolitical.owl#context,,nameListFR,,
http://aims.fao.org/aos/geopolitical.owl#nationalityIT,http://www.w3.org/2002/07/owl#DatatypeProperty,http://aims.fao.org/aos/geopolitical.owl#context,,nationalityIT,,
...,...,...,...,...,...,...
https://w3id.org/tree#timeQuery,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,https://w3id.org/tree#context,{http://www.w3.org/2001/XMLSchema#dateTime},Time Query,Will search for elements starting from a certa...,{https://w3id.org/tree#Node}
https://w3id.org/tree#remainingItems,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,https://w3id.org/tree#context,{http://www.w3.org/2001/XMLSchema#integer},Remaining Items,Total number of items of this node and its chi...,{https://w3id.org/tree#Node}
https://w3id.org/tree#search,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,https://w3id.org/tree#context,{http://www.w3.org/ns/hydra/core#IriTemplate},Search,The Node can be searched for child nodes.,{https://w3id.org/tree#Node}
https://w3id.org/tree#shape,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,https://w3id.org/tree#context,{http://www.w3.org/ns/shacl#NodeShape},Shape,The SHACL shape the members of the collection ...,{https://w3id.org/tree#Collection}


In [22]:
# Write the property table to CSV in the current working directory.
# NOTE(review): the "domain" and "range" cells hold Python sets, so they are
# presumably written as their string representation — confirm downstream
# readers can parse that.
props_df.to_csv("all_props_from_LOV.csv")

In [23]:
# Summarise row count and non-null counts per column (shows how many properties lack a label, description, domain or range).
props_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42658 entries, http://aims.fao.org/aos/geopolitical.owl#HDIUnit to https://w3id.org/tree#zoom
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   type         42658 non-null  object
 1   context      42658 non-null  object
 2   range        30500 non-null  object
 3   label        38330 non-null  object
 4   description  18768 non-null  object
 5   domain       30180 non-null  object
dtypes: object(6)
memory usage: 2.3+ MB


# Retrieve the data for each class

In [13]:
%%time

# Select every resource explicitly typed as rdfs:Class.
# NOTE(review): owl:Class instances are not queried here — confirm that is intended.
q = """
SELECT ?class  {
    ?class  <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2000/01/rdf-schema#Class>.
}
"""

# classes maps each class identifier to a dict of its collected attributes,
# starting with the context identifier returned by g.context_id.
classes = {
    row["class"]: {"context": str(g.context_id(row["class"]))}
    for row in g.query(q)
}
print(len(classes))

3039
Wall time: 207 ms


## Label

In [14]:
%%time

# Query the labels in batches of `step` class IRIs per SPARQL request.
keys = list(classes)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated <IRI> terms for this batch, injected into VALUES.
    iri_terms = "<" + "> <".join(keys[start:start + step]) + ">"

    # A label is either a direct rdfs:label or the value of any predicate
    # declared rdfs:subPropertyOf rdfs:label.
    q = """
    SELECT ?class ?label {
        VALUES ?class { """ + iri_terms + """ }
        {?class <http://www.w3.org/2000/01/rdf-schema#label> ?label}
        UNION 
        {?class ?props ?label.
         ?props <http://www.w3.org/2000/01/rdf-schema#subPropertyOf> <http://www.w3.org/2000/01/rdf-schema#label>}
    }
    """

    # When a class has several labels, the row iterated last is kept.
    for row in g.query(q):
        classes[row["class"]]["label"] = row["label"]

Wall time: 4.32 s


## Description

In [15]:
%%time

# Predicates treated as carrying a textual description; each one is queried
# separately, in this order.
relation_descriptions = [
    "<http://purl.org/dc/elements/1.1/description>",
    "<http://purl.org/dc/terms/description>",
    "<http://www.w3.org/2000/01/rdf-schema#comment>",
    "<http://www.w3.org/2000/01/rdf-schema#description>",
]

keys = list(classes)
step = 1000

for start in range(0, len(keys), step):
    # Space-separated <IRI> terms for this batch, injected into VALUES.
    iri_terms = "<" + "> <".join(keys[start:start + step]) + ">"

    for relation_description in relation_descriptions:
        q = """
        SELECT ?class ?description {
            VALUES ?class { """ + iri_terms + """ }
            ?class """ + relation_description + """  ?description.
        }
        """

        # Each match overwrites the previous value, so for a given class
        # the last predicate in the list that yields a value wins.
        for row in g.query(q):
            classes[row["class"]]["description"] = row["description"]

Wall time: 1.55 s


## Remove rows whose identifier does not start with "http" (bad data)

In [16]:
# One row per class identifier, one column per collected attribute.
classes_df = pd.DataFrame.from_dict(classes, orient="index")

# Keep only the rows whose identifier starts with "http".
has_http_iri = classes_df.index.map(lambda iri: iri.startswith("http"))
classes_df = classes_df[has_http_iri]

## Final Class Data

In [17]:
# Display the filtered class table (the stored output shows only the first and last rows).
classes_df

Unnamed: 0,context,label,description
http://commontag.org/ns#ReaderTag,http://commontag.org/ns#context,Reader Tag,A Tag asserted by the reader (consumer) of a c...
http://commontag.org/ns#AutoTag,http://commontag.org/ns#context,Auto Tag,A Tag asserted by an automated tool on a conte...
http://commontag.org/ns#Tag,http://commontag.org/ns#context,Tag,A Common Tag associating a URI and a keyword t...
http://commontag.org/ns#AuthorTag,http://commontag.org/ns#context,Author Tag,A Tag asserted by the author of a content reso...
http://commontag.org/ns#TaggedContent,http://commontag.org/ns#context,Tagged Content,Content which has one or more Common Tag.
...,...,...,...
https://w3id.org/tree#PrefixRelation,https://w3id.org/tree#context,Prefix Relation,All members of this related node start with th...
https://w3id.org/tree#SubstringRelation,https://w3id.org/tree#context,Substring Relation,All members of this related node contain this ...
https://w3id.org/tree#InBetweenRelation,https://w3id.org/tree#context,In Between Relation,For comparing intervals: all further members a...
https://w3id.org/tree#ConditionalImport,https://w3id.org/tree#context,Conditional Import,Import a page when the tree:path is interestin...


In [18]:
# Summarise row count and non-null counts per column (shows how many classes lack a label or description).
classes_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3031 entries, http://commontag.org/ns#ReaderTag to https://w3id.org/tree#Node
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   context      3031 non-null   object
 1   label        2989 non-null   object
 2   description  2519 non-null   object
dtypes: object(3)
memory usage: 94.7+ KB


In [19]:
# Write the class table to CSV in the current working directory.
classes_df.to_csv("all_classes_from_LOV.csv")