In [1]:
from rdflib import Graph, ConjunctiveGraph
from SPARQLWrapper import SPARQLWrapper, BASIC

import pandas as pd
import time
import re

# Optional step: download the data from the LOV website

In [2]:
import requests
import gzip
import shutil

# Fetch the gzipped N-Quads dump of LOV and decompress it into "lov.nq"
# in the current working directory.
URL = "https://lov.linkeddata.es/lov.nq.gz"

# Stream the body to disk in chunks so the whole archive is never held in
# memory, close the file deterministically, and fail loudly on an HTTP error
# instead of silently saving an error page as "lov.nq.gz".
with requests.get(URL, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open("lov.nq.gz", "wb") as f_gz:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f_gz.write(chunk)

# Decompress the archive next to it; both handles are closed on exit.
with gzip.open("lov.nq.gz", "rb") as f_in:
    with open("lov.nq", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# Remove lines with syntax problems

In [3]:
# Copy "lov.nq" to "lov_clean.nq", dropping every line that contains a
# "<...>" token (no whitespace inside) whose content does not start with
# "http".
#
# NOTE(review): the filter is purely textual, so it also drops lines whose
# IRIs use another scheme (e.g. "<urn:...>", "<mailto:...>") and lines whose
# literal text happens to contain such a token. Confirm that is acceptable.

# Raw string so "\s" reaches the regex engine instead of being an invalid
# string escape; compiled once rather than re-parsed for every line.
non_http_iri = re.compile(r"<(?!http)[^\s]*>")

# Context managers guarantee both files are closed even if a read or write
# fails part-way through.
with open("lov.nq", "r", encoding="utf-8") as f:
    with open("lov_clean.nq", "w", encoding="utf-8") as f_write:
        for line in f:
            if not non_http_iri.search(line):
                f_write.write(line)

# Load the Data

In [4]:
# Build the in-memory graph that every later query runs against and fill it
# from the cleaned dump. No explicit format is passed to parse().
g = ConjunctiveGraph()
g.parse(source="lov_clean.nq")

<Graph identifier=file:///C:/Users/thiba/OneDrive/Documents/GitHub/CORGI_Catalog/Ontologies%20Data/LOV/lov_clean.nq (<class 'rdflib.graph.Graph'>)>

# Retrieve the data for each property

## Type 

In [5]:
# Find every resource typed as an OWL datatype property, an OWL object
# property, or a plain RDF property.
q = """
SELECT ?property ?type {
    VALUES ?type { <http://www.w3.org/2002/07/owl#DatatypeProperty> <http://www.w3.org/2002/07/owl#ObjectProperty> <http://www.w3.org/1999/02/22-rdf-syntax-ns#Property>}
    ?property <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type.
}
"""

# props maps each property term to a dict of the facts collected about it.
props = {}

for r in g.query(q):
    prop = r["property"]
    # Each row replaces the whole entry, so a property that matches several
    # of the listed types keeps only the type from the row seen last.
    # NOTE(review): `context_id` appears to derive its value from the IRI
    # text itself rather than from the named graph holding the triple --
    # confirm this is the intended meaning of "context".
    props[prop] = {
        "type": r["type"],
        "context": {str(g.context_id(prop))},
    }
print(len(props))

43199


## Label

In [6]:
%%time

# Attach a label to every known property. The property IRIs are injected
# into a VALUES clause in batches of `step` so that no single query string
# lists all of them at once.
keys = list(props)
step = 1000

for i in range(0, len(keys), step):
    batch = keys[i:i+step]

    # Match rdfs:label directly, or any predicate declared as a
    # sub-property of rdfs:label.
    q = """
    SELECT ?property ?label {
        VALUES ?property { <"""+"> <".join(batch)+"""> }
        {?property <http://www.w3.org/2000/01/rdf-schema#label> ?label}
        UNION 
        {?property ?props ?label.
         ?props <http://www.w3.org/2000/01/rdf-schema#subPropertyOf> <http://www.w3.org/2000/01/rdf-schema#label>}
    }
    """

    for r in g.query(q):
        entry = props[r["property"]]
        # Plain assignment: when several labels exist, the one from the row
        # processed last is the one kept.
        entry["label"] = r["label"]
        # Create the context set on first use, then record this context.
        entry.setdefault("context", set()).add(str(g.context_id(r["property"])))

Wall time: 42.1 s


## Description

In [None]:
%%time

# Predicates treated as carrying a textual description. They are queried in
# this order and each hit overwrites the previous one, so for a property
# described through several of them the last listed predicate wins.
# NOTE(review): the RDFS vocabulary does not appear to define
# rdfs:description; presumably kept for vocabularies that use it anyway.
relation_descriptions = [
    "<http://purl.org/dc/elements/1.1/description>",
    "<http://purl.org/dc/terms/description>",
    "<http://www.w3.org/2000/01/rdf-schema#comment>",
    "<http://www.w3.org/2000/01/rdf-schema#description>",
]

keys = list(props)
step = 1000

for i in range(0, len(keys), step):
    # IRIs of the current batch, ready to be spliced into a VALUES clause.
    values = "> <".join(keys[i:i+step])

    for relation_description in relation_descriptions:

        q = """
        SELECT ?property ?description {
            VALUES ?property { <"""+values+"""> }
            ?property """+relation_description+"""  ?description.
        }
        """

        for r in g.query(q):
            entry = props[r["property"]]
            entry["description"] = r["description"]
            # Create the context set on first use, then record this context.
            entry.setdefault("context", set()).add(str(g.context_id(r["property"])))

## Domain

In [None]:
%%time

# Collect the rdfs:domain value(s) of every known property, querying the
# properties in batches of `step` through a VALUES clause.
keys = list(props)
step = 1000

for i in range(0, len(keys), step):
    batch = keys[i:i+step]

    q = """
    SELECT ?property ?domain {
        VALUES ?property { <"""+"> <".join(batch)+"""> }
        ?property <http://www.w3.org/2000/01/rdf-schema#domain>  ?domain.
    }
    """

    for r in g.query(q):
        entry = props[r["property"]]
        # A property may declare several domains, so they accumulate in a
        # set. The values are stored as returned by the query, whereas the
        # context is stored as a plain string.
        entry.setdefault("domain", set()).add(r["domain"])
        entry.setdefault("context", set()).add(str(g.context_id(r["property"])))

## Range

In [None]:
%%time

# Collect the rdfs:range value(s) of every known property, querying the
# properties in batches of `step` through a VALUES clause.
keys = list(props)
step = 1000

for i in range(0, len(keys), step):
    batch = keys[i:i+step]

    q = """
    SELECT ?property ?range {
        VALUES ?property { <"""+"> <".join(batch)+"""> }
        ?property <http://www.w3.org/2000/01/rdf-schema#range>  ?range.
    }
    """

    for r in g.query(q):
        entry = props[r["property"]]
        # A property may declare several ranges, so they accumulate in a
        # set. The values are stored as returned by the query, whereas the
        # context is stored as a plain string.
        entry.setdefault("range", set()).add(r["range"])
        entry.setdefault("context", set()).add(str(g.context_id(r["property"])))

## Final Property Data

In [None]:
# One row per property (the dict keys become the index) and one column per
# collected field; fields missing for a property are left empty.
props_df = pd.DataFrame.from_dict(data=props, orient="index")
props_df

In [None]:
# Write the property table to a CSV file in the current working directory.
props_df.to_csv("all_props_from_LOV.csv")

In [None]:
# Print a summary of the property table (columns, non-null counts, dtypes)
# to see how many properties have each field filled in.
props_df.info()

# Retrieve the data for each class

In [None]:
%%time

# Find every resource explicitly typed as rdfs:Class.
# NOTE(review): only rdfs:Class is matched; resources typed solely as
# owl:Class would not be returned by this pattern -- confirm that is intended.
q = """
SELECT ?class  {
    ?class  <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2000/01/rdf-schema#Class>.
}
"""

# classes maps each class term to a dict of the facts collected about it,
# starting with a one-element context set.
classes = {
    r["class"]: {"context": {str(g.context_id(r["class"]))}}
    for r in g.query(q)
}
print(len(classes))

## Label

In [None]:
%%time

# Attach a label to every known class. The class IRIs are injected into a
# VALUES clause in batches of `step` so that no single query string lists
# all of them at once.
keys = list(classes)
step = 1000

for i in range(0, len(keys), step):
    batch = keys[i:i+step]

    # Match rdfs:label directly, or any predicate declared as a
    # sub-property of rdfs:label.
    q = """
    SELECT ?class ?label {
        VALUES ?class { <"""+"> <".join(batch)+"""> }
        {?class <http://www.w3.org/2000/01/rdf-schema#label> ?label}
        UNION 
        {?class ?props ?label.
         ?props <http://www.w3.org/2000/01/rdf-schema#subPropertyOf> <http://www.w3.org/2000/01/rdf-schema#label>}
    }
    """

    for r in g.query(q):
        entry = classes[r["class"]]
        # Plain assignment: when several labels exist, the one from the row
        # processed last is the one kept.
        entry["label"] = r["label"]
        # Create the context set on first use, then record this context.
        entry.setdefault("context", set()).add(str(g.context_id(r["class"])))

## Description

In [None]:
%%time

# Predicates treated as carrying a textual description. They are queried in
# this order and each hit overwrites the previous one, so for a class
# described through several of them the last listed predicate wins.
relation_descriptions = [
    "<http://purl.org/dc/elements/1.1/description>",
    "<http://purl.org/dc/terms/description>",
    "<http://www.w3.org/2000/01/rdf-schema#comment>",
    "<http://www.w3.org/2000/01/rdf-schema#description>",
]

keys = list(classes)
step = 1000

for i in range(0, len(keys), step):
    # IRIs of the current batch, ready to be spliced into a VALUES clause.
    values = "> <".join(keys[i:i+step])

    for relation_description in relation_descriptions:

        q = """
        SELECT ?class ?description {
            VALUES ?class { <"""+values+"""> }
            ?class """+relation_description+"""  ?description.
        }
        """

        for r in g.query(q):
            entry = classes[r["class"]]
            entry["description"] = r["description"]
            # Create the context set on first use, then record this context.
            entry.setdefault("context", set()).add(str(g.context_id(r["class"])))

## Final Class Data

In [None]:
# One row per class (the dict keys become the index) and one column per
# collected field; fields missing for a class are left empty.
classes_df = pd.DataFrame.from_dict(data=classes, orient="index")
classes_df

In [None]:
# Print a summary of the class table (columns, non-null counts, dtypes)
# to see how many classes have each field filled in.
classes_df.info()

In [None]:
# Write the class table to a CSV file in the current working directory.
classes_df.to_csv("all_classes.csv")