In [1]:
import requests
import time
import numpy as np
from write_data import *
import pandas as pd

### The goal of this notebook is to retrieve data from Wikidata. Since we do not have direct access to a dump, we go through every possible property (until we no longer get any results) to retrieve their labels, descriptions, ranges, domains and types.

### However, it should not be relaunched, as retrieving all the data can take multiple hours.

In [2]:
# Value stored under the "context" key of every property and class record built in this notebook.
context = "http://wikidata.org/"

In [3]:
def generate_all_prop(cpt, step):
    """Build the SPARQL terms for one batch of candidate property IDs.

    Batch number ``cpt`` covers the IDs ``cpt * step`` up to
    ``(cpt + 1) * step - 1``; the terms are separated by single spaces,
    e.g. ``generate_all_prop(1, 3)`` returns ``"wd:P3 wd:P4 wd:P5"``.
    """
    first_id = cpt * step
    end_id = first_id + step  # exclusive upper bound of the batch
    numbers = map(str, range(first_id, end_id))
    return "wd:P" + " wd:P".join(numbers)

# Properties

## Type

In [4]:
# SPARQL endpoint that the retrieval loops of this notebook send their queries to.
url = 'https://query.wikidata.org/sparql'
# Property URI -> record with "context", "label", "comment", "type", "range" and "domain" entries.
dict_prop = {}

In [None]:
%%time
# Probe candidate property IDs in batches of `step` and create one record per
# existing property (keyed by its URI) holding its datatype. The scan stops at
# the first batch for which the endpoint returns no property at all.
cpt = 0
step = 250

while True:
    print(cpt)
    query = """
    
    SELECT DISTINCT ?property ?type
    WHERE {
      VALUES ?property {"""+generate_all_prop(cpt, step)+"""} 
      ?property wikibase:propertyType ?type.
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    if r.status_code != 200:
        # Request refused (e.g. 429): wait, then retry the same batch.
        time.sleep(60)
        continue

    cpt += 1
    data = r.json()
    bindings = data["results"]["bindings"]
    if not bindings:
        # An empty batch marks the end of the scan. A batch holding a single
        # property is still a valid result and must not end the scan.
        print("stop")
        break

    for line in bindings:
        dict_prop[line["property"]["value"]] = {"context":context, "label":set(), "comment":set(), "type":line["type"]["value"], "range":set(), "domain":set()}
    # Pause between successful requests to stay under the endpoint's rate limit.
    time.sleep(15)
    

0
<Response [200]>
1
<Response [200]>
2
<Response [200]>
3
<Response [200]>
4
<Response [200]>
5
<Response [200]>
6
<Response [429]>
6
<Response [200]>
7
<Response [200]>
8
<Response [200]>
9
<Response [200]>
10
<Response [200]>


## Now that we know which properties exist, we can query only those, using the variable props_possible.

In [None]:
# Keep only the last path segment of every property URI collected in dict_prop.
# Dictionary keys are already unique, so no NumPy/pandas round-trip is needed.
# Sorting by (length, text) makes the batch order identical on every run
# instead of depending on set iteration order.
props_possible = sorted(
    {uri.split("/")[-1] for uri in dict_prop},
    key=lambda prop_id: (len(prop_id), prop_id),
)

## Label

In [None]:
%%time
# Fetch the labels of the known properties, one request per batch of `step`
# property IDs, and store each (label, language) pair in the property record.
# NOTE(review): LIMIT 10000 caps the rows of the whole batch; with 250
# properties and one row per language the answer may be truncated - confirm,
# and lower `step` or drop the LIMIT if it is.
step = 250
i=0
while i < len(props_possible): 
    print(i)
    query = """
    
    SELECT DISTINCT ?property ?label
    WHERE {
      VALUES ?property { wd:"""+" wd:".join(props_possible[i:i+step])+""" } 
      ?property rdfs:label ?label.
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    if r.status_code == 200:
        # Only move on to the next batch once this one has been answered.
        i+=step
        data = r.json()

        # Keep every binding, including when the batch yields a single one.
        for line in data["results"]["bindings"]:
            dict_prop[line["property"]["value"]]["label"].add((line["label"]["value"], line["label"]["xml:lang"]))
    else:
        # Request refused (e.g. rate limited): extra wait before retrying the same batch.
        time.sleep(60)
    
    time.sleep(15)

## Comment

In [None]:
%%time
# Fetch the descriptions of the known properties, one request per batch of
# `step` property IDs, and store each (description, language) pair under the
# "comment" entry of the property record.
# NOTE(review): LIMIT 10000 caps the rows of the whole batch; with 250
# properties and one row per language the answer may be truncated - confirm,
# and lower `step` or drop the LIMIT if it is.
step = 250
i=0
while i < len(props_possible):
    print(i)
    query = """
    
    SELECT DISTINCT ?property ?description
    WHERE {
      VALUES ?property { wd:"""+" wd:".join(props_possible[i:i+step])+""" } 
      ?property schema:description ?description.
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    if r.status_code == 200:
        # Only move on to the next batch once this one has been answered.
        i+=step
        data = r.json()

        # Keep every binding, including when the batch yields a single one.
        for line in data["results"]["bindings"]:
            dict_prop[line["property"]["value"]]["comment"].add((line["description"]["value"], line["description"]["xml:lang"]))
    else:
        # Request refused (e.g. rate limited): extra wait before retrying the same batch.
        time.sleep(60)
    time.sleep(15)

## Range

In [None]:
# URIs of every class met as a range or domain value; the "Classes" part of the notebook retrieves their data.
classes_seen = set()

In [None]:
# Fetch the range of the known properties, one request per batch of `step`
# property IDs. The range is read from the pq:P2308 qualifier of the property's
# P2302 statements whose value is wd:Q21510865. Every range is stored in the
# property record and remembered in classes_seen.
url = 'https://query.wikidata.org/sparql'
step = 250
i=0
while i < len(props_possible):
    print(i)
    query = """
    
    SELECT DISTINCT ?property ?range
    WHERE {
      VALUES ?property { wd:"""+" wd:".join(props_possible[i:i+step])+""" } 
      ?property p:P2302 ?statement.
      ?statement ps:P2302 wd:Q21510865.
      ?statement pq:P2308 ?range
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    if r.status_code == 200:
        # Only move on to the next batch once this one has been answered.
        i+=step
        data = r.json()
        bindings = data["results"]["bindings"]
        if bindings:
            # A batch with a single binding is a valid result and is kept.
            for line in bindings:
                dict_prop[line["property"]["value"]]["range"].add(line["range"]["value"])
                classes_seen.add(line["range"]["value"])
        else:
            print("Got nothing")
    else:
        # Request refused (e.g. rate limited): extra wait before retrying the same batch.
        time.sleep(60)
        
    time.sleep(15)

## Domain

In [None]:
# Fetch the domain of the known properties, one request per batch of `step`
# property IDs. The domain is read from the pq:P2308 qualifier of the
# property's P2302 statements whose value is wd:Q21503250. Every domain is
# stored in the property record and remembered in classes_seen.
url = 'https://query.wikidata.org/sparql'
step = 250
i=0
while i < len(props_possible):
    print(i)
    query = """
    
    SELECT DISTINCT ?property ?domain
    WHERE {
      VALUES ?property { wd:"""+" wd:".join(props_possible[i:i+step])+""" } 
      ?property p:P2302 ?v.
      ?v ps:P2302 wd:Q21503250.
      ?v pq:P2308 ?domain
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    
    if r.status_code == 200:
        # Only move on to the next batch once this one has been answered.
        i+=step
        data = r.json()
        bindings = data["results"]["bindings"]
        if bindings:
            # A batch with a single binding is a valid result and is kept.
            for line in bindings:
                dict_prop[line["property"]["value"]]["domain"].add(line["domain"]["value"])
                classes_seen.add(line["domain"]["value"])
        else:
            print("Got nothing")
    else:
        # Request refused (e.g. rate limited): extra wait before retrying the same batch.
        time.sleep(60)
        
    time.sleep(15)

## We write the data.

In [None]:
from write_data import *

# Dump every property record to Properties.nt. The context manager closes the
# file even if write_property raises part-way through.
with open("Properties.nt", "w", encoding="utf-8") as f:
    write_property(f, dict_prop)

# Classes

## We first retrieve the property data so we can launch the second part without the first and obtain the classes that we need to retrieve from the Domain and Range.

In [None]:
def full_to_short(list_classes, short):
    """Turn full URIs into prefixed names ready to be pasted in a VALUES clause.

    Only the text after the last "/" of each URI is kept and ``short``
    (e.g. "wd:") is put in front of it. Every resulting name is surrounded
    by one space on each side; an empty list gives an empty string.
    """
    pieces = [" " + short + uri.split("/")[-1] + " " for uri in list_classes]
    return "".join(pieces)

## Label and Description

In [None]:
# Fetch the labels of every class collected in classes_seen, one request per
# batch of `step` classes. A record is created in data_class the first time a
# class is met, then each (label, language) pair is added to it.
url = 'https://query.wikidata.org/sparql'
step = 250

data_class = {}

list_classes_to_retrieve = list(classes_seen)
i=0
while i < len(list_classes_to_retrieve):
    print(i)
    query = """
    
    SELECT DISTINCT ?class ?label
    WHERE {
      VALUES ?class { """+full_to_short(list_classes_to_retrieve[i:i+step], "wd:")+""" } 
      ?class rdfs:label ?label.
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    
    if r.status_code == 200:
        # Only move on to the next batch once this one has been answered.
        i+=step
        data = r.json()
        bindings = data["results"]["bindings"]
        if bindings:
            # A batch with a single binding is a valid result and is kept.
            for line in bindings:
                class_record = data_class.setdefault(
                    line["class"]["value"],
                    {"context":context, "label":set(), "comment":set()},
                )
                class_record["label"].add((line["label"]["value"], line["label"]["xml:lang"]))
        else:
            print("Got nothing")
    else:
        # Request refused (e.g. rate limited): extra wait before retrying the same batch.
        time.sleep(60)
        
    time.sleep(15)

In [None]:
# Fetch the descriptions of the classes in list_classes_to_retrieve, one
# request per batch of `step` classes. A description is only stored for a class
# that already has a record in data_class; other classes are ignored.
step = 250
i=0
while i < len(list_classes_to_retrieve):
    print(i)
    query = """
    
    SELECT DISTINCT ?class ?comment
    WHERE {
      VALUES ?class { """+full_to_short(list_classes_to_retrieve[i:i+step], "wd:")+""" } 
      ?class schema:description ?comment.
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    
    if r.status_code == 200:
        # Only move on to the next batch once this one has been answered.
        i+=step
        data = r.json()
        bindings = data["results"]["bindings"]
        if bindings:
            # A batch with a single binding is a valid result and is kept.
            for line in bindings:
                if line["class"]["value"] in data_class:
                    data_class[line["class"]["value"]]["comment"].add((line["comment"]["value"], line["comment"]["xml:lang"]))
        else:
            print("Got nothing")
    else:
        # Request refused (e.g. rate limited): extra wait before retrying the same batch.
        time.sleep(60)
        
    time.sleep(15)

## Write the Class Data

In [None]:
def _clean_literal(text):
    """Return ``text`` as a string that can sit between double quotes on one output line."""
    # Local import: no explicit `import re` is visible in this notebook.
    import re

    # Whitespace characters (newlines and tabs included), double quotes and
    # backslashes are each replaced by a plain space.
    cleaned = re.sub(r"\s", " ", str(text))
    return cleaned.replace('"', ' ').replace('\\', ' ')


def write_class(f, dictionary):
    """Write one block of triples per class to ``f``.

    Args:
        f: object with a ``write(str)`` method, e.g. a file opened in text mode.
        dictionary: maps a class URI to a record with the keys "context"
            (a URI string), "label" and "comment" (iterables of
            ``(text, language)`` pairs).

    For every class the label lines come first, then the origin line, the
    type line and finally the description lines.
    """
    # NOTE(review): the standard RDFS terms are rdfs:Class and rdfs:comment;
    # the IRIs written below are kept exactly as they were because consumers
    # of the file may rely on them - confirm before changing.
    for uri, prop_data in dictionary.items():
        key = "<"+uri+">"

        for label, lang in prop_data["label"]:
            label_to_write = _clean_literal(label)
            f.write(f'{key} <http://www.w3.org/2000/01/rdf-schema#label> "{label_to_write}"@{lang}.\n')

        f.write(f'{key} <http://graph/origin> <{prop_data["context"]}>.\n')

        f.write(f'{key} <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/1999/02/22-rdf-syntax-ns#Class>.\n')

        for comment, lang in prop_data["comment"]:
            comment_to_write = _clean_literal(comment)
            f.write(f'{key} <http://www.w3.org/2000/01/rdf-schema#description> "{comment_to_write}"@{lang}.\n')


In [None]:
# Dump every class record to Classes.nt. The context manager closes the file
# even if write_class raises part-way through.
with open("Classes.nt", "w", encoding="utf-8") as f:
    write_class(f, data_class)