In [1]:
import requests
import time
from SPARQLWrapper import SPARQLWrapper
import numpy as np
import pandas as pd 

### The goal of this notebook is to retrieve property data from Wikidata. Since we do not have direct access to a dump, we go through every possible property ID (until a query returns no results) to retrieve their labels, descriptions, ranges, domains and types.

### However, it should not be relaunched, as it can take multiple hours to retrieve all the data.

In [2]:
def generate_all_prop(cpt, step):
    """Build the space-separated list of candidate property IDs for batch `cpt`.

    The batch covers the numeric IDs from cpt*step (inclusive) to
    (cpt+1)*step (exclusive), each written as "wd:P<number>", so the result
    can be dropped directly into a SPARQL VALUES clause.
    """
    first_id = cpt * step
    numbers = map(str, range(first_id, first_id + step))
    return "wd:P" + " wd:P".join(numbers)

# Properties

## Label, description and type

In [3]:
# Crawl candidate property IDs in batches of `step` and collect, for every
# property that exists, its English label, its English description (empty
# string when there is none) and its wikibase datatype.
url = 'https://query.wikidata.org/sparql'
res = []    # rows of [property URI, label, description, type]
cpt = 0     # index of the current batch of candidate IDs
step = 250  # number of candidate IDs sent per request

while True:
    print(cpt)
    # The language filter on the description lives inside the OPTIONAL so that
    # properties without an English description are still returned; a filter
    # placed outside would reject every row whose ?description is unbound.
    query = """
    
    SELECT DISTINCT ?property ?label ?description ?type
    WHERE {
      VALUES ?property {"""+generate_all_prop(cpt, step)+"""} 
      ?property rdfs:label ?label.
      FILTER(LANG(?label) = "en")
      OPTIONAL{?property schema:description ?description.
          FILTER(LANG(?description) = "en")}
      ?property wikibase:propertyType ?type.
    }  LIMIT 10000"""
    
    cpt+=1
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    # Fail with an explicit HTTP error (e.g. rate limiting) instead of an
    # opaque JSON decoding error on the next line.
    r.raise_for_status()
    data = r.json()
    bindings = data["results"]["bindings"]
    if not bindings:
        # The crawl stops at the first batch that returns no property at all,
        # i.e. it assumes there is never a gap of `step` consecutive unused IDs.
        print("stop")
        break
    # A batch holding a single property is still a valid batch and is kept.
    for line in bindings:
        prop = line["property"]["value"]
        label = line["label"]["value"]
        description = ""
        if "description" in line:
            description = line["description"]["value"]
        typ =  line["type"]["value"]
        res.append([prop, label, description, typ])
    # Pause between requests to stay gentle with the public endpoint.
    time.sleep(15)

0
<Response [200]>
1
<Response [200]>
2
<Response [200]>
3
<Response [200]>
4
<Response [200]>
5
<Response [200]>
6
<Response [200]>
7
<Response [200]>
8
<Response [200]>
9
<Response [200]>
10
<Response [200]>
11
<Response [200]>
12
<Response [200]>
13
<Response [200]>
14
<Response [200]>
15
<Response [200]>
16
<Response [200]>
17
<Response [200]>
18
<Response [200]>


KeyboardInterrupt: 

## Now that we know which properties exist, we can query only those, using the variable props_possible.

In [None]:
# Reduce the collected rows to the distinct property identifiers: take the
# first column (the property URI), drop duplicates, and keep only the last
# path segment of each URI.
res_array = np.array(res)
unique_uris = set(res_array[:, 0])
props_possible = [uri.split("/")[-1] for uri in unique_uris]

## Range

In [None]:
# For every known property, fetch the classes listed under the P2308 qualifier
# of its P2302 statements whose value is wd:Q21510865; these are stored as the
# property's "range".
url = 'https://query.wikidata.org/sparql'
res_range = []  # rows of [property URI, range class URI]
step = 250      # number of properties sent per request

for i in range(0,len(props_possible), step):
    print(i)
    query = """
    
    SELECT DISTINCT ?property ?range
    WHERE {
      VALUES ?property { wd:"""+" wd:".join(props_possible[i:i+step])+""" } 
      ?property p:P2302 ?statement.
      ?statement ps:P2302 wd:Q21510865.
      ?statement pq:P2308 ?range
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    # Fail with an explicit HTTP error (e.g. rate limiting) instead of an
    # opaque JSON decoding error on the next line.
    r.raise_for_status()
    data = r.json()
    bindings = data["results"]["bindings"]
    if not bindings:
        print("Got nothing")
    # A batch with a single result row is valid data and must not be dropped.
    for line in bindings:
        prop = line["property"]["value"]
        rang = line["range"]["value"]
        res_range.append([prop, rang])
    # Pause between requests to stay gentle with the public endpoint.
    time.sleep(15)

In [None]:
# Display the collected [property URI, range class URI] pairs for inspection.
res_range

## Domain

In [None]:
# For every known property, fetch the classes listed under the P2308 qualifier
# of its P2302 statements whose value is wd:Q21503250; these are stored as the
# property's "domain".
url = 'https://query.wikidata.org/sparql'
res_domain = []  # rows of [property URI, domain class URI]
step = 250       # number of properties sent per request

for i in range(0,len(props_possible), step):
    print(i)
    query = """
    
    SELECT DISTINCT ?property ?domain
    WHERE {
      VALUES ?property { wd:"""+" wd:".join(props_possible[i:i+step])+""" } 
      ?property p:P2302 ?v.
      ?v ps:P2302 wd:Q21503250.
      ?v pq:P2308 ?domain
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    # Fail with an explicit HTTP error (e.g. rate limiting) instead of an
    # opaque JSON decoding error on the next line.
    r.raise_for_status()
    data = r.json()
    bindings = data["results"]["bindings"]
    if not bindings:
        print("Got nothing")
    # A batch with a single result row is valid data and must not be dropped.
    for line in bindings:
        prop = line["property"]["value"]
        domain = line["domain"]["value"]
        res_domain.append([prop, domain])
    # Pause between requests to stay gentle with the public endpoint.
    time.sleep(15)

## We merge the data into a single dictionary

In [None]:
# Map each property URI to a pair of lists: (range classes, domain classes).
# A property seen in only one of the two result sets keeps an empty list for
# the other side.
dic_prop = {}

for range_row in res_range:
    prop_uri, range_uri = range_row[0], range_row[1]
    dic_prop.setdefault(prop_uri, ([], []))[0].append(range_uri)

for domain_row in res_domain:
    prop_uri, domain_uri = domain_row[0], domain_row[1]
    dic_prop.setdefault(prop_uri, ([], []))[1].append(domain_uri)

## We write the data.

In [None]:
# Write one tab-separated line per (property, URI prefix) pair with the columns:
#   prefixed property URI, label, description, type,
#   comma-separated ranges, comma-separated domains.
prefixes = ["http://www.wikidata.org/prop/", "http://www.wikidata.org/prop/direct/", "http://www.wikidata.org/prop/direct-normalized/", "http://www.wikidata.org/prop/statement/", "http://www.wikidata.org/prop/statement/value/", "http://www.wikidata.org/prop/statement/value-normalized/"]

# The context manager guarantees the file is closed even if a write fails.
with open("wikidata_P_describe.txt", "w", encoding="utf-8") as f:
    for line in res:
        prop_uri = line[0]
        prop_id = prop_uri.split("/")[-1]
        # Backslashes and double quotes are removed from the text fields.
        label, description, typ = (field.replace("\\","").replace('"',"") for field in line[1:4])
        if prop_uri in dic_prop:
            ranges, domains = dic_prop[prop_uri]
            constraints = ",".join(ranges)+"\t"+",".join(domains)
        else:
            # Placeholder columns for properties without any constraint.
            constraints = " \t "
        # Everything except the prefix is identical for the six variants, so
        # it is computed once per property rather than once per prefix.
        row_tail = "\t"+label+"\t"+description+"\t"+typ+"\t"+constraints+"\n"
        for prefix in prefixes:
            f.write(prefix+prop_id+row_tail)

# Classes

## We first reload the property data from disk, so that this second part can be run without re-running the first, and collect the classes referenced in the Domain and Range columns.

In [35]:
# Reload the property file and gather every class URI referenced as a range or
# a domain. The file stores the comma-separated ranges in the fifth column and
# the domains in the sixth, so the column names follow that order.
df = pd.read_csv("wikidata_P_describe.txt", sep="\t", header=None, names=["prop", "label", "description", "wikibase", "range", "domain"])

classes_to_retrieve = set()
df = df.replace(np.nan,"")
for column in ("domain", "range"):
    for cell in df[column]:
        # Strip whitespace so the " " placeholder written for properties
        # without constraints is not mistaken for a class URI.
        classes_to_retrieve.update(uri.strip() for uri in str(cell).split(","))

# discard() instead of remove(): there may be no empty entry to delete.
classes_to_retrieve.discard("")

In [51]:
def full_to_short(list_classes, short):
    """Turn full URIs into prefixed names usable in a SPARQL VALUES clause.

    Each URI is reduced to its last path segment and prepended with `short`
    (e.g. "wd:"); every resulting name is surrounded by one space on each
    side and the pieces are concatenated in input order.
    """
    return "".join(
        " " + short + uri.split("/")[-1] + " "
        for uri in list_classes
    )

## Label and Description

In [63]:
# Fetch the English label and English description (empty string when there is
# none) of every class referenced as a range or a domain.
url = 'https://query.wikidata.org/sparql'
res_classes = []  # rows of [class URI, label, description]
step = 250        # number of classes sent per request

list_classes_to_retrieve = list(classes_to_retrieve)

for i in range(0,len(list_classes_to_retrieve), step):
    print(i)
    query = """
    
    SELECT DISTINCT ?class ?label ?description
    WHERE {
      VALUES ?class { """+full_to_short(list_classes_to_retrieve[i:i+step], "wd:")+""" } 
      ?class rdfs:label ?label.
      OPTIONAL{?class schema:description ?description.
          FILTER(LANG(?description) = "en")}
      FILTER(LANG(?label) = "en")
    }  LIMIT 10000"""
    
    r = requests.get(url, params = {'format': 'json', 'query': query})
    print(r)
    # Fail with an explicit HTTP error (e.g. rate limiting) instead of an
    # opaque JSON decoding error on the next line.
    r.raise_for_status()
    data = r.json()
    bindings = data["results"]["bindings"]
    if not bindings:
        print("Got nothing")
    # A batch with a single result row is valid data and must not be dropped.
    for line in bindings:
        prop = line["class"]["value"]
        label = line["label"]["value"]
        description = ""
        if "description" in line:
            description = line["description"]["value"]
        res_classes.append([prop, label, description])
    # Pause between requests to stay gentle with the public endpoint.
    time.sleep(15)

0
<Response [200]>
250
<Response [200]>
500
<Response [200]>
750
<Response [200]>
1000
<Response [200]>
1250
<Response [200]>
1500
<Response [200]>
1750
<Response [200]>
2000
<Response [200]>
2250
<Response [200]>
2500
<Response [200]>
2750
<Response [200]>
3000
<Response [200]>
3250
<Response [200]>
3500
<Response [200]>


## Write the final Data

In [64]:
# Write one tab-separated line per class: URI, label, description.
# The context manager guarantees the file is closed even if a write fails.
with open("wikidata_C_describe.txt", "w", encoding="utf-8") as f:
    for line in res_classes:
        f.write("\t".join(line)+"\n")