In [1]:
%%capture
# Cell magic above captures this cell's output instead of displaying it.
# Install the rdflib and sparqlwrapper packages with pip.
! pip install rdflib sparqlwrapper
# Download the DBpedia and Wikidata walk files from the GitHub repository.
! wget https://www.github.com/AlexGerry/pyrdf2vec-for-graph-embeddings/blob/master/walks/dbpedia_walks_final.txt?raw=true
! wget https://www.github.com/AlexGerry/pyrdf2vec-for-graph-embeddings/blob/master/walks/wikidata_walks_final.txt?raw=true
# The downloads are expected under names ending in "?raw=true"; rename them to plain .txt names.
! mv dbpedia_walks_final.txt?raw=true dbpedia_walks_final.txt
! mv wikidata_walks_final.txt?raw=true wikidata_walks_final.txt

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
from multiprocessing.dummy import Pool as ThreadPool
from tqdm import tqdm
from multiprocessing import Process, Manager
from pprint import pprint
import json

In [3]:
# Load the DBpedia walks: one walk per line, entities separated by single spaces.
with open('dbpedia_walks_final.txt', 'rt') as f:
  # Drop the newline characters first, then split every line into its entities.
  corpus = (line.replace('\n', '') for line in f.readlines())
  dbpedia_walks = [line.split(' ') for line in corpus]

In [4]:
# Create a single multiprocessing Manager; the same instance is reused
# later in the notebook to create the label dictionary.
manager = Manager()
# Manager-backed dictionary mapping a DBpedia entity (key) to its
# Wikidata entity (value); it is filled and read by toWikidata.
dic_dbpedia_to_wikidata = manager.dict()

In [5]:
def toWikidata(entity: str):
    """Map a DBpedia entity to its Wikidata entity through an owl:sameAs link.

    Resolved entities are stored in ``dic_dbpedia_to_wikidata`` (DBpedia
    entity as key, Wikidata entity as value). If the entity is already in
    that dictionary the SPARQL query is skipped and the stored value is
    returned.

    Args:
        entity: name of the DBpedia resource, i.e. the part that follows
            ``http://dbpedia.org/resource/``.

    Returns:
        The ``?o`` value of the first binding returned by the DBpedia
        endpoint (an owl:sameAs target containing ``wikidata.org/entity``).

    Raises:
        IndexError: if the endpoint returns no Wikidata owl:sameAs link for
            ``entity``. The message names the offending entity.
    """
    # A single .get() replaces the "in" test followed by "[]": every access
    # to the shared dictionary is a separate call, so this halves the cost of
    # a cache hit. Stored values are strings, so None means "not cached".
    cached = dic_dbpedia_to_wikidata.get(entity)
    if cached is not None:
        return cached

    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(f"""
          SELECT ?o
          WHERE {{
              <http://dbpedia.org/resource/{entity}> owl:sameAs ?o .
              FILTER(CONTAINS(STR(?o), \"wikidata.org/entity\"))  
              }}
          """)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()['results']['bindings']
    if not bindings:
        # Same exception type as the bare "[0]" on an empty list, but with a
        # message that tells the user which entity could not be mapped.
        raise IndexError(
            f"no owl:sameAs Wikidata link found for DBpedia entity {entity!r}"
        )

    wikidata_entity = bindings[0]["o"]["value"]
    dic_dbpedia_to_wikidata[entity] = wikidata_entity
    # Return the local value instead of reading the dictionary back again.
    return wikidata_entity

In [6]:
def toWikidataWalk(walk: list):
    wikidata_walk = [toWikidata(link) for link in walk]
    return wikidata_walk

In [None]:
# !!cell not necessary, just download data from git with next cell
# (the recorded run of this cell took about two hours)

# Thread pool that converts every DBpedia walk into a Wikidata walk
# through the owl:sameAs links resolved by toWikidataWalk.
with ThreadPool(4) as pool:
  res = list(tqdm(pool.imap(toWikidataWalk, dbpedia_walks), total=len(dbpedia_walks)))

# Save the Wikidata walks to a text file, one walk per line, keeping only
# the entity id (the "http://www.wikidata.org/entity/" prefix is stripped).
resource = [[i.replace("http://www.wikidata.org/entity/", "") for i in j ] for j in res]
with open('wiki_walks_from_dbpedia.txt', 'wt') as f:
  for w in resource:
    f.write(' '.join(w) + '\n')

# Write the dictionary of "dbpedia entity" = "wikidata entity".
# The mapping lives in dic_dbpedia_to_wikidata; the previous name `dic` was
# never defined, so the cell failed with a NameError after the long run.
# .copy() gives a plain dict that json can serialise.
# json is already imported in the imports cell, so it is not re-imported here.
with open("dictionary_dbpedia_wikidata.json", "w") as outfile:
    json.dump(dic_dbpedia_to_wikidata.copy(), outfile)

100%|██████████| 2500/2500 [2:05:58<00:00,  3.02s/it]


In [7]:
%%capture
# The cell magic must be the first line of the cell: placed after a comment
# it is not recognised as a cell magic and the cell fails.
# Run ONLY if the cell above was not run.
# Download wiki_walks_from_dbpedia.txt and rename it to a plain .txt name.
! wget https://www.github.com/AlexGerry/pyrdf2vec-for-graph-embeddings/blob/master/walks/wiki_walks_from_dbpedia.txt?raw=true
! mv wiki_walks_from_dbpedia.txt?raw=true wiki_walks_from_dbpedia.txt

In [8]:
# compass.txt: concatenation of both walk files, used to build the dictionary
# that maps each Wikidata entity to its label.
# ">" (overwrite) instead of ">>" (append), so re-running this cell does not
# keep growing compass.txt with duplicated walks.
!cat wiki_walks_from_dbpedia.txt wikidata_walks_final.txt > compass.txt

In [9]:
# Read compass.txt and collect every distinct entity that appears in any walk.
with open('compass.txt', 'rt') as f:
  corpus = (line.replace('\n', '') for line in f.readlines())
  walks = (line.split(' ') for line in corpus)
  # The set comprehension flattens the walks and removes duplicates in one step.
  entities = {entity for walk in walks for entity in walk}

# Manager-backed dictionary for the labels.
dic_wikidata_labels = manager.dict()

In [10]:
def toWikidateLabel(entity: str):
    """Fetch the English label of a Wikidata entity.

    Queries https://query.wikidata.org/sparql for one rdfs:label of
    ``wd:<entity>`` whose language matches "EN".

    Returns:
        A ``(entity, label)`` tuple; the label is '*NOLABEL*' when the
        query returns no binding.
    """
    query = f"""
        SELECT ?l WHERE {{
          wd:{entity} rdfs:label ?l.
          FILTER(LANGMATCHES(LANG(?l), "EN"))
          }}
        LIMIT 1
        """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()
    found = [binding["l"]["value"] for binding in response['results']['bindings']]
    # First label if there is one, otherwise the placeholder.
    return entity, next(iter(found), '*NOLABEL*')

In [11]:
# Resolve the label of every Wikidata entity with a pool of 4 threads and
# collect the (entity, label) pairs into a dictionary.
with ThreadPool(4) as pool:
  labelled_pairs = tqdm(pool.imap(toWikidateLabel, entities), total=len(entities))
  dic_wikidata_labels = dict(labelled_pairs)

# Persist the entity -> label dictionary as JSON.
with open('wikidata_label_dictionary.json', 'wt') as f:
  json.dump(dic_wikidata_labels, f)

 14%|█▎        | 3627/26520 [01:52<11:48, 32.30it/s]


KeyboardInterrupt: ignored