In [9]:
import nltk
from wikidata.client import Client
from wikidata.entity import Entity
from collections import defaultdict, deque
import requests, json
import pandas as pd
# Tokenizer models needed by nltk.word_tokenize (used in get_nouns).
nltk.download('punkt')
# Tagger model needed by nltk.pos_tag (also used in get_nouns); without it a
# fresh environment fails with LookupError on the first pos_tag call.
nltk.download('averaged_perceptron_tagger')

# Deepest level bfs() records; queue entries beyond this level are discarded.
MAX_LEVEL = 2

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\carte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Sample sentence passed to get_nouns in this cell.
text = "Home of Mark Zuckerberg is great!"
def get_nouns(sentence):
    """Extract noun phrases from ``sentence``.

    The sentence is tokenized and POS-tagged with NLTK. Every maximal run of
    consecutive tokens whose tag contains ``'NN'`` is joined with single
    spaces into one phrase.

    Returns the phrases as a list of strings, in the order they occur.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

    phrases = []
    run = []  # words of the noun run currently being collected
    for word, pos in tagged:
        if 'NN' in pos:
            run.append(word)
            continue
        # A non-noun token closes any run that is in progress.
        if run:
            phrases.append(' '.join(run))
            run = []

    # The sentence may end while a run is still open.
    if run:
        phrases.append(' '.join(run))
    return phrases
    
# Show the noun phrases get_nouns extracts from the sample sentence.
print(get_nouns(text))

['Home', 'Mark Zuckerberg']


In [11]:
def get_wikidata_id(item):
    """Look up the Wikidata item id attached to an English Wikipedia page.

    Parameters
    ----------
    item : str
        Page title to query. It is passed through ``params`` so that
        characters such as ``&``, ``#`` or spaces are URL-encoded instead of
        corrupting the query string.

    Returns
    -------
    str or None
        The ``wikibase_item`` page property of the first page in the
        response, or ``None`` when the request fails or the response does
        not contain that property.
    """
    try:
        response = requests.get(
            'https://en.wikipedia.org/w/api.php',
            params={
                'action': 'query',
                'prop': 'pageprops',
                'titles': item,
                'format': 'json',
            },
            timeout=10,  # do not let a stalled connection hang the notebook
        )
        response.raise_for_status()
        pages = json.loads(response.text)['query']['pages']
        return list(pages.values())[0]['pageprops']['wikibase_item']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: network errors, invalid JSON and pages without a
        # linked item all mean "no id". KeyboardInterrupt is not swallowed.
        return None


In [12]:
def bfs(id):
    """Breadth-first walk over entities reachable from a starting node.

    Parameters
    ----------
    id : str or Entity
        Starting node: an entity id string, or an ``Entity``. (The name
        shadows the built-in ``id``; it is kept so existing callers work.)

    Returns
    -------
    defaultdict(list)
        Maps level -> nodes first reached at that level, in visit order, for
        levels ``0..MAX_LEVEL``. The start node is stored exactly as passed
        in; deeper nodes are the ``Entity`` objects found while expanding.

    Notes
    -----
    * Nodes are de-duplicated by entity id, so an ``Entity`` that points back
      to a start node given as a string is not recorded a second time.
    * Nodes at ``MAX_LEVEL`` are recorded but not fetched, because their
      children would be discarded anyway.
    * A node whose data cannot be fetched is still recorded; it just
      contributes no children.
    """
    client = Client()
    seen_ids = set()
    queue = deque([(id, 0)])
    retDict = defaultdict(list)

    while queue:
        node, level = queue.popleft()
        # Normalise to the id string so str and Entity forms compare equal.
        node_id = node if isinstance(node, str) else getattr(node, 'id', None)
        if node_id in seen_ids or level > MAX_LEVEL:
            continue
        seen_ids.add(node_id)
        retDict[level].append(node)

        # No point fetching a node whose children can never be recorded, or
        # one that has no usable id (e.g. a failed lookup passed in as None).
        if level >= MAX_LEVEL or node_id is None:
            continue

        try:
            # Only the first 20 values of the fetched entity are considered.
            values = list(client.get(node_id).values())[:20]
        except Exception:
            # Best-effort: skip entities that fail to load or decode.
            continue

        queue.extend(
            (value, level + 1) for value in values if isinstance(value, Entity)
        )
    return retDict

In [13]:
# NOTE(review): the module-level name `id` shadows Python's built-in id().
# get_wikidata_id returns None when the lookup fails; bfs then receives None.
id = get_wikidata_id('Mark Zuckerberg')
x = bfs(id)
x

defaultdict(list,
            {0: ['Q36215'],
             1: [<wikidata.entity.Entity Q9192 'Mandarin'>,
              <wikidata.entity.Entity Q6581097 'male'>,
              <wikidata.entity.Entity Q462177 'White Plains'>,
              <wikidata.entity.Entity Q5482740 'programmer'>,
              <wikidata.entity.Entity Q13371 'Harvard University'>,
              <wikidata.entity.Entity Q17290934 'Lentapedia'>,
              <wikidata.entity.Entity Q13610143 'Mark'>,
              <wikidata.entity.Entity Q83364 'vegetarianism'>],
             2: [<wikidata.entity.Entity Q34770 'language'>,
              <wikidata.entity.Entity Q6205368 'Category:Mandarin Chinese'>,
              <wikidata.entity.Entity Q7850 'Chinese'>,
              <wikidata.entity.Entity Q8201 'Chinese characters'>,
              <wikidata.entity.Entity Q651641 'subject–verb–object'>,
              <wikidata.entity.Entity Q10948482 'Mandarin'>,
              <wikidata.entity.Entity Q12308941 'male given name'>,
 

In [14]:
def get_training_embeddings(labels_file, embeddings_file):
    """Map labels to the embedding line they point at.

    Parameters
    ----------
    labels_file : str or path-like
        Text file with one ``<label>\\t<line number>`` record per line. Any
        ``@en`` in the label is removed. Line numbers are 0-based indexes
        into ``embeddings_file``.
    embeddings_file : str or path-like
        Text file with one embedding per line.

    Returns
    -------
    dict
        ``label -> embedding line`` (surrounding whitespace stripped). When a
        label occurs more than once, the lowest-numbered embedding line wins.

    Notes
    -----
    * Malformed label records (wrong field count, non-integer line number)
      are skipped rather than ending the scan.
    * Several labels may share one line number, and the records need not be
      sorted by line number.
    * An empty labels file yields ``{}``.
    """
    # line number -> labels that refer to it, in file order
    labels_by_line = {}
    with open(labels_file) as labels:
        for record in labels:
            fields = record.rstrip('\n').split('\t')
            if len(fields) != 2:
                continue
            try:
                line_num = int(fields[1])
            except ValueError:
                continue
            label = fields[0].replace('@en', '')
            labels_by_line.setdefault(line_num, []).append(label)

    retVal = {}
    if not labels_by_line:
        return retVal

    last_needed = max(labels_by_line)
    with open(embeddings_file) as embeddings:
        for i, line in enumerate(embeddings):
            if i > last_needed:
                # Every referenced line has been read; skip the rest.
                break
            for label in labels_by_line.get(i, ()):
                retVal.setdefault(label, line.strip())
    return retVal

In [15]:
# NOTE(review): get_training_embeddings is defined with two required
# parameters (labels_file, embeddings_file); this call passes none and raises
# TypeError as written. Supply the two file paths.
label_to_embedding = get_training_embeddings()

In [None]:
# Empty DataFrame with columns '1st', '2nd', '3rd'.
# NOTE(review): presumably one column per bfs level (0..MAX_LEVEL) — confirm.
embeddings = pd.DataFrame(columns=['1st', '2nd', '3rd'])