In [1]:
import nltk
from wikidata.client import Client
from wikidata.entity import Entity
from collections import defaultdict, deque
import requests, json
import pandas as pd
import numpy as np
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

MAX_LEVEL = 2

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\carte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\carte\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
text = "Home of Mark Zuckerberg is great!"
def get_nouns(sentence):
    tokens = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(tokens)
    retVal = []
    i = 0
    while i < len(tags):
        key, tag = tags[i]
        if 'NN' in tag:
            #current_noun = key
            #j = i + 1
            #while j < len(tags):
            #    key_j, tag_j = tags[j]
            #    if 'NN' in tag_j:
            #        current_noun = ' '.join([current_noun, key_j])
            #        j += 1
            #    else:
            #        i = j - 1
            #        break
            #retVal.append(current_noun)
            #if j == len(tags):
            #    return retVal
            retVal.append(key)
        i += 1
    return retVal
    
print(get_nouns(text))

['Home', 'Mark', 'Zuckerberg']


In [3]:
def get_wikidata_id(item):
    try:
        response = requests.get(f'https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles={item}&format=json')
        wikidata_id = list(json.loads(response.text)['query']['pages'].values())[0]['pageprops']['wikibase_item']
        return wikidata_id
    except:
        return None


In [28]:
def bfs(noun):
    client = Client()
    id = get_wikidata_id(noun)
    e = client.get(id)
    seen = set()
    q = deque([(e, 0)])
    retDict = defaultdict(list)
    while q:
        ent, level = q.popleft()
        if ent in seen or level > MAX_LEVEL:
            continue
        retDict[level].append(str(ent.label))
        seen.add(ent)
        try:
            e = client.get(ent.id).values()
            entities = []
            for ent in list(e)[:min(len(e), 20)]:
                if isinstance(ent, Entity):
                    entities.append(ent)
            for entity in entities:
                q.append((entity, level + 1))
        except:
            continue
    return retDict

In [36]:
x = bfs('Harvard University')
x

defaultdict(list,
            {0: ['Harvard University'],
             1: ['United States of America',
              'private university',
              'Category:Harvard University'],
             2: ['university',
              'Category:Private universities and colleges',
              'public university',
              'Wikimedia category',
              'human']})

In [6]:
embeddings_np = np.load(r"D:\KG\wiki_trans_v1_vec.npy")

In [7]:
data_train = pd.read_csv(r'C:\Users\carte\OneDrive\Desktop\Datascience\SDS\KGA_Bert\data\glue_data\SST-2\train.tsv', sep='\t', header=0)
nouns = set()
for sentence in data_train['sentence']:
    for noun in get_nouns(sentence):
        nouns.add(noun)

nouns

{'amusing',
 'all-out',
 'mixed-up',
 'enthusiasms',
 'homophobia',
 'tank',
 'artless',
 'uninteresting',
 'transcend',
 'advert',
 'rapt',
 'be-all-end-all',
 'falcon',
 'insinuating',
 'companionship',
 'tendency',
 'brother',
 'perch',
 'mcdormand',
 'little',
 'cortez',
 'ripe',
 'alexander',
 'network',
 'toys',
 'pretention',
 'nonconformity',
 'potato',
 'maturity',
 'verisimilitude',
 'rusi',
 'jell',
 'delight',
 'zap',
 'roll',
 'schizophrenia',
 'beautifully',
 'nijinsky',
 'blip',
 'acknowledges',
 'ham',
 'diatribes',
 'melancholia',
 'bait-and-switch',
 'great-grandson',
 'smacks',
 'cheapen',
 'swear',
 'oddballs',
 'thank',
 'blasting',
 'lightweight',
 'werewolf',
 'clams',
 'dynamism',
 'breezy',
 'limits',
 'woe',
 'shearer',
 'segment',
 'cockettes',
 'form',
 'steers',
 'jr.',
 'maudlin',
 'ritter',
 'leplouff',
 'plod',
 'ratliff',
 'plutonium',
 'impossibly',
 'pompeo',
 'aids',
 'mcgrath',
 'chekhov',
 'hodgepodge',
 'bumbling',
 'fees',
 'strutting',
 'experie

In [8]:
def get_training_embeddings(labels_file):

    def __clean_str__(l : str):
        if '@en' in l:
            l = l.replace('@en', '')
        l = l.replace('"', '')
        return l.lower()
    
    retVal = {}
    with open(labels_file) as labels:
        for line in labels:
            curr_label, line_num = line.split('\t')
            line_num = int(line_num)
            curr_label = __clean_str__(curr_label)
            retVal[curr_label] = embeddings_np[line_num]
    return retVal

In [9]:
label_to_embedding = get_training_embeddings(r'C:\Users\carte\OneDrive\Desktop\Datascience\SDS\KGA_Bert\data\KG_data\english_labels.tsv')

In [34]:
nouns_with_embeddings = {word for word in nouns if word in label_to_embedding}
nouns_with_embeddings, len(nouns_with_embeddings)

({'homophobia',
  'tank',
  'artless',
  'transcend',
  'rapt',
  'falcon',
  'brother',
  'perch',
  'little',
  'cortez',
  'ripe',
  'alexander',
  'network',
  'toys',
  'nonconformity',
  'potato',
  'maturity',
  'verisimilitude',
  'rusi',
  'delight',
  'zap',
  'roll',
  'schizophrenia',
  'nijinsky',
  'blip',
  'ham',
  'melancholia',
  'swear',
  'oddballs',
  'lightweight',
  'werewolf',
  'breezy',
  'limits',
  'woe',
  'shearer',
  'segment',
  'form',
  'steers',
  'jr.',
  'maudlin',
  'ritter',
  'plod',
  'plutonium',
  'ratliff',
  'pompeo',
  'aids',
  'mcgrath',
  'chekhov',
  'fees',
  'experience',
  'pray',
  'video',
  'alien',
  'definition',
  'cooler',
  'arteta',
  'clout',
  'plug',
  'derive',
  'cocktail',
  'criterion',
  'credit',
  'low',
  'conquer',
  'place',
  'track',
  'twice',
  'mechanical',
  'lazier',
  'warn',
  'gentle',
  'experiences',
  'rowdy',
  'trial',
  'asparagus',
  'plympton',
  'kurys',
  'heights',
  'quadrangle',
  'england

In [50]:
embeddings = pd.DataFrame(columns=['1st', '2nd', '3rd'])
for noun in list(nouns_with_embeddings)[:1]:
    entities = bfs(noun.title())
    df2 = {'1st': entities[0], '2nd': entities[1], '3rd': entities[2]}
    print(pd.DataFrame(df2))
    #embeddings = pd.concat([embeddings, pd.DataFrame(df2)], ignore_index = True)

ValueError: All arrays must be of the same length

In [41]:
embeddings

Unnamed: 0,1st,2nd,3rd
