In [1]:
import requests
import numpy as np
import base64
import os
import json
from tqdm import tqdm
import joblib
def vector_decode(s, dtype=np.float32):
    buffer = base64.b64decode(s)
    v = np.frombuffer(buffer, dtype=dtype)
    return v



In [2]:
basePath = "./Datasets/zeshel/documentsTest/"

def load_corpuses():
    corpuses = os.listdir(basePath)
    corpusesDict = {}
    for corpus in corpuses:
        data = []
        with open(basePath + corpus, "r", encoding="utf-8") as file:
            for line in file:
                data.append(json.loads(line))
        corpusesDict[corpus.replace(".json", "")] = data
    return corpusesDict

In [3]:
corpuses = load_corpuses()

In [4]:
entities = []
for key in corpuses.keys():
    currentCorpus = corpuses[key]
    for corpusItem in currentCorpus:
        entities.append({"title": corpusItem["title"], "descr": corpusItem["text"]})

In [5]:
len(entities)

70140

In [8]:
from requests.auth import HTTPBasicAuth

auth = HTTPBasicAuth("DS2023", "eexeegheichai3OhChi5AhcheecaaShe")

In [2]:
mentions = [
    {"context_left": "oggi", "mention":"Milano", "context_right":"lavoro"},
    {"context_left": "domain", "mention":"Roma", "context_right":"vacanza"},
    {"context_left": "ho visto il", "mention":"cane", "context_right":"che abbaiava"},
]

In [13]:
tempEnts = []
encodedEnts = []
for entity in tqdm(entities):
    if len(tempEnts) < 100:
        tempEnts.append(entity)
    else:
        res_mentions = requests.post(
            "http://localhost:20980/api/blink/biencoder/entity",
            json=tempEnts,
            auth=auth,
        )
        entities_embedding = np.array(
            list(map(vector_decode, res_mentions.json()["encodings"]))
        )
        encodedEnts.extend(entities_embedding)
        tempEnts = []

100%|██████████| 70140/70140 [32:03<00:00, 36.47it/s]


In [14]:
joblib.dump(encodedEnts, "./Datasets/zeshel/encodedEntsTest.joblib")

['./Datasets/zeshel/encodedEntsTest.joblib']

In [15]:
with open('./zeshel-conv.json', "r", encoding="utf-8") as file:
    # read file lines
    zeshelDs = json.load(file)

In [15]:
zeshelDs[0]

{'id': 0,
 'input': "Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the [START_ENT] Vault of Gnashing Teeth [END_ENT] beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinnsshar . Among those who resided with her in the Vault of Gnashing Teeth was the keening spirit Gurden Myrinn .",
 'output': [{'answer': 'Moondeep Sea',
   'provenance': [{'title': 'Moondeep Sea'}]}],
 'meta': {'left_context': 'Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the',
  'right_context': "beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinn

In [16]:
mentions = [
    {
        "mention": mention["meta"]["mention"],
        "context_left": mention["meta"]["left_context"],
        "context_right": mention["meta"]["right_context"],
        "answer": mention["output"][0]["answer"],
        "output": mention["output"][0]["output"],
        "input": mention["input"],
    }
    for mention in zeshelDs
]

In [18]:
mentions[0]

{'mention': 'Vault of Gnashing Teeth',
 'context_left': 'Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the',
 'context_right': "beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinnsshar . Among those who resided with her in the Vault of Gnashing Teeth was the keening spirit Gurden Myrinn ."}

In [19]:
tempMents = []
encodedMents = []
for mention in tqdm(mentions):
    if len(tempMents) < 100:
        tempMents.append(mention)
    else:
        res_mentions = requests.post(
            "http://localhost:20980/api/blink/biencoder/mention", json=tempMents
        )
        mentions_embedding = np.array(
            list(map(vector_decode, res_mentions.json()["encodings"]))
        )
        encodedMents.extend(mentions_embedding)
        tempMents = []

100%|██████████| 10000/10000 [01:23<00:00, 119.47it/s]


In [20]:
joblib.dump(encodedMents, "./Datasets/zeshel/encodedMentsTest.joblib")

['./Datasets/zeshel/encodedMentsTest.joblib']

In [45]:
res_mentions = requests.post('http://localhost:20980/api/blink/biencoder/mention', json=mentions)

In [47]:
#res_mentions.json()

In [48]:
entities = [
    {'title': 'gianfranco', 'descr': 'avvocato'},
    {'title': 'dobby', 'descr': 'cane labrador'},
]

In [49]:
res_mentions = requests.post('http://localhost:20980/api/blink/biencoder/entity', json=entities)

In [51]:
mentions_embedding = np.array(list(map(vector_decode, res_mentions.json()['encodings'])))
mentions_embedding

array([[ 0.07842703,  0.07131317,  0.00129652, ..., -0.03120706,
        -0.19704019,  0.02340733],
       [ 0.14617996,  0.12868266,  0.05573903, ...,  0.04600737,
        -0.21320432,  0.00834524],
       [ 0.14868711, -0.11198566,  0.0330006 , ...,  0.07524123,
        -0.05477954,  0.01567708]], dtype=float32)

In [52]:
mentions_embedding.shape

(3, 1024)

In [53]:
entity_embedding = np.array(list(map(vector_decode, res_mentions.json()['encodings'])))
entity_embedding

array([[-0.01721451,  0.22309886,  0.07597879, ..., -0.07587761,
        -0.23545255, -0.22782443],
       [ 0.19822957,  0.13209286,  0.07052226, ..., -0.2937744 ,
        -0.2015146 ,  0.16102695]], dtype=float32)

In [54]:
entity_embedding.shape

(2, 1024)

In [61]:
linking_pred_ids = mentions_embedding.dot(entity_embedding.T).argmax(1)

In [68]:
linking_pred_titles = list(map(lambda x: entities[x], linking_pred_ids))
linking_pred_titles

[{'title': 'gianfranco', 'descr': 'avvocato'},
 {'title': 'dobby', 'descr': 'cane labrador'},
 {'title': 'dobby', 'descr': 'cane labrador'}]

In [71]:
for men, ent in zip(mentions, linking_pred_titles):
    print('{} [{}] {}\t-->\t{}'.format(men['context_left'], men['mention'], men['context_right'], ent['title']))

oggi [Milano] lavoro	-->	gianfranco
domain [Roma] vacanza	-->	dobby
ho visto il [cane] che abbaiava	-->	dobby


In [23]:
encodedMents = np.array(encodedMents)
encodedEnts = np.array(encodedEnts)
# primi 100 candidati 
best_100_entity_ids = np.flip(encodedMents.dot(encodedEnts.T).argsort(1), axis=1)[:, :20]
best_100_entity_ids

array([[ 2093,  7431,  7883, ...,   876,  4647, 28433],
       [51834, 34416, 53796, ..., 42865, 60914, 60287],
       [62170, 64528, 56875, ..., 68145, 67380, 61707],
       ...,
       [41985, 64676, 55877, ..., 48685, 44503, 54490],
       [12899, 11992,  6134, ..., 24764, 64311, 33285],
       [ 7606,  9606,  3235, ..., 42368,  1954, 51816]])

In [24]:
best_100_entities = []
for row in best_100_entity_ids.tolist():
    temp_100_ents = []
    for col in row:
        temp_100_ents.append(entities[col])
    best_100_entities.append(temp_100_ents)
        

In [None]:
best_100_entities

In [29]:
index = 0
for item in tqdm(best_100_entities):
    candidates = []
    for entity in item:
        candidates.append(entity["title"])
    mentions[index]["candidates"] = candidates
    index += 1

  0%|          | 0/9900 [00:00<?, ?it/s]

100%|██████████| 9900/9900 [00:00<00:00, 102474.30it/s]


In [30]:
mentions[0]

{'mention': 'Vault of Gnashing Teeth',
 'context_left': 'Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the',
 'context_right': "beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinnsshar . Among those who resided with her in the Vault of Gnashing Teeth was the keening spirit Gurden Myrinn .",
 'answer': 'Moondeep Sea',
 'input': "Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the [START_ENT] Vault of Gnashing Teeth [END_ENT] beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinnsshar . Among those who resided with her 

In [31]:
#dump mentions to json file
with open('./zeshel-conv-blink.json', "w", encoding="utf-8") as file:
    # read file lines
    json.dump(mentions, file)

In [2]:
import json
# dump mentions to json file
with open("./zeshel-conv-blink.json", "r", encoding="utf-8") as file:
    # read file lines
    ds = json.load( file)
    
ds[0]    

{'mention': 'Vault of Gnashing Teeth',
 'context_left': 'Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the',
 'context_right': "beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinnsshar . Among those who resided with her in the Vault of Gnashing Teeth was the keening spirit Gurden Myrinn .",
 'answer': 'Moondeep Sea',
 'input': "Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the [START_ENT] Vault of Gnashing Teeth [END_ENT] beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinnsshar . Among those who resided with her 

In [8]:
import random
converted = []
errors = 0
for item in ds:
    try:
      if item["answer"] not in item["candidates"]:
          #generate a random from 0 to len(candidates)/2
          randomIndex = random.randint(0, int(len(item["candidates"])/4))
          #insert at random index the answer
          item["candidates"].insert(randomIndex, item["answer"])
      converted.append(item)
    except:
        errors += 1

errors

100

In [7]:
converted[0]

{'mention': 'Vault of Gnashing Teeth',
 'context_left': 'Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the',
 'context_right': "beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinnsshar . Among those who resided with her in the Vault of Gnashing Teeth was the keening spirit Gurden Myrinn .",
 'answer': 'Moondeep Sea',
 'input': "Larynda Telenna Larynda Telenna was the high priestess of Kiaransalee in the [START_ENT] Vault of Gnashing Teeth [END_ENT] beneath Vaasa . She was also the leader of Kiaransalee ' s cult across the entirety of Faerûn . History . In 1337 DR , Larynda Telenna brought with her a group of acolytes who aided her in constructing the Acropolis of Thanatos in the ruins of the city V ' elddrinnsshar . Among those who resided with her 

In [6]:
# dump mentions to json file
with open("./zeshel-conv-blink.json", "w", encoding="utf-8") as file:
    # read file lines
    json.dump(converted, file)

In [106]:
for men, ent in zip(mentions, best_100_entities):
    ent_rank = ';\t'.join([str(i+1)+') '+e['title'] for i,e in enumerate(ent)])
    print('{} [{}] {}\t-->\t{}'.format(men['context_left'], men['mention'], men['context_right'], ent_rank))

oggi [Milano] lavoro	-->	1) gianfranco;	2) dobby
domain [Roma] vacanza	-->	1) dobby;	2) gianfranco
ho visto il [cane] che abbaiava	-->	1) dobby;	2) gianfranco
