In [2]:
import gatenlp
from tqdm import tqdm

import re


def get_mention(text):
    """Return the text enclosed by the first ``[START_ENT] ... [END_ENT]`` pair.

    Args:
        text: String that may contain a marked mention.

    Returns:
        The enclosed text with surrounding whitespace stripped, or ``None``
        when no complete marker pair is present.
    """
    pattern = r"\[START_ENT\](.*?)\[END_ENT\]"
    # DOTALL lets "." cross line breaks, so a mention spanning several lines
    # is still found instead of silently yielding None.
    match = re.search(pattern, text, flags=re.DOTALL)
    if match:
        return match.group(1).strip()
    return None


def convert_to_gatenlp_doc(data):
    """Build a gatenlp Document with one "gold" annotation over the marked mention.

    The document text is ``data["input"]`` with the ``[START_ENT]`` and
    ``[END_ENT]`` markers removed. A single annotation of type "MISC" is added
    to the "gold" annotation set, covering the (whitespace-stripped) mention.

    Args:
        data: Mapping with an "input" string containing one
            ``[START_ENT] ... [END_ENT]`` pair.

    Returns:
        The populated ``gatenlp.Document``.

    Raises:
        ValueError: If ``data["input"]`` contains no complete marker pair.
    """
    raw_text = data["input"]
    marked = re.search(r"\[START_ENT\](.*?)\[END_ENT\]", raw_text, flags=re.DOTALL)
    if marked is None:
        raise ValueError("input has no [START_ENT] ... [END_ENT] mention")

    def strip_markers(fragment):
        return fragment.replace("[START_ENT]", "").replace("[END_ENT]", "")

    # Create a GATE document holding the marker-free text
    doc = gatenlp.Document()
    doc.text = strip_markers(raw_text)
    # add "gold" annotation set
    annset = doc.annset("gold")

    # Derive the offsets from the marker position rather than searching the
    # cleaned text for the mention string: the same string may also occur
    # earlier in the text, and a text search would then anchor on the wrong one.
    inner = marked.group(1)
    leading_ws = len(inner) - len(inner.lstrip())
    start = len(strip_markers(raw_text[: marked.start()])) + leading_ws
    end = start + len(inner.strip())

    # add the mention annotation (no linking features yet)
    annset.add(start=start, end=end, anntype="MISC", features={})

    return doc


# Example record showing the fields of one dataset entry: the marked "input"
# text, the gold "output" answer, "meta" context and a "candidates" list.
# NOTE(review): `dataset` is not referenced by any other code visible in this
# notebook; the records actually processed are loaded from the JSON file below.
dataset = {
    "id": 0,
    "input": "[START_ENT] Bandar Seri Begawan [END_ENT] 11 15 AFP ...",
    "output": [
        {
            "answer": "Bandar Seri Begawan",
            "provenance": [{"title": "Bandar Seri Begawan"}],
        }
    ],
    "meta": {
        "left_context": "",
        "right_context": "11 15 AFP The United States today Wednesday deemed ...",
        "mention": "Bandar Seri Begawan",
    },
    "candidates": ["Bandar Seri Begawan", "Brunei International Airport"],
}

# Read the dataset. The file is parsed as ONE JSON document (json.load), not
# as JSON Lines; later cells iterate over it and index it by position.
import json

# Explicit encoding so the result does not depend on the platform default.
with open("./Datasets/wnum/wnum.json", encoding="utf-8") as f:
    data = json.load(f)

# Convert every record into a gatenlp Document carrying its gold mention
gatenlp_docs = [convert_to_gatenlp_doc(item) for item in tqdm(data)]

# Report how many documents were built
print(len(gatenlp_docs))
# dump to jsonl file

# with open('ace2004-test-kilt.gatenlp.jsonl', 'w') as outfile:
#     for entry in tqdm(gatenlp_docs):


#         outfile.write(json.dumps(entry.to_dict())+"\n")

100%|██████████| 100002/100002 [00:02<00:00, 36219.84it/s]

100002





In [10]:
!pip3 install gatenlp

Defaulting to user installation because normal site-packages is not writeable
Collecting gatenlp
  Downloading gatenlp-1.0.8-py3-none-any.whl (335 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m335.6/335.6 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting iobes
  Downloading iobes-1.5.1-py3-none-any.whl (18 kB)
Collecting sortedcontainers>=2.0.0
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Installing collected packages: sortedcontainers, iobes, gatenlp
Successfully installed gatenlp-1.0.8 iobes-1.5.1 sortedcontainers-2.4.0


In [3]:
import os
from getpass import getpass

import requests
from requests.auth import HTTPBasicAuth

# Endpoints used by the linking loop: each document is first posted to the
# bi-encoder service, and that response is then posted to the indexer service.
# NOTE(review): the trailing "/10" on the indexer URL presumably selects the
# number of candidates to retrieve; confirm against the service documentation.
api_biencoder = 'http://localhost:20980/api/blink/biencoder/mention/doc'
api_indexer = 'http://localhost:20982/api/indexer/search/doc/10'

# Credentials must not live in the notebook. The user name may be overridden
# with BLINK_API_USER; the password comes from BLINK_API_PASSWORD or, when that
# is unset, from an interactive prompt.
# NOTE(review): the password previously committed in this cell should be rotated.
auth = HTTPBasicAuth(
    os.environ.get('BLINK_API_USER', 'DS2023'),
    os.environ.get('BLINK_API_PASSWORD') or getpass('BLINK API password: '),
)



In [4]:
# helper functions

import numpy as np
import base64

def vector_encode(v):
    """Return the base64 text representation of the bytes-like object ``v``."""
    encoded_bytes = base64.b64encode(v)
    return encoded_bytes.decode()

def vector_decode(s, dtype=np.float32):
    """Decode base64 text ``s`` and view the resulting bytes as an array of ``dtype``."""
    return np.frombuffer(base64.b64decode(s), dtype=dtype)

In [17]:
# Inspect the first converted document as a plain dict
gatenlp_docs[0].to_dict()

{'annotation_sets': {'gold': {'name': 'gold',
   'annotations': [{'type': 'mention',
     'start': 1,
     'end': 20,
     'id': 0,
     'features': {}}],
   'next_annid': 1}},
 'text': ' Bandar Seri Begawan  11 15 AFP The United States today Wednesday deemed the order issued by Palestinian President Yasser Arafat for a ceasefire in territories under Palestinian Authority control as a positive gesture but considered that it does not release constitute a release form the terms of the Sharm el Sheikh agreement James Stewart the White House s spokesman in Bandar Seri Begawan the capital of the Sultanate of Brunei which American President Bill Clinton is visiting said of course we positively welcome the announcement aimed at stopping the violence But the important point is that Palestinian and Israeli officials take the right',
 'features': {},
 'offset_type': 'p',
 'name': ''}

In [35]:
# Display the document most recently built from a bi-encoder response.
# NOTE(review): gdoc_encoding is only assigned inside the linking loop in a
# later cell, so this cell fails on a fresh top-to-bottom run.
gdoc_encoding

In [5]:
from gatenlp import Document

linked_docs = []
# Positions in gatenlp_docs whose linking failed. linked_docs only receives
# the successes, so any entry here means linked_docs no longer lines up
# position-by-position with gatenlp_docs.
failed_indices = []

for doc_index, item in enumerate(tqdm(gatenlp_docs)):
    try:
        # rename needed to use the API: expose the gold annotations under the
        # "entities_merged" set name and drop their features.
        # NOTE(review): both keys reference the same dict, so the "gold" entry
        # is renamed and cleared as well; confirm this is intended.
        gold_dict = item.to_dict()
        gold_dict['annotation_sets']['entities_merged'] = gold_dict['annotation_sets']['gold']
        gold_dict['annotation_sets']['entities_merged']['name'] = 'entities_merged'
        for ann in gold_dict['annotation_sets']['entities_merged']['annotations']:
            ann['features'] = {}
        gold_annotations_doc = Document.from_dict(gold_dict)
        gold_annotations_doc.features['pipeline'] = []

        # Step 1: send the document to the bi-encoder service
        res = requests.post(api_biencoder, auth=auth, json=gold_annotations_doc.to_dict())
        # Fail on HTTP error statuses instead of trying to parse an error body
        res.raise_for_status()
        doc = res.json()
        gdoc_encoding = Document.from_dict(doc)

        # Step 2: send the bi-encoder response to the indexer service
        res = requests.post(api_indexer, auth=auth, json=gdoc_encoding.to_dict())
        res.raise_for_status()
        doc = res.json()
        gdoc = Document.from_dict(doc)
        linked_docs.append(gdoc)
    except Exception as e:
        # Best effort: keep going, but record which document failed so the
        # gap is visible and can be retried.
        failed_indices.append(doc_index)
        print(f"document {doc_index} failed: {e}")
  

100%|██████████| 100002/100002 [3:16:27<00:00,  8.48it/s]  


In [6]:
# Records and linked documents are paired by position, so the two sequences
# must have the same length. A shorter linked_docs (e.g. after failed
# requests) would attach candidates to the wrong records.
if len(linked_docs) != len(data):
    raise ValueError(
        f"linked_docs has {len(linked_docs)} items but data has {len(data)}; "
        "candidates cannot be matched to records by position"
    )

# Count of records whose gold answer is not among the retrieved candidates
nils = 0
for record, item in tqdm(zip(data, linked_docs), total=len(linked_docs)):
    # Candidates attached to the first annotation of the "entities_merged" set
    candidates = item.to_dict()["annotation_sets"]["entities_merged"]["annotations"][0][
        "features"
    ]["linking"]["candidates"]
    candidatesList = [candidate["title"] for candidate in candidates]
    record["candidates"] = candidatesList

    # Replace the gold answer with a sentinel when it was not retrieved
    gold_output = record["output"][0]
    if gold_output["answer"] not in candidatesList:
        gold_output["answer"] = "Not In Candidates"
        nils += 1

100%|██████████| 100002/100002 [00:02<00:00, 41115.00it/s]


In [7]:
# Number of records whose gold answer was not among the retrieved candidates
nils

8234

In [9]:
# Write the updated records to a JSON Lines file, one JSON object per line
with open('./wnum.jsonl', 'w') as outfile:
    outfile.writelines(json.dumps(entry) + "\n" for entry in tqdm(data))

100%|██████████| 100002/100002 [00:00<00:00, 125625.31it/s]


In [41]:
# Show the linking candidates on the first "entities_merged" annotation of
# gdoc, i.e. the last document produced by the linking loop
gdoc.to_dict()['annotation_sets']['entities_merged']['annotations'][0]['features']['linking']['candidates']

[{'raw_score': 325.1113586425781,
  'id': 783851,
  'wikipedia_id': 3434750,
  'wikidata_qid': None,
  'redirects_to': None,
  'title': 'United States',
  'url': 'https://en.wikipedia.org/wiki?curid=3434750',
  'type_': None,
  'indexer': 0,
  'score': 81.96341705322266,
  'norm_score': 0.5423622485900884},
 {'raw_score': 334.17315673828125,
  'id': 15763,
  'wikipedia_id': 31769,
  'wikidata_qid': None,
  'redirects_to': None,
  'title': 'United Nations',
  'url': 'https://en.wikipedia.org/wiki?curid=31769',
  'type_': None,
  'indexer': 0,
  'score': 77.43250274658203,
  'norm_score': 0.46290169663524444},
 {'raw_score': 334.7706298828125,
  'id': 3155783,
  'wikipedia_id': 27552742,
  'wikidata_qid': None,
  'redirects_to': None,
  'title': 'Law of the United States',
  'url': 'https://en.wikipedia.org/wiki?curid=27552742',
  'type_': None,
  'indexer': 0,
  'score': 77.1337661743164,
  'norm_score': 0.5066513418138748},
 {'raw_score': 334.77203369140625,
  'id': 197788,
  'wikipedi