In [3]:
import json
import os
from tqdm import tqdm

# Directory holding one JSON Lines file per corpus; read by load_corpuses().
basePath = "./Datasets/zeshel/documents/"


def load_json(file_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document.
    Blank or whitespace-only lines (for example an extra empty line at the
    end of the file) are skipped instead of raising ``JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]


def load_corpuses(base_path=None):
    """Load every ``*.json`` corpus file found in a directory.

    Each file is read as JSON Lines (one JSON document per non-blank line).
    Sub-directories and files without a ``.json`` extension are ignored, so
    entries such as ``.ipynb_checkpoints`` or ``.DS_Store`` no longer make
    the load fail.

    Args:
        base_path: Directory to scan. Defaults to the module-level
            ``basePath`` when omitted.

    Returns:
        A dict mapping each file name without its ``.json`` extension to
        the list of objects parsed from that file.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if base_path is None:
        # Looked up at call time so the notebook-level default can be
        # reassigned after this function has been defined.
        base_path = basePath

    corpusesDict = {}
    # Sorted so the resulting dict order does not depend on the file system.
    for file_name in sorted(os.listdir(base_path)):
        file_path = os.path.join(base_path, file_name)
        corpus_name, extension = os.path.splitext(file_name)
        if extension != ".json" or not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            corpusesDict[corpus_name] = [
                json.loads(line) for line in file if line.strip()
            ]
    return corpusesDict


def findDoc(document_id_to_find, corpus):
    """Return the first document in ``corpus`` with the given id.

    The scan stops at the first match instead of collecting every match
    before returning one.

    Args:
        document_id_to_find: Value compared against each document's
            ``"document_id"`` entry.
        corpus: Iterable of document dicts.

    Returns:
        The first matching document, or ``None`` when nothing matches.

    Raises:
        KeyError: If a document inspected before the match has no
            ``"document_id"`` key.
    """
    return next(
        (obj for obj in corpus if obj["document_id"] == document_id_to_find),
        None,
    )


def _index_by_document_id(corpus):
    """Map each ``document_id`` in ``corpus`` to its first document."""
    index = {}
    for doc in corpus:
        # setdefault keeps the first occurrence, matching a first-match scan.
        index.setdefault(doc["document_id"], doc)
    return index


def _lookup_document(index, document_id, corpus_name):
    """Return the indexed document or raise ``LookupError`` naming what is missing."""
    doc = index.get(document_id)
    if doc is None:
        raise LookupError(
            f"document {document_id!r} not found in corpus {corpus_name!r}"
        )
    return doc


def merge_data(train_data, corpus_data, context_size=150):
    """Join mention records with their context and label documents.

    For each mention the context document's text is split on single spaces,
    the mention span is wrapped in ``[START_ENT]`` / ``[END_ENT]`` markers,
    and at most ``context_size`` tokens are kept on either side. Mentions
    that cannot be processed (missing key, unknown corpus or document, ...)
    are reported on stdout and skipped.

    Args:
        train_data: Iterable of mention dicts providing ``corpus``,
            ``context_document_id``, ``label_document_id``, ``start_index``
            and ``end_index`` (inclusive).
        corpus_data: Mapping from corpus name to a list of document dicts
            providing ``document_id``, ``title`` and ``text``.
        context_size: Maximum number of tokens kept on each side of the
            mention. Defaults to 150.

    Returns:
        A list of merged entry dicts. ``id`` is the entry's position in the
        returned list, so ids stay consecutive when mentions are skipped.
    """
    merged_dataset = []
    # corpus name -> {document_id: document}. Built on first use so every
    # corpus is scanned once rather than twice per mention.
    doc_indexes = {}

    for position, entry in enumerate(tqdm(train_data)):
        try:
            corpus_name = entry["corpus"]
            if corpus_name not in doc_indexes:
                doc_indexes[corpus_name] = _index_by_document_id(
                    corpus_data[corpus_name]
                )
            documents = doc_indexes[corpus_name]

            context_doc = _lookup_document(
                documents, entry["context_document_id"], corpus_name
            )
            entity_doc = _lookup_document(
                documents, entry["label_document_id"], corpus_name
            )

            start = entry["start_index"]
            # +1 turns the inclusive end_index into an exclusive slice bound.
            end = entry["end_index"] + 1

            # Split on single spaces only; presumably start/end were computed
            # against exactly this tokenisation, so it must not change.
            tokens = context_doc["text"].split(" ")
            left_tokens = tokens[max(0, start - context_size) : start]
            mention_tokens = tokens[start:end]
            right_tokens = tokens[end : end + context_size]

            merged_dataset.append(
                {
                    "id": len(merged_dataset),
                    "input": " ".join(
                        left_tokens
                        + ["[START_ENT]"]
                        + mention_tokens
                        + ["[END_ENT]"]
                        + right_tokens
                    ),
                    "output": [
                        {
                            "answer": entity_doc["title"],
                            "provenance": [{"title": entity_doc["title"]}],
                        }
                    ],
                    "meta": {
                        "left_context": " ".join(left_tokens),
                        "right_context": " ".join(right_tokens),
                        "mention": " ".join(mention_tokens),
                    },
                    "candidates": [],
                    "answer": entity_doc["text"],
                }
            )
        except Exception as e:
            # Best effort: one bad mention must not abort the whole merge,
            # but say which one was dropped and why.
            print(f"Skipping mention #{position}: {type(e).__name__}: {e}")

    return merged_dataset


# Load every corpus file found under basePath into a dict of document lists.
corpuses = load_corpuses()
# Load the mention records (one JSON object per line).
# NOTE(review): the variable is named train_data but the file read is the
# *test* split — confirm this is intended.
train_data = load_json("./Datasets/zeshel/mentions/test.json")

# Join each mention with its context and label documents.
merged_dataset = merge_data(train_data, corpuses)

# Save the merged dataset as one indented JSON array.
with open("./zeshel-conv.json", "w", encoding="utf-8") as output_file:
    json.dump(merged_dataset, output_file, indent=2)

100%|██████████| 10000/10000 [00:26<00:00, 378.77it/s]


## Removing NIL entries for the zero-shot setting without NIL

In [3]:
import json
def load_json(file_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document.
    Blank or whitespace-only lines (for example an extra empty line at the
    end of the file) are skipped instead of raising ``JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]

# Load the records to filter; load_json parses one JSON object per line.
ds = load_json("./zeshel-blink.jsonl")

In [6]:
# NOTE(review): `filtered` is only assigned in the following cell (In [4]);
# on a fresh kernel this cell has to run after that one.
len(filtered)

785

In [4]:
# Keep only the records whose first output answer is a real entity,
# i.e. drop those labelled "Not In Candidates" (NIL).
filtered = [
    record
    for record in ds
    if record["output"][0]["answer"] != "Not In Candidates"
]

In [None]:
# Write the NIL-free records as JSON Lines: one serialized object per line.
with open("./zeshel-blink-noNIL.jsonl", "w", encoding="utf-8") as output_file:
    for record in filtered:
        output_file.write(json.dumps(record) + "\n")