In [1]:
import requests
import json
import time


def get_wikidata_id(page_titles):
    try:
        url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "prop": "pageprops",
            "ppprop": "wikibase_item",
            "redirects": "1",
            "format": "json",
            "titles": "|".join(page_titles),
        }

        response = requests.get(url, params=params)
        data = response.json()
        pages = data["query"]["pages"]
        wikidata_ids = {}
        for page in pages.values():
            wikidata_id = page.get("pageprops", {}).get("wikibase_item")
            if wikidata_id:
                wikidata_ids[page["title"]] = wikidata_id

        return wikidata_ids

        
    except Exception as e:
        print(e)
        return None


def get_instance_of(wikidata_ids):
    try:
        url = "https://query.wikidata.org/sparql"
        ids_string = ' '.join(f"wd:{wikidata_id}" for wikidata_id in wikidata_ids)
        query = f"""
        SELECT ?item ?instanceOfLabel WHERE {{
            VALUES ?item {{ {ids_string} }}
            ?item wdt:P31 ?instanceOf.
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        """
        response = requests.get(url, params={"query": query, "format": "json"})
        data = response.json()
        results = data["results"]["bindings"]
        instances = {result["item"]["value"].split('/')[-1]: result["instanceOfLabel"]["value"] for result in results}
        time.sleep(1)
        return instances
    except Exception as e:
        print(e)
        return None



In [33]:
get_instance_of(["Q42", "Q76", "Q11573", "Qsdf345"])

{'Q42': 'human', 'Q76': 'human', 'Q11573': 'UCUM base unit'}

In [69]:
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
import multiprocessing as mp
import joblib

tokenized = []
file_path = "./wnum.jsonl"
from datasets import Dataset
all_cands = []
if __name__ == "__main__":
    original_data_aida = []
    with open(file_path, "r") as file:
        for line in tqdm(file, total=18448):
            # Load each line as a JSON object
            data_line = json.loads(line)
            answer = data_line["answers"]["text"][0]
            candidates = data_line["question"].split(" </ec>")
            candidates = [cand.replace("</ec>", "") for cand in candidates]
            candidates = [cand.split(": ")[0] for cand in candidates]
            candidates = [cand.strip() for cand in candidates]

            for cand in candidates:
                if cand not in all_cands:
                    all_cands.append(cand)  

 43%|████▎     | 7939/18448 [00:00<00:00, 14667.69it/s]


In [None]:
len(all_cands)
get_wikidata_id(all_cands[:30])

In [70]:
from tqdm import tqdm
import joblib

id_mapping = {}

for i in tqdm(range(0, len(all_cands), 50)):
    
    current_slice = None
    if i + 50 < len(all_cands):
        current_slice = all_cands[i : i + 50]
    else:
        current_slice = all_cands[i:]
    wikidata_ids = get_wikidata_id(current_slice)
    id_mapping.update(wikidata_ids)



100%|██████████| 77/77 [00:24<00:00,  3.17it/s]


In [23]:
id_mapping["German American"]

KeyError: 'German American'

In [14]:
id_mapping.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [71]:
instance_of_mapping = {}
for i in tqdm(range(0, len(all_cands), 80)):
    current_slice = None
    if i + 80 < len(all_cands):
        current_slice = all_cands[i : i + 80]
    else:
        current_slice = all_cands[i:]
    current_slice = [id_mapping[cand] if cand in id_mapping else None for cand in current_slice]
    instanceofs = get_instance_of(current_slice)
    instance_of_mapping.update(instanceofs)
    


100%|██████████| 48/48 [01:11<00:00,  1.50s/it]


In [61]:
joblib.dump(id_mapping, "id_mapping.joblib")
joblib.dump(instance_of_mapping, "instanceofs.joblib")

['instanceofs.joblib']

In [72]:
len(instance_of_mapping)

3314

In [73]:
# PID 1101667
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
import multiprocessing as mp
import joblib

tokenized = []
file_path = "./train4.jsonl"
from datasets import Dataset

if __name__ == "__main__":
    original_data_aida = []
    with open(file_path, "r") as file:
        for line in tqdm(file, total=18448):
            # Load each line as a JSON object
            data_line = json.loads(line)
            answer = data_line["answers"]["text"][0]
            candidates = data_line["question"].split(" </ec>")
            candidates = [cand.replace("</ec>", "") for cand in candidates]
            candidates = [cand.split(": ")[0] for cand in candidates]
            candidates = [cand.strip() for cand in candidates]
            num_cand = 0
            candidates_string = ""
            filtered_cand = []
            for cand in candidates:
                if answer not in candidates_string or num_cand <= 10:
                    filtered_cand.append(cand)

            for can in filtered_cand:
                instance = None
                if can in id_mapping:
                    id = id_mapping[can]
                    if id in instance_of_mapping:
                        instance = instance_of_mapping[id]
                    else:
                        instance = "Unknown"
                else:
                    instance = "Unknown"
                candidates_string += can + f" : instance of {instance}" + " </ec> "
            data_line["question"] = candidates_string
            data_line.pop("linked_ents")
            data_line.pop("most_related")
            original_data_aida.append(data_line)
    with open("./nil_el_instanceof.jsonl", "w") as file:
        for line in original_data_aida:
            json.dump(line, file)
            file.write("\n")

 43%|████▎     | 7939/18448 [00:00<00:00, 46648.62it/s]


## Non converted

In [10]:
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
import multiprocessing as mp
import joblib

tokenized = []
file_path = "./zeshel-conv-blink.json"
from datasets import Dataset

all_cands = []
if __name__ == "__main__":
    original_data_aida = []
    with open(file_path, "r") as file:
        ds = json.load(file)
        for line in tqdm(ds, total=100002):
            # Load each line as a JSON object
            candidates = line["candidates"]

            for cand in candidates:
                if cand not in all_cands:
                    all_cands.append(cand)

 10%|▉         | 9900/100002 [00:16<02:32, 589.17it/s] 


In [6]:
len(all_cands)

2506

In [11]:
from tqdm import tqdm
import joblib

id_mapping = {}

for i in tqdm(range(0, len(all_cands), 50)):
    current_slice = None
    if i + 50 < len(all_cands):
        current_slice = all_cands[i : i + 50]
    else:
        current_slice = all_cands[i:]
    wikidata_ids = get_wikidata_id(current_slice)
    time.sleep(0.1)
    id_mapping.update(wikidata_ids)

100%|██████████| 789/789 [06:01<00:00,  2.19it/s]


In [8]:
len(id_mapping)

2489

In [12]:
instance_of_mapping = {}
for i in tqdm(range(0, len(all_cands), 80)):
    current_slice = None
    if i + 80 < len(all_cands):
        current_slice = all_cands[i : i + 80]
    else:
        current_slice = all_cands[i:]
    current_slice = [
        id_mapping[cand] if cand in id_mapping else None for cand in current_slice
    ]
    try:
        instanceofs = get_instance_of(current_slice)
        instance_of_mapping.update(instanceofs)
    except:
        None

  0%|          | 0/493 [00:00<?, ?it/s]

100%|██████████| 493/493 [11:24<00:00,  1.39s/it]


In [None]:
instance_of_mapping

In [14]:
# PID 1101667
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
import multiprocessing as mp
import joblib

tokenized = []

from datasets import Dataset

if __name__ == "__main__":
    original_data_aida = []
    with open(file_path, "r") as file:
        ds = json.load(file)
        for data_line in tqdm(ds, total=100002):
            # Load each line as a JSON object

            candidates_enriched = []

            for can in data_line["candidates"]:
                instance = None
                if can in id_mapping:
                    id = id_mapping[can]
                    if id in instance_of_mapping:
                        instance = instance_of_mapping[id]
                    else:
                        instance = "Unknown"
                else:
                    instance = "Unknown"
                candidates_enriched.append(can + f" : instance of {instance}")
            data_line["candidates"] = candidates_enriched

            original_data_aida.append(data_line)
    with open("./Datasets/InstanceOf/zeshel.jsonl", "w") as file:
        for line in original_data_aida:
            json.dump(line, file)
            file.write("\n")

 10%|▉         | 9900/100002 [00:00<00:00, 127609.86it/s]
