In [1]:
import json
import numpy as np
import random
#from tqdm.auto import tqdm
from tqdm import tqdm
import itertools
import os
from copy import deepcopy
import matplotlib.pyplot as plt

In [2]:
# create a dictionary with entities(keys) and a assigning them an index number as dic value
def build_dicts(entities):
    entity2ind = dict()
    ind2entity = []
    for i in range(len(entities)):
        entity = entities[i]
        if not (entity in ind2entity):
            ind2entity.append(entity)
            entity2ind[entity] = len(ind2entity) - 1
    return ind2entity, entity2ind

# chose ration of dataset used for ID and OOD
def choose(arr, ratio_or_count):
    if type(ratio_or_count) == float:
        num = round(ratio_or_count*len(arr))
    elif type(ratio_or_count) == int:
        num = ratio_or_count
    else:
         assert False
    if num >= len(arr):
        return arr    # if we chose to take more data then in the array just take all the array
    rand_inds = np.random.choice(len(arr), num, replace=False).tolist()
    return [arr[i] for i in rand_inds]

# Splits an array into two parts test-train  
def split(arr, ratio_or_count):
    if type(ratio_or_count) == float:
        num = round(ratio_or_count*len(arr))
    elif type(ratio_or_count) == int:
        num = ratio_or_count
    else:
         assert False
    train, test = [], []
    rand_inds = np.random.choice(len(arr), num, replace=False).tolist()
    for i in tqdm(range(len(arr))):
        if i in rand_inds:
            train.append(arr[i])
        else:
            test.append(arr[i])
    return [train, test]

def form_items(c, t):
    #The join method concatenates all elements in the list c into a single string. For example, if c = ["<e_1>", "<r_1>"], then input_text will be "<e_1><r_1>".
    input_text = "".join(c)
    target_text = input_text + "".join([t, "</a>"]) # in  alist before for making more readable
    item = {
        "input_text": input_text,
        "target_text": target_text
    }
    return item

In [3]:
def build_dataset(num_entities, num_relations, out_degree=20, split_train_inferred=False):
 
    # create a list with all entities names
    entities = ["<e_{}>".format(i) for i in range(num_entities)]
    # create a dictionary with entities as keys and index as values
    ind2entity, entity2ind = build_dicts(entities)

    # create a list with all relations names
    relations = ["<r_{}>".format(i) for i in range(num_relations)]
    # create a dictionary with relations as keys and index as values
    ind2relation, relation2ind = build_dicts(relations)

    #create atomic facts = dictionary head entity -> list of (relation, tail entity) pairs
    atomic_dict = dict()   # maps a head entity to a list of (r, t) pairs
    atomic_facts = []
    atomics = []

    for i in tqdm(range(num_entities)): #  it creates a progress bar that updates as the loop progresses
        # for each subject entity, randomly select some outgoing relations to some random object entity
        num_rows = out_degree
        # randomly select some relations (for each head entity), size=num_rows is the number of relations to be selected=20= out_degree for node
        selected_rows = np.random.choice(num_relations, size=num_rows, replace=False).tolist()
        for row_idx in selected_rows:
            col_idx = np.random.randint(num_entities)  # pick some random tail entity for each selected (h,r)
            h,r,t = ind2entity[i], ind2relation[row_idx], ind2entity[col_idx]
            atomic_facts.append(form_items([h, r], t))
            atomics.append((h,r,t))
            if h not in atomic_dict:  # add the head entity to the dictionary if it's not already there
                atomic_dict[h] = []
            # add the (r, t) pair to the list of pairs for this head entity. In this way a key (head)is associated to all the relations and tail entities
            atomic_dict[h].append((r, t))   
            ############################################################################################################################################  HERE end the ATOMIC FACTS CREATION!!!  ###############

    if not split_train_inferred:   # if we don't want to split the training set into ID and OOD as onlly ID is needed skip this part    NOOOOOOOOOOOO USE
        inferred_facts = []
        for ent in tqdm(entities):
            for (r1, b) in atomic_dict[ent]:  # for each (r1, b) pair associated with the head entity
                for (r2, t) in atomic_dict[b]:   # for each (r2, t) pair associated with the bridge entity
                    inferred_facts.append(form_items([ent, r1, r2], t))  # add the inferred fact to the list of inferred facts
        return entities, relations, atomic_facts, inferred_facts
    ################################################################################################################ continue with the split of the training set in ID and OOD
    # split ID/OOD
    OOD_ratio = 0.05          #atomic=list of tuples = all the atomic facts; atomics.append((h,r,t))
    OOD_facts, ID_facts = split(atomics, round(len(atomics)*OOD_ratio)) #split the atomic facts in ID and OOD
    OOD_facts, ID_facts = set(OOD_facts), set(ID_facts)                 #convert the lists OOD and ID in sets



    # create a list of atomic facts for ID and OOD "iteams" see form_items function
    # FROM A LIST OF TUPLES TO A LIST OF DICTIONARIES!!

#OOD_facts = [("<e_1>", "<r_1>", "<e_2>"), ("<e_3>", "<r_2>", "<e_4>")]
# transform in:
#ood_atomic_facts = [
#   {
#        "input_text": "<e_1><r_1>",
#        "target_text": "<e_1><r_1><e_2></a>"  --> in other words:"(h,r,t)" + "</a>"
#    },{"input ".. } ..]
  #################################################################################### create/convert the atomic in iteams way
    id_atomic_facts = [form_items([h, r], t) for (h,r,t) in ID_facts]
    ood_atomic_facts = [form_items([h, r], t) for (h,r,t) in OOD_facts]


    ########  HERE WE CREATE THE TRAIN AND TEST SETS FOR THE INFERENCE TASK ########
    #lets see what we train on and what we test on!

    train_inferred_facts, test_inferred_iid, test_inferred_ood = [], [], []
    t3_HOP_train_inferred_facts ,t3_HOP_test_inferred_iid , t3_HOP_test_inferred_ood = [], [], []      # 3-hop inference  (MY)

    for ent in tqdm(entities):
        #for each entity we loop over all the stored out_edges (relations and tail entities)   basically each entity is stored like= h : [(r1, b), (r2, c), (r3, d)] 
        for (r1, b) in atomic_dict[ent]:
            for (r2, t) in atomic_dict[b]:   #now thre same looping but starting from the tail entity of before
                if (ent, r1, b) in OOD_facts or (b, r2, t) in OOD_facts:  # at least one of the two entities is in the OOD set

                    
                    #if the head entity and the tail entity are in the OOD set, then we add the fact to the test set(OOD)
                    if (ent, r1, b) in OOD_facts and (b, r2, t) in OOD_facts:
                        test_inferred_ood.append(form_items([ent, r1, r2], t)) #if both entities are in the OOD set, we add the fact to the OOD list

                    #continue # it exits the current iteration and goes to the next one (if both are in OOD we haveadded to OOD list otherwise we don't care either) 
                
                else: 
                    #NOW if none of the two entities are in the OOD set we add the fact to the train set (randomly with a 0.5 probability)
                    if np.random.uniform() > 0.005:
                        train_inferred_facts.append(form_items([ent, r1, r2], t))
                    else:
                        test_inferred_iid.append(form_items([ent, r1, r2], t))




                ###### 3-hop implementation
                for (r3, t_3) in atomic_dict[t]:  # for each (r3, t) pair associated with the tail entity
                    if (ent, r1, b) in OOD_facts or (b, r2, t) in OOD_facts or (t, r3, t_3) in OOD_facts:     #
                        #print(" or check 1\n")
                        if (ent, r1, b) in OOD_facts and (b, r2, t) in OOD_facts and (t, r3, t_3) in OOD_facts:   # check if all the entities are in the OOD set
                            print(" or check 2\n")
                            t3_HOP_test_inferred_ood.append(form_items([ent, r1, r2, r3], t_3))
                        continue     
                        #if at least one of the entities is in the OOD set,(but not all) we skip the current iteration = discard the inferred fact 

                    #if none of the entities are in the OOD set, we add the fact to the train set (randomly with a 0.5 probability)
                    if np.random.uniform() > 0.005:
                        t3_HOP_train_inferred_facts.append(form_items([ent, r1, r2, r3], t_3))
                        
                    else:
                        t3_HOP_test_inferred_iid.append(form_items([ent, r1, r2, r3], t_3))
                               
            

    return entities, relations, id_atomic_facts, ood_atomic_facts, train_inferred_facts, test_inferred_iid, test_inferred_ood , t3_HOP_train_inferred_facts ,t3_HOP_test_inferred_iid , t3_HOP_test_inferred_ood
    


In [4]:
NUM_ENTITY_IN = 200    # complete 2000
NUM_RELATION =  20     #complete 200   # 20

train_entities, train_relations, id_atomic_facts, ood_atomic_facts, train_inferred_facts, test_inferred_iid, test_inferred_facts ,      t_3_train_inferred_facts, t_3_test_inferred_iid, t_3_test_inferred_facts = build_dataset(NUM_ENTITY_IN, NUM_RELATION, split_train_inferred=True)
#train_entities



100%|██████████| 200/200 [00:00<00:00, 8868.29it/s]


100%|██████████| 4000/4000 [00:00<00:00, 863780.88it/s]
  2%|▏         | 3/200 [00:00<00:06, 28.73it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



  9%|▉         | 18/200 [00:00<00:04, 41.22it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 14%|█▍        | 28/200 [00:00<00:04, 42.30it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 19%|█▉        | 38/200 [00:00<00:03, 42.25it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 24%|██▍       | 48/200 [00:01<00:03, 41.95it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 29%|██▉       | 58/200 [00:01<00:03, 42.64it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 34%|███▍      | 68/200 [00:01<00:03, 43.10it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 42%|████▏     | 83/200 [00:01<00:02, 42.62it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 46%|████▋     | 93/200 [00:02<00:02, 43.13it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 56%|█████▋    | 113/200 [00:02<00:01, 43.74it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 62%|██████▏   | 123/200 [00:02<00:01, 43.50it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 66%|██████▋   | 133/200 [00:03<00:01, 43.23it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 72%|███████▏  | 143/200 [00:03<00:01, 43.12it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 76%|███████▋  | 153/200 [00:03<00:01, 42.74it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 82%|████████▏ | 163/200 [00:03<00:00, 42.56it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 86%|████████▋ | 173/200 [00:04<00:00, 42.50it/s]

 or check 2

 or check 2



 92%|█████████▏| 183/200 [00:04<00:00, 42.69it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



 96%|█████████▋| 193/200 [00:04<00:00, 43.14it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2



100%|██████████| 200/200 [00:04<00:00, 42.60it/s]

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2

 or check 2






In [5]:
# vocab is a list of all the entities and relations in the dataset =list of strings
vocab = []
vocab = vocab + train_entities + train_relations
# special tokens added to the vocabulary for the model to understand the input and output format
vocab = vocab + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]
assert len(vocab) == len(set(vocab))
print("vocab size:", len(vocab))

vocab size: 226


In [6]:
test_size = 300       #3000 complete
id_atomic_facts_ds = choose(id_atomic_facts, test_size)
ood_atomic_facts_ds = choose(ood_atomic_facts, test_size)
test_inferred_iid = choose(test_inferred_iid, test_size) #IID
test_inferred_facts_ds = choose(test_inferred_facts, test_size) #OOD

all_atomics = id_atomic_facts + ood_atomic_facts
len(all_atomics)

4000

In [7]:
# downsampling train_inferred
for phi in [18.0,12.6,9.0,7.2,5.4,3.6]:  #phi is the ratio (Inferred facts / Atomic facts)
    dataset_name = "composition.{}.{}.{}".format(NUM_ENTITY_IN, NUM_RELATION, phi)
    os.makedirs("data_MIO/{}".format(dataset_name), exist_ok=True)
    train_inferred_facts_ds = choose(train_inferred_facts, round(phi * len(id_atomic_facts))) #downsampling the train_inferred_facts (ID)

    probes = []
    for item in id_atomic_facts_ds:
        probes.append(deepcopy(item))
        probes[-1]["type"] = "id_atomic"
    
    for item in ood_atomic_facts_ds:
        probes.append(deepcopy(item))
        probes[-1]["type"] = "ood_atomic"

    for item in choose(train_inferred_facts_ds, test_size):
        probes.append(deepcopy(item))
        probes[-1]['type'] = 'train_inferred'

    for item in test_inferred_iid:
        probes.append(deepcopy(item))
        probes[-1]['type'] = 'test_inferred_iid'

    for item in test_inferred_facts_ds:
        probes.append(deepcopy(item))
        probes[-1]["type"] = "test_inferred_ood"

    # save the dataset Train, Test, Valid
    #train.json = all_atomics + train_inferred_facts_ds
    with open("data_MIO/{}/train.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(all_atomics + train_inferred_facts_ds, f)
    #valid.json (only test_inferred_ OOD  why??? for what is that good?) 
    with open("data_MIO/{}/valid.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(test_inferred_facts_ds, f)

    # my validation with the test_inferred_iid    
    with open("data_MIO/{}/valid_iid.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(test_inferred_iid, f)

    #test.json = probes (all the atomic facts and the inferred facts how I know are not used in training??)
    with open("data_MIO/{}/test.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(probes, f)
    # all vocab in the dataset (entities and relations) + special tokens
    with open("data_MIO/{}/vocab.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(vocab, f)

In [8]:
# 3-hop inference save

In [9]:

# vocab is a list of all the entities and relations in the dataset =list of strings
vocab = []
print(vocab)   # to test the vocab list is refreshed
vocab = vocab + train_entities + train_relations
# special tokens added to the vocabulary for the model to understand the input and output format
vocab = vocab + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]
assert len(vocab) == len(set(vocab))
print("vocab size:", len(vocab))

[]
vocab size: 226


In [10]:
test_size = 300       #3000 complete
id_atomic_facts_ds = choose(id_atomic_facts, test_size)   # the atomic facts are the same for all the datasets
ood_atomic_facts_ds = choose(ood_atomic_facts, test_size)  # the atomic facts are the same for all the datasets
# here change

t_3_test_inferred_iid = choose(t_3_test_inferred_iid, test_size) #IID
t_3_test_inferred_facts_ds = choose(t_3_test_inferred_facts, test_size) #OOD

all_atomics = id_atomic_facts + ood_atomic_facts
len(all_atomics)

4000

In [11]:
# downsampling train_inferred
for phi in [18.0,12.6,9.0,7.2,5.4,3.6]:  #phi is the ratio (Inferred facts / Atomic facts)
    dataset_name = "3_HOP_composition.{}.{}.{}".format(NUM_ENTITY_IN, NUM_RELATION, phi)
    os.makedirs("data_MIO/{}".format(dataset_name), exist_ok=True)
    t_3_train_inferred_facts_ds = choose(t_3_train_inferred_facts, round(phi * len(id_atomic_facts))) #downsampling the train_inferred_facts (ID)

    
    probes_3 = []
    for item in id_atomic_facts_ds:
        probes_3.append(deepcopy(item))
        probes_3[-1]["type"] = "id_atomic"
    
    for item in ood_atomic_facts_ds:
        probes_3.append(deepcopy(item))
        probes_3[-1]["type"] = "ood_atomic"

    ######### change here for 3 hope 

    for item in choose(t_3_train_inferred_facts_ds, test_size):
        probes_3.append(deepcopy(item))
        probes_3[-1]['type'] = 'train_inferred'

    for item in t_3_test_inferred_iid:
        probes_3.append(deepcopy(item))
        probes_3[-1]['type'] = 'test_inferred_iid'

    for item in t_3_test_inferred_facts_ds:
        probes_3.append(deepcopy(item))
        probes_3[-1]["type"] = "test_inferred_ood"

    # save the dataset Train, Test, Valid
    #train.json = all_atomics + train_inferred_facts_ds
    with open("data_MIO/{}/train.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(all_atomics + t_3_train_inferred_facts_ds, f)
    #valid.json (only test_inferred_ OOD  why??? for what is that good?) 
    with open("data_MIO/{}/valid.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(t_3_test_inferred_facts_ds, f)

    # my validation with the test_inferred_iid    
    with open("data_MIO/{}/valid_iid.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(t_3_test_inferred_iid, f)

    #test.json = probes_3 
    with open("data_MIO/{}/test.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(probes_3, f)
    # all vocab in the dataset (entities and relations) + special tokens
    with open("data_MIO/{}/vocab.json".format(dataset_name), "w", encoding='utf-8') as f:
        json.dump(vocab, f)