In [2]:
import json


# To keep the labels simple during analysis (both before and after inputting the data to the model),
# the dataset's relation names are mapped to short labels here.
# This is the same mapping that was also used for the REBEL model.
# NOTE(review): 'denonym' is presumably meant to be 'demonym'; it is kept unchanged because the value
# ends up in the generated label files - confirm against the REBEL mapping before changing it.
relations_mapper = {
    '/people/person/nationality': 'country of citizenship',
    '/sports/sports_team/location': 'headquarters location',
    '/location/country/administrative_divisions': 'contains administrative territorial entity',
    '/business/company/major_shareholders': 'shareholders',
    '/people/ethnicity/people': 'country of origin',
    '/people/ethnicity/geographic_distribution': 'denonym',
    '/business/company_shareholder/major_shareholder_of': 'major shareholder',
    '/location/location/contains': 'location',
    '/business/company/founders': 'founded by',
    '/business/person/company': 'employer',
    '/business/company/advisors': 'advisors',
    '/people/deceased_person/place_of_death': 'place of death',
    '/business/company/industry': 'industry',
    '/people/person/ethnicity': 'ethnicity',
    '/people/person/place_of_birth': 'place of birth',
    '/location/administrative_division/country': 'country',
    '/people/person/place_lived': 'residence',
    '/sports/sports_team_location/teams': 'member of sports team',
    '/people/person/children': 'child',
    '/people/person/religion': 'religion',
    '/location/neighborhood/neighborhood_of': 'neighborhood of',
    '/location/country/capital': 'capital',
    '/business/company/place_founded': 'location of formation',
    '/people/person/profession': 'occupation',
}

The script that performs the full conversion to the sequence labeling task (in MaChAmp format):

In [3]:
def get_concatenations(the_list_of_lists,concatenation_type,seperator=None):
    """Merge the per-relation label lists of one instance into a single label list.

    Every inner list holds one label per token. The lists are combined token by token, e.g.
    [o,o,likes_person:-1] and [friends_with:2,o,o] become [friends_with:2,o,likes_person:-1], and
    [o,o,like_person:-1] and [o,o,lives_in:-2] become [o,o,like_person:-1|lives_in:-2].

    Args:
        the_list_of_lists: list of label lists, one per relation of the instance.
            The number of tokens is taken from the first list.
        concatenation_type: 'relations' or 'entities'.
            'relations': all non-'o' labels of a token are joined with `seperator`
            (in the order of the input lists); a token with only 'o' labels gets a single 'o'.
            'entities': the first non-'o' label of a token is used, 'o' if there is none.
        seperator: string placed between relation labels that share a token. Only needed for
            'relations', and only when some token carries more than one non-'o' label.

    Returns:
        A list with one merged label per token; an empty list when `the_list_of_lists` is empty.

    Raises:
        ValueError: if `concatenation_type` is neither 'relations' nor 'entities'.
        TypeError: if several relation labels must be joined but `seperator` is None.
    """
    if concatenation_type not in ('relations','entities'):
        raise ValueError("concatenation_type must be 'relations' or 'entities', got {!r}".format(concatenation_type))

    # Nothing to merge (an instance without any relation)
    if not the_list_of_lists:
        return []

    # To make the concatenation easier, one inner list per token is made, holding every label
    # that token received across the input lists.
    labels_per_token = [[] for _ in range(len(the_list_of_lists[0]))]
    for list_inst in the_list_of_lists:
        for i,el in enumerate(list_inst):
            labels_per_token[i].append(el)

    output_stuff = []
    for token_labels in labels_per_token:
        # 'o' only matters when it is the sole kind of label on the token
        real_labels = [el for el in token_labels if el != 'o']

        if concatenation_type=='relations':
            if len(real_labels)>1:
                if seperator is None:
                    raise TypeError("a seperator is required to join multiple relation labels on one token")
                merged = seperator.join(real_labels)
            else:
                merged = real_labels[0] if real_labels else ''
            output_stuff.append(merged if merged else 'o')
        else:
            # Nested entities are filtered out before this point, so normally there is one distinct
            # label per token. Taking the first non-'o' label also guarantees that an entity label
            # is never replaced by 'o' if several distinct labels do show up.
            output_stuff.append(real_labels[0] if real_labels else 'o')

    return output_stuff


#since I did not do the solution with overlapping entities, there should not be problems with the len-set-2 part (unless they are part of different relations)


def has_nested_entities(relations_list,len_token_list):
    """Check whether an instance contains nested (overlapping) entities.

    Each relation contributes a head entity (rel_inst[0]:rel_inst[1], type rel_inst[2]) and a tail
    entity (rel_inst[4]:rel_inst[5], type rel_inst[6]); the end index is exclusive. An entity may
    occur in several relations as long as it always has the same start, end and type. Any other
    overlap between two entities counts as nested. This includes an entity that starts at the same
    token as an earlier entity of the same type but is shorter ("B-B" nesting), and two entities
    that cover the same tokens with different types.

    Args:
        relations_list: the relations of one instance, indexed as described above.
        len_token_list: number of tokens of the instance.

    Returns:
        True if a nested entity is found, otherwise False.

    Raises:
        ValueError: if an entity span is empty (end <= start).
    """
    # One flag per token: True once the token belongs to a registered entity
    occupied = [False for _ in range(len_token_list)]
    # Exact (start, end, type) triples of the entities registered so far, so the same entity can be
    # recognised when it is reused across relations
    seen_entities = set()

    for rel_inst in relations_list:
        # Head is handled before tail
        for start,end,ent_type in ((rel_inst[0],rel_inst[1],rel_inst[2]),(rel_inst[4],rel_inst[5],rel_inst[6])):
            if end<=start:
                raise ValueError("empty entity span {}:{} in relation {!r}".format(start,end,rel_inst))

            entity = (start,end,ent_type)
            if entity in seen_entities:
                continue

            window = occupied[start:end]
            # A span lying completely outside the token list cannot be labelled; report it so the
            # instance is discarded.
            if not window:
                return True
            # Some token of this span already belongs to a different entity
            if any(window):
                return True

            # Stored so we can check for nested entities across relations
            occupied[start:end] = [True for _ in range(end-start)]
            seen_entities.add(entity)

    return False




def preprocessor(data_to_get,seperator,savepath,data_dir='nyt'):
    """Convert one NYT split to the sequence labeling task and write it in MaChamp format.

    Reads '<data_dir>/<data_to_get>.json', a list of instances with a 'tokens' list and an
    'spo_details' list. Every entry of 'spo_details' is indexed as
    [head_start, head_end, head_type, relation, tail_start, tail_end, tail_type] with exclusive end
    indices. Instances for which `has_nested_entities` returns True are discarded and counted; the
    count is printed. For every other instance one line per token is written to
    savepath+data_to_get as 'token<TAB>relation_label<TAB>entity_label', followed by an empty line.

    Entity labels are 'B-<type>' / 'I-<type>' / 'o'. Relation labels are placed on the tokens of the
    tail entity as 'B-'/'I-' + mapped relation name + ':' + relational index, where the relational
    index is the ordinal of the head entity minus the ordinal of the tail entity (entities are
    numbered in order of their first token). The arrows therefore go from tail to head.

    Args:
        data_to_get: name of the split ('train', 'test', 'dev'); used for the input file name and as
            the suffix of the output file name.
        seperator: string used to join several relation labels that fall on the same token.
        savepath: prefix of the output file; the split name is appended to it.
        data_dir: directory that holds the '<split>.json' input files.

    Raises:
        KeyError: if a relation name is missing from `relations_mapper`.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding
    with open('{}/{}.json'.format(data_dir,data_to_get),encoding='utf-8') as f:
        nyt_data=json.load(f)

    all_relation_labels = []
    all_entity_labels = []
    all_tokens = []

    nested_entities_counter = 0    #Analysis was done to find the number of nested entities

    for inst in nyt_data:
        tokens = inst['tokens']
        spo_details = inst['spo_details']
        n_tokens = len(tokens)

        #checking for nested entities, and discarding the instance if it has them
        if has_nested_entities(spo_details,n_tokens):
            nested_entities_counter+=1
            continue

        #An instance without relations has nothing to concatenate: every token is simply 'o'
        if not spo_details:
            all_relation_labels.append(['o' for _ in range(n_tokens)])
            all_entity_labels.append(['o' for _ in range(n_tokens)])
            all_tokens.append(tokens)
            continue

        #A list is made for each relation, containing the entities in it
        entity_labels = []
        for el in spo_details:
            ent = ['o' for _ in range(n_tokens)]

            ent[el[0]]='B-'+el[2]
            ent[el[4]]='B-'+el[6]

            for i in range(el[4]+1,el[5]):
                ent[i]='I-'+el[6]

            for i in range(el[0]+1,el[1]):
                ent[i]='I-'+el[2]

            entity_labels.append(ent)

        #the entities for all relations in the instance are concatenated
        concatted_entity_list = get_concatenations(entity_labels,'entities')

        #In order to get the relational index, a dictionary from the start token of each entity
        #to the ordinal of that entity in the sentence is made
        entities_looker_upper = dict()
        for idx,el in enumerate(concatted_entity_list):
            if el[0]=='B':
                entities_looker_upper[idx]=len(entities_looker_upper)

        relation_labels = []
        for el in spo_details:
            rel = ['o' for _ in range(n_tokens)]

            #The relational index is the ordinal of the head entity minus the ordinal of the tail entity.
            #Remember that the arrows go in the opposite direction (from tail to head) due to fewer clashes
            relation_label = relations_mapper[el[3]]+':'+str(entities_looker_upper[el[0]]-entities_looker_upper[el[4]])

            rel[el[4]]='B-'+relation_label
            for i in range(el[4]+1,el[5]):
                rel[i]='I-'+relation_label

            relation_labels.append(rel)

        all_relation_labels.append(get_concatenations(relation_labels,'relations',seperator))
        all_entity_labels.append(concatted_entity_list)
        all_tokens.append(tokens)

    print(data_to_get,': ',nested_entities_counter)

    #Here the contents of the lists are outputted in the MaChamp format, line by line
    with open(savepath+"{}".format(data_to_get),'w',encoding='utf-8') as outfile:
        for inst_tok, inst_rel,inst_ent in zip (all_tokens,all_relation_labels,all_entity_labels):
            for tok,rel,ent in zip(inst_tok,inst_rel,inst_ent):
                outfile.write('{}\t{}\t{}\n'.format(tok,rel,ent))
            outfile.write('\n') #an empty line to indicate a new instance


# Convert all three splits; relation labels that share a token are joined with '|'
for data in ('train','test','dev'):
    preprocessor(data,'|',"seq_lab_data/new_pipe_sep/nyt.")


train :  564
test :  55
dev :  55


The script below checks whether any B-B entities exist (two entities that start at the same token, where either one has a different type than the other, or one entity extends the other, or both) in any of the three datasets. As this did not return anything for any of the three datasets, some extra checks in the 'has_nested_entities' method could be omitted.

In [6]:
#checking if B-B cases exist
#Prints True once for every entity that starts at the same token as an earlier entity of the same
#instance but differs from it in type, in extent, or in both.

data_to_get='test'

with open('nyt/{}.json'.format(data_to_get),encoding='utf-8') as f:
    nyt_data=json.load(f)

for inst in nyt_data:
    types_set = set()             #'<start>-<type>' of every entity seen so far in this instance
    start_idxs_set = set()        #start token index (int) of every entity seen so far
    entity_position_set = set()   #(start, end) of every entity seen so far
    for rel_inst in inst['spo_details']:
        #The head entity is checked (and registered) first, then the tail entity
        for start,end,ent_type in ((rel_inst[0],rel_inst[1],rel_inst[2]),(rel_inst[4],rel_inst[5],rel_inst[6])):
            type_key = str(start)+'-'+ent_type
            #An earlier entity starts at this token, and this one has either a type not seen at that
            #start (different types) or a span not seen before (one entity extends the other).
            #The start index is compared as an int, the same way it is stored below.
            if (start in start_idxs_set) and ((type_key not in types_set) or ((start,end) not in entity_position_set)):
                print(True)
            start_idxs_set.add(start)
            types_set.add(type_key)
            entity_position_set.add((start,end))

A test of whether using blanks ('_') instead of labels (thus avoiding the nested entity problem) in the datasets the model is evaluated on gives better performance.

In [5]:
### Alternative way to do the dev and test sets (without labels)

def preprocessor(data_to_get,savepath,data_dir='nyt'):
    """Write one NYT split in MaChamp format with blank ('_') labels.

    Reads '<data_dir>/<data_to_get>.json' and, for every instance, writes one line per token to
    savepath+data_to_get as 'token<TAB>_<TAB>_', followed by an empty line. No instance is
    discarded, since no entity or relation labels are generated.

    Args:
        data_to_get: name of the split; used for the input file name and as the suffix of the
            output file name.
        savepath: prefix of the output file; the split name is appended to it.
        data_dir: directory that holds the '<split>.json' input files.
    """
    # NOTE(review): this definition reuses the name of the three-argument `preprocessor`
    # (data_to_get, seperator, savepath) defined in an earlier cell and replaces it once this cell
    # has run; the earlier calls cannot be re-run afterwards without re-running that cell first.

    # JSON is UTF-8 by specification; do not depend on the locale's default encoding
    with open('{}/{}.json'.format(data_dir,data_to_get),encoding='utf-8') as f:
        nyt_data=json.load(f)

    with open(savepath+"{}".format(data_to_get),'w',encoding='utf-8') as outfile:
        for inst in nyt_data:
            for tok in inst['tokens']:
                outfile.write('{}\t{}\t{}\n'.format(tok,'_','_'))
            outfile.write('\n') #an empty line to indicate a new instance


# Only the evaluation splits get the unlabeled variant
for data in ('test','dev'):
    preprocessor(data,"seq_lab_data/new_pipe_sep_unlabeled/nyt.")