In [1]:
import re

# Entity marks ##

For flat entities:

I - Word is inside a phrase of type TYPE
B - If two phrases of the same type immediately follow each other, the first word of the second phrase will have tag B-TYPE 
O - Word is not part of a phrase
E - End ( E will not appear in a prefix-only partial match )
S - Single

For nested entities:

B - Begin: The first word of a multi-word entity.
I - Inside: Any non-initial word of a multi-word entity. (M - middle)
L - Last: The last word of a multi-word entity. (E - end)
U - Unit: A single-word entity. (S - single)
O - Outside: A word that is not part of any entity.

BILOU or IOBES

In [2]:
# BILOU tag letters: Begin, Inside, Last, Unit, Outside
bilou_tags = list("BILUO")
# Sentence start / separator marker tokens
START_SENTENCE_TOKEN, SEP_SENTENCE_TOKEN = '[CLS]', '[SEP]'

## Dependencies ##

https://wiki.gucorpling.org/gum/dependencies

List of dependency function labels used in GUM

    acl
    acl:relcl
    advcl
    advcl:relcl
    advmod
    amod
    appos
    aux
    aux:pass
    case
    cc
    cc:preconj
    ccomp
    compound
    compound:prt
    conj
    cop
    csubj
    csubj:pass
    dep
    det
    det:predet
    discourse
    dislocated
    expl
    fixed
    flat
    goeswith
    iobj
    list
    mark
    nmod
    nmod:npmod
    nmod:tmod
    nmod:poss
    nsubj
    nsubj:pass
    nummod
    obj
    obl
    obl:agent
    obl:npmod
    obl:tmod
    orphan
    parataxis
    punct
    reparandum
    root
    vocative
    xcomp

## Entities types
 
https://wiki.gucorpling.org/gum/entities

There are 10 entity types:

    person - any person, including fictitious figures, groups of people, and semi-human entities (Pinocchio)
    place - a country (Iceland), region (Sahara), or other place being referred to as a location (the factory - when used as a place, not to refer to the physical building)
    organization - a company, government, sports team and others
    object - a concrete tangible object
    event - includes reference to nouns ('War', 'the performance') and clauses that are referred back to ('that John came')
    time - dates, times of day, days, years...
    substance - water, mercury, gas, poison ... includes context-dependent substances, such as Skittles or baking chocolate
    animal - any animal, potentially including bacteria, aliens and others construed as animals
    plant - interpreted broadly to include fruits, seeds and other living plant parts, but not substances (e.g. 'wood' is not classified as a plant)
    abstract - abstract notions (luck), emotions (excitement) or intangible properties (predisposition)

## Entity salience

https://wiki.gucorpling.org/gum/entities

An entity is considered salient if and only if it appears in the summary of a document.
Annotate only the first mention of a salient entity as salient; there is no need to annotate subsequent mentions.

In the header: 

meta::salientEntities = 1, 2, 36, 41, 42, 46, 76, 99

## Coreference

https://wiki.gucorpling.org/gum/entities

In [3]:
import scene_desc as sd
import re

In [4]:
# Prefixes of the CoNLL-U comment lines that carry the sentence text and the sentence id
TEXT_MARKER = "# text = "
SENTENCE_ID_MARKER = "# sent_id = "
TEXT_MARKER_LEN = len(TEXT_MARKER)
# The entity annotation starts after "Entity=" and runs to the next "|" (or to the end of the field)
ENTITY_START_MARKER = "Entity="
ENTITY_STOP_MARKER = "|"

entity_marker_len = len(ENTITY_START_MARKER)

def extract_entity_info(text: str) -> tuple[str, list[tuple[str, str]]]:
    """Extract the entity annotation from a '|'-separated attribute field.

    Args:
        text: the raw attribute string, e.g.
            ``"Discourse=joint|Entity=(4-place-giv:act-cf3-1-coref-Russia)14)|SpaceAfter=No"``.

    Returns:
        A pair ``(entity_tag, entity_matches)``:

        * ``entity_tag`` - the text between ``Entity=`` and the next ``|`` (or the
          end of the string); ``'_'`` when the field has no ``Entity=`` attribute.
        * ``entity_matches`` - the ``re.findall`` result for the bracket pattern:
          one ``(opening, closing)`` tuple per bracket, exactly one element of
          which is non-empty. An opening looks like ``'(4-place'``; a closing is
          ``')'`` or ``'14)'``.
    """
    entity_start_mark_index = text.find(ENTITY_START_MARKER)
    if entity_start_mark_index == -1:
        return '_', []

    tag_start_index = entity_start_mark_index + entity_marker_len
    # Look for the stop marker only AFTER "Entity=": attributes that precede it
    # (e.g. "Discourse=...|Entity=...") contain their own "|" which must not be
    # mistaken for the end of the entity annotation.
    entity_stop_mark_index = text.find(ENTITY_STOP_MARKER, tag_start_index)
    if entity_stop_mark_index == -1:
        entity_tag = text[tag_start_index:]
    else:
        entity_tag = text[tag_start_index:entity_stop_mark_index]

    # group 1: "(<id>-<type>" followed by "-" or ":"; group 2: optional digits + ")"
    entity_matches = re.findall(r'(\(\d+-\w+)(?=-|\:)|(\d*?\))', entity_tag)

    return entity_tag, entity_matches

def append_to_open_entities(word_id: int,
                            open_entities: list[sd.Entity],
                            result_entities: list[sd.Entity] = None) -> list[sd.Entity]:
    """Add an 'I' (inside) entry for `word_id` for every entity that is still open.

    Args:
        word_id: id of the word currently being processed.
        open_entities: entities that have been opened and not yet closed.
        result_entities: list that receives the new entries (mutated in place).
            When omitted, a new list is created.

    Returns:
        The list the entries were appended to.
    """
    if result_entities is None:
        result_entities = []

    for entity in open_entities:
        # an entity that was opened on this very word already has its own entry
        if entity.word_id == word_id:
            continue

        result_entities.append(
            sd.Entity(word_id=word_id, coref_id=entity.coref_id,
                      bilou_tag='I', level=entity.level))
    return result_entities
    
        
def convert_entity_matches(word_id: int,
                           entity_matches: list[tuple[str, str]],
                           coref_entities: dict[int, str],
                           open_entities: list[sd.Entity]) -> list[sd.Entity]:
    """Turn the entity brackets found on one word into BILOU-tagged entities.

    Args:
        word_id: id of the word being processed.
        entity_matches: ``(opening, closing)`` tuples as produced by the bracket
            regex in `extract_entity_info`; exactly one element of each tuple is
            non-empty. An opening looks like ``'(4-place'``, a closing like
            ``')'`` or ``'14)'``.
        coref_entities: mapping coref id -> entity type; updated in place the
            first time a coref id is opened.
        open_entities: stack of entities opened on earlier words (or earlier on
            this word) and not closed yet; updated in place.

    Returns:
        The entities for this word: a single 'O' entity when there are no
        brackets and nothing is open, otherwise 'I' entries for entities that
        remain open plus 'B' / 'U' / 'L' entries for the brackets on this word.

    Raises:
        IndexError: if a closing bracket is found while no entity is open.
    """
    result_entities = []
    # if no openings or closings for entities
    if len(entity_matches) == 0:
        # if no already opened entities - return BILOU tag  - O - Outside
        if len(open_entities) == 0:
            return [sd.Entity(word_id=word_id,
                              coref_id=0, bilou_tag='O', level=0)]

        # the word lies inside every entity that is currently open
        append_to_open_entities(word_id=word_id,
                                open_entities=open_entities,
                                result_entities=result_entities)
        return result_entities

    # The 'I' entries for the surrounding open entities must be emitted only once
    # per word, even when the word opens several entities; otherwise every extra
    # opening would duplicate them.
    inside_entries_added = False

    # if there are some openings or closings for entities
    for entity_start, entity_end in entity_matches:
        # for an opening
        if entity_start != '':
            spl = entity_start[1:].split('-')
            (coref_id, entity_type) = int(spl[0]), spl[1]

            # if opening is newly mentioned, update coreference dictionary
            if coref_id not in coref_entities:
                coref_entities[coref_id] = entity_type

            # update already opened entities with nested info (once per word)
            if not inside_entries_added:
                append_to_open_entities(word_id=word_id,
                                        open_entities=open_entities,
                                        result_entities=result_entities)
                inside_entries_added = True

            # a new entity is nested one level below the innermost open entity
            level = 0
            if len(open_entities) > 0:
                level = open_entities[-1].level + 1

            # create a new entity with B - Begin BILOU tag as default; the same
            # object is stored in both lists, so retagging it later (to 'U')
            # is visible through either of them
            entity = sd.Entity(word_id=word_id,
                               coref_id=coref_id, bilou_tag='B', level=level)
            open_entities.append(entity)
            result_entities.append(entity)
        # for a closing
        else:
            if not open_entities:
                raise IndexError(
                    f"word {word_id}: closing bracket '{entity_end}' "
                    f"without an open entity")

            spl = entity_end[:-1]
            # if there is no coref id, the closing belongs to the entity that was
            # just opened on this word, so its tag is U - Unit (in BILOU terms)
            if spl == '':
                result_entities[-1].bilou_tag = 'U'

            # if there is a coref id, the entity was opened on an earlier word,
            # so L - Last BILOU tag should be used
            else:
                coref_id = int(spl)
                result_entities.append(
                    sd.Entity(word_id=word_id,
                              coref_id=coref_id, bilou_tag='L',
                              level=open_entities[-1].level))

            # remove the last entity from the open entities
            # NOTE(review): this pops the most recently opened entity without
            # checking that its coref id matches the closing bracket - confirm
            # that closings always arrive in strictly nested order.
            open_entities.pop()
    # NOTE(review): a word that carries only closing brackets gets no 'I' entry
    # for outer entities that stay open - confirm whether that is intended.
    return result_entities
            
        
    
def convert_from_conllu_file(file_path):
    """Read a CoNLL-U file and build an `sd.Text` with sentences, words and entities.

    Three kinds of lines are used; all others are ignored:

    * ``# text = ...``    - the sentence text, also appended to `text_full`;
    * ``# sent_id = ...`` - starts a new sentence; the id is the integer after
      the last '-' of the value (or the whole value when it has no '-');
    * lines starting with a digit - tab-separated token lines; columns 1, 2, 3,
      4, 7, 8 and 10 are read.

    The `sd.Sentence` object is created when the first token line of a sentence
    is reached, so it receives the sentence text regardless of whether the
    ``# text`` comment comes before or after ``# sent_id``. A ``# sent_id`` that
    is not followed by any token line therefore produces no sentence.

    Args:
        file_path: path of the CoNLL-U file (read as UTF-8).

    Returns:
        The populated `sd.Text`; `coref_entities` maps every opened coref id to
        its entity type.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    text = sd.Text(text_full="", sentences=[], coref_entities={})
    # entities opened but not yet closed; carried over from word to word
    open_entities = []
    sentence_text = ''
    sentence_id = None
    sentence = None

    for line in lines:
        # collect text
        if line.startswith(TEXT_MARKER):
            # rstrip instead of [:-1]: the last line of a file may lack a newline
            sentence_text = line[TEXT_MARKER_LEN:].rstrip('\n')
            text.text_full += sentence_text + " "

        # a new sentence id closes the sentence collected so far
        elif line.startswith(SENTENCE_ID_MARKER):
            if sentence is not None:
                text.sentences.append(sentence)
                sentence = None
            id_value = line[len(SENTENCE_ID_MARKER):].strip()
            sentence_id = int(id_value.rsplit('-', 1)[-1])

        # collect line info for words and punctuation
        # NOTE(review): any line whose first character is a digit is taken as a
        # word; in CoNLL-U that also matches multiword-token ranges ("1-2") and
        # empty nodes ("1.1") - confirm whether those should be skipped.
        elif line[:1].isdigit():
            # strip the newline first so the last column does not carry it
            parts = line.rstrip('\n').split("\t")
            # kept as the raw string from the first column
            word_id = parts[0]
            _, entity_matches = extract_entity_info(text=parts[9])
            # get BILOU entities for each word
            entities = convert_entity_matches(word_id=word_id,
                                              entity_matches=entity_matches,
                                              coref_entities=text.coref_entities,
                                              open_entities=open_entities)
            # collect word info
            word = sd.Word(word_id=word_id, word=parts[1], lemma_init=parts[2],
                           pos_tag=parts[3], dep_type=parts[7], dep_parent_id=parts[6],
                           entities=entities)

            # both header comments have been read by now, so the sentence can be
            # created with its own id and text
            if sentence is None:
                sentence = sd.Sentence(sentence_id=sentence_id,
                                       sentence_text=sentence_text,
                                       words=[])
            # append word to current sentence
            sentence.words.append(word)

    # the final sentence has no following "# sent_id" line to flush it
    if sentence is not None:
        text.sentences.append(sentence)

    text.text_full = text.text_full.strip()
    return text

In [5]:
def _marker_label_line(sentence_id, word_id, token, max_level):
    """Build the label line for a sentence marker token (no lemma, POS, dependency or entities)."""
    return sd.LabelLine(sentence_id=sentence_id,
                        word=sd.Word(word_id=word_id, word=token,
                                     lemma_init='', pos_tag='',
                                     dep_type='', dep_parent_id=-1,
                                     entities=None),
                        max_level=max_level)


def text_to_labels(text: sd.Text, max_level=3, cls_tag='[CLS]', sep_tag='[SEP]'):
    """Render every word of `text` as one label line and join the lines with newlines.

    For each sentence the output contains, in order: an optional start marker
    line, one line per word, and an optional separator marker line. Each line is
    ``str(sd.LabelLine(...))``.

    Args:
        text: the parsed text; `text.coref_entities` is used to look up the
            entity type of every non-'O' entity.
        max_level: passed through to `sd.LabelLine`.
        cls_tag: token emitted before the words of each sentence (word id 0);
            ``None`` or ``''`` disables it.
        sep_tag: token emitted after the words of each sentence (word id
            ``len(sentence.words) + 1``); ``None`` or ``''`` disables it.

    Returns:
        All label lines joined by ``'\\n'``.
    """
    lines = []

    for sentence in text.sentences:
        sentence_id = sentence.sentence_id

        # start-of-sentence marker: emit the token the caller asked for
        if cls_tag is not None and cls_tag != '':
            lines.append(str(_marker_label_line(sentence_id, 0, cls_tag, max_level)))

        # one line per word
        for word in sentence.words:
            line = sd.LabelLine(sentence_id=sentence_id, word=word, max_level=max_level)
            for entity in word.entities:
                if entity.bilou_tag == 'O':
                    line.entities.append((-1, 'O'))
                else:
                    # e.g. 'B' + '-' + 'person' -> 'B-person'
                    bilou_tag = entity.bilou_tag + '-' + text.coref_entities[entity.coref_id]
                    line.entities.append((entity.coref_id, bilou_tag))
            lines.append(str(line))

        # end-of-sentence separator marker
        if sep_tag is not None and sep_tag != '':
            lines.append(str(_marker_label_line(sentence_id, len(sentence.words) + 1,
                                                sep_tag, max_level)))

    return '\n'.join(lines)

In [6]:
# Try the bracket pattern on a tag that opens entity 4 and carries two closings
entity_tag = 'Entity=(4-place-giv:act-cf3-1-coref-Russia)14)'
re.compile(r'(\(\d+-\w+)(?=-|\:)|(\d*?\))').findall(entity_tag)

[('(4-place', ''), ('', ')'), ('', '14)')]

In [7]:
# Parse the sample file into an sd.Text (the path is relative to the notebook's working directory)
text = convert_from_conllu_file('../datasets/gum/dep/GUM_fiction_error.conllu')

In [8]:
# Render the parsed text as label lines with the default settings (max_level=3, [CLS]/[SEP] markers) and print them
a = text_to_labels(text)
print(a)

	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-100	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		1	U-person
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		2	B-person
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		2	L-person
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		2	B-person
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		2	I-person
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		2	L-person
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		3	B-place
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		3	L-place
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		-1	O
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		4	B-person
	-1	
	-1		-1	
	-1		-1		-1	
	-1		-1		-1		4	I-pe

In [12]:
# Dump every parsed word, sentence by sentence, to inspect the entities attached to it
# NOTE(review): this cell's execution count (12) is out of sequence with the neighbouring
# cells (8 and 9) - re-run the notebook top to bottom before relying on the output below
for s in text.sentences:
    for w in s.words:
        print(w)

word_id : 1, word: When, lemma_init: when, pos_tag: ADV, dep_type: advmod, dep_parent_id: 5, 
entities: [{'(level: 0, coref_id: 0, bilou_tag: O)'}])
word_id : 2, word: Tyler, lemma_init: Tyler, pos_tag: PROPN, dep_type: nsubj, dep_parent_id: 5, 
entities: [{'(level: 0, coref_id: 1, bilou_tag: U)'}])
word_id : 3, word: was, lemma_init: be, pos_tag: AUX, dep_type: cop, dep_parent_id: 5, 
entities: [{'(level: 0, coref_id: 0, bilou_tag: O)'}])
word_id : 4, word: very, lemma_init: very, pos_tag: ADV, dep_type: advmod, dep_parent_id: 5, 
entities: [{'(level: 0, coref_id: 0, bilou_tag: O)'}])
word_id : 5, word: young, lemma_init: young, pos_tag: ADJ, dep_type: advcl, dep_parent_id: 12, 
entities: [{'(level: 0, coref_id: 0, bilou_tag: O)'}])
word_id : 6, word: ,, lemma_init: ,, pos_tag: PUNCT, dep_type: punct, dep_parent_id: 5, 
entities: [{'(level: 0, coref_id: 0, bilou_tag: O)'}])
word_id : 7, word: his, lemma_init: his, pos_tag: PRON, dep_type: nmod:poss, dep_parent_id: 8, 
entities: [{'(le

In [9]:
# Mapping coref id -> entity type collected while parsing the file
text.coref_entities

{1: 'person',
 2: 'person',
 3: 'place',
 4: 'person',
 5: 'person',
 6: 'abstract',
 7: 'person',
 8: 'abstract',
 9: 'abstract',
 10: 'person',
 11: 'animal',
 12: 'person',
 13: 'abstract',
 14: 'abstract',
 15: 'abstract',
 16: 'abstract',
 17: 'time',
 18: 'place',
 19: 'event',
 20: 'object',
 21: 'object',
 22: 'abstract',
 23: 'abstract',
 24: 'person',
 25: 'abstract',
 26: 'abstract',
 27: 'time',
 28: 'abstract',
 29: 'abstract',
 30: 'abstract',
 31: 'abstract',
 32: 'abstract',
 33: 'abstract',
 34: 'time',
 35: 'event',
 36: 'abstract',
 37: 'event',
 38: 'abstract',
 39: 'person',
 40: 'abstract',
 41: 'object',
 42: 'place',
 43: 'object',
 44: 'abstract',
 45: 'object',
 46: 'place',
 47: 'abstract',
 48: 'object',
 49: 'abstract',
 50: 'time',
 51: 'abstract',
 52: 'abstract',
 53: 'abstract',
 54: 'abstract',
 55: 'abstract',
 56: 'abstract',
 57: 'time',
 58: 'person',
 59: 'place',
 60: 'place',
 61: 'abstract',
 62: 'time',
 63: 'abstract',
 64: 'abstract',
 65: '