In [3]:
import re

# Entity marks ##

For flat entities:

I - Word is inside a phrase of type TYPE
B - If two phrases of the same type immediately follow each other, the first word of the second phrase will have tag B-TYPE 
O - Word is not part of a phrase
E - End ( E will not appear in a prefix-only partial match )
S - Single

For nested entities:

B - Begin: The first word of a multi-word entity.
I - Inside: Any word of a multi-word entity that is neither its first nor its last word. (M - middle)
L - Last: The last word of a multi-word entity. (E - end)
U - Unit: A single-word entity. (S - single)
O - Outside: A word that is not part of any entity.

BILOU or IOBES

In [4]:
# BILOU tag letters, in the order used by the notes above.
bilou_tags = [
    "B",  # Begin: first word of a multi-word entity
    "I",  # Inside: word inside a multi-word entity
    "L",  # Last: last word of a multi-word entity
    "U",  # Unit: single-word entity
    "O",  # Outside: word that is not part of any entity
]
# Pseudo-tokens that mark the start and the end of a sentence.
START_SENTENCE_TOKEN, SEP_SENTENCE_TOKEN = '[CLS]', '[SEP]'

## Dependencies ##

https://wiki.gucorpling.org/gum/dependencies

List of dependency function labels used in GUM

    acl
    acl:relcl
    advcl
    advcl:relcl
    advmod
    amod
    appos
    aux
    aux:pass
    case
    cc
    cc:preconj
    ccomp
    compound
    compound:prt
    conj
    cop
    csubj
    csubj:pass
    dep
    det
    det:predet
    discourse
    dislocated
    expl
    fixed
    flat
    goeswith
    iobj
    list
    mark
    nmod
    nmod:npmod
    nmod:tmod
    nmod:poss
    nsubj
    nsubj:pass
    nummod
    obj
    obl
    obl:agent
    obl:npmod
    obl:tmod
    orphan
    parataxis
    punct
    reparandum
    root
    vocative
    xcomp

## Entities types
 
https://wiki.gucorpling.org/gum/entities

There are 10 entity types:

    person - any person, including fictitious figures, groups of people, and semi-human entities (Pinocchio)
    place - a country (Iceland), region (Sahara), or other place being referred to as a location (the factory - when used as a place, not to refer to the physical building)
    organization - a company, government, sports team and others
    object - a concrete tangible object
    event - includes reference to nouns ('War', 'the performance') and clauses that are referred back to ('that John came')
    time - dates, times of day, days, years...
    substance - water, mercury, gas, poison ... includes context-dependent substances, such as Skittles or baking chocolate
    animal - any animal, potentially including bacteria, aliens and others construed as animals
    plant - interpreted broadly to include fruits, seeds and other living plant parts, but not substances (e.g. 'wood' is not classified as a plant)
    abstract - abstract notions (luck), emotions (excitement) or intangible properties (predisposition)

## Entity salience

https://wiki.gucorpling.org/gum/entities

An entity is considered salient if and only if it appears in the summary of a document.
Annotate only the first mention of a salient entity as salient; there is no need to annotate subsequent mentions.

In the header: 

meta::salientEntities = 1, 2, 36, 41, 42, 46, 76, 99

## Coreference

https://wiki.gucorpling.org/gum/entities

In [5]:
import scene_desc as sd
import conllu_converter
import re

In [12]:
# Prefixes of the CoNLL-U comment lines that carry the sentence text and id.
TEXT_MARKER = "# text = "
SENTENCE_ID_MARKER = "# sent_id = "
TEXT_MARKER_LEN = len(TEXT_MARKER)
# Inside the last column of a token line the entity annotation starts with
# "Entity=" and, when more attributes follow, ends at the next "|".
ENTITY_START_MARKER = "Entity="
ENTITY_STOP_MARKER = "|"

entity_marker_len = len(ENTITY_START_MARKER)
# Deepest nesting level (0-based) that is still converted; deeper openings are skipped.
max_level = 5

def extract_entity_info(text: str) -> tuple[str, list[tuple[str, str]]]:
    """Pull the ``Entity=`` annotation out of a token's last CoNLL-U column.

    Args:
        text: Raw content of the column (e.g. ``parts[9]`` of a token line),
            possibly holding several ``|``-separated attributes and a
            trailing newline.

    Returns:
        A pair ``(entity_tag, entity_matches)``:

        * ``entity_tag`` - the text between ``Entity=`` and the next ``|``
          (or the end of the string), without surrounding whitespace;
          ``'_'`` when the column has no ``Entity=`` attribute.
        * ``entity_matches`` - ``re.findall`` result, one 2-tuple per opening
          or closing bracket, in order of appearance. An opening fills the
          first slot (``'(<digits>-<type>'``) and leaves the second empty; a
          closing fills the second slot (``')'`` or ``'<digits>)'``).
    """
    entity_start_mark_index = text.find(ENTITY_START_MARKER)
    if entity_start_mark_index == -1:
        return '_', []

    value_start = entity_start_mark_index + entity_marker_len
    # Look for the separator only AFTER "Entity=": attributes may precede it
    # (e.g. "Discourse=...|Entity=...|SpaceAfter=No"), and a "|" found before
    # the marker must not decide where the entity value ends.
    entity_stop_mark_index = text.find(ENTITY_STOP_MARKER, value_start)
    if entity_stop_mark_index == -1:
        raw_tag = text[value_start:]
    else:
        raw_tag = text[value_start:entity_stop_mark_index]
    # The column is the last one on the line, so it usually ends with "\n".
    entity_tag = raw_tag.strip()

    entity_matches = re.findall(r'(\(\d+-\w+)(?=-|\:)|(\d*?\))', entity_tag)

    return entity_tag, entity_matches


def append_still_open_entities_to_result(word_id: int, open_entities: list[sd.Entity],
                            result_entities: list[sd.Entity] = None):
    """Add an ``I`` (Inside) entry for every entity that is open across this word.

    Args:
        word_id: Id of the word being labelled. It is compared with ``==``
            against ``Entity.word_id``, so it must have the same type as the
            ids stored in ``open_entities``.
        open_entities: Entities that are currently open. Not modified.
        result_entities: Labels already collected for this word; updated in
            place. When omitted, a fresh list is used.

    Returns:
        The (possibly newly created) ``result_entities`` list.
    """
    if result_entities is None:
        result_entities = []

    for open_entity in open_entities:
        # An entity whose word_id is the current word is skipped: it gets no
        # additional "I" entry for the word it was created on.
        if open_entity.word_id == word_id:
            continue
        inside_entity = sd.Entity(word_id=word_id, coref_id=open_entity.coref_id,
                                  bilou_tag='I', level=open_entity.level)
        # Insert at the index equal to the nesting level so outer entities
        # come before inner ones in the result list.
        result_entities.insert(inside_entity.level, inside_entity)

    return result_entities

    
def convert_entity_matches(word_id: int,
                           entity_matches: list[tuple[str, str]],
                           coref_entities: dict[int, str],
                           open_entities: list[sd.Entity]) -> list[sd.Entity]:
    """Turn the bracket matches of one word into its nested BILOU entities.

    Args:
        word_id: Id of the current word; stored on every created entity and
            compared against the ids of the open entities.
        entity_matches: ``(opening, closing)`` pairs as produced by
            ``extract_entity_info``; exactly one slot of each pair is non-empty.
        coref_entities: Mapping ``coref_id -> entity type``. Updated in place
            the first time a coref id is opened.
        open_entities: Entities opened on earlier words and not closed yet.
            Updated in place: openings are appended, closings are removed.

    Returns:
        The entities of this word. A word that carries no label at all is
        returned as a single ``O`` (Outside) entity on level 0.
    """
    def outside() -> list:
        return [sd.Entity(word_id=word_id, coref_id=0, bilou_tag='O', level=0)]

    # No openings, no closings and nothing open: the word is outside any entity.
    if len(entity_matches) == 0 and len(open_entities) == 0:
        return outside()

    result_entities = []
    # Entity created by the immediately preceding match, if that match was an
    # opening that was actually kept. A bare ")" closes exactly that entity.
    just_opened = None

    for match in entity_matches:
        entity_start, entity_end = match[0], match[1]

        # --- opening: "(<coref_id>-<type>" ---
        if entity_start != '':
            spl = entity_start[1:].split('-')
            (coref_id, entity_type) = int(spl[0]), spl[1]
            # The new entity nests one level below the innermost open entity.
            level = 0 if len(open_entities) == 0 else open_entities[-1].level + 1
            just_opened = None
            # Openings deeper than max_level are ignored.
            if level <= max_level:
                # First mention of this coref id: remember its entity type.
                if coref_id not in coref_entities:
                    coref_entities[coref_id] = entity_type
                # B (Begin) is the default; it becomes U if a bare ")" follows.
                just_opened = sd.Entity(word_id=word_id, coref_id=coref_id,
                                        bilou_tag='B', level=level)
                open_entities.append(just_opened)
                result_entities.append(just_opened)
            continue

        # --- closing: ")" or "<coref_id>)" ---
        opened_by_previous_match, just_opened = just_opened, None
        if len(open_entities) == 0:
            continue

        closing_id = entity_end[:-1]
        if closing_id == '':
            # A bare ")" ends the single-word entity opened right before it, so
            # that entity is tagged U (Unit). If that opening was ignored
            # (too deep), there is nothing to retag or pop: touching
            # result_entities[-1] / open_entities[-1] here would corrupt an
            # unrelated entity.
            if opened_by_previous_match is not None:
                opened_by_previous_match.bilou_tag = 'U'
                open_entities.pop()
            continue

        # "<coref_id>)" ends a multi-word entity: tag this word L (Last).
        coref_id = int(closing_id)
        open_entity_index = next(
            (i for i, o in enumerate(open_entities) if o.coref_id == coref_id),
            None)
        if open_entity_index is not None:
            closed = open_entities.pop(open_entity_index)
            entity = sd.Entity(word_id=word_id, coref_id=coref_id, bilou_tag='L',
                               level=closed.level)
            result_entities.insert(closed.level, entity)

    # Words inside entities that stay open get an I (Inside) entry per entity.
    append_still_open_entities_to_result(word_id=word_id, open_entities=open_entities,
                                        result_entities=result_entities)

    # Malformed input (e.g. a closing with nothing open) can leave the word
    # without any label; report it as Outside rather than as an empty list.
    if len(result_entities) == 0:
        return outside()
    return result_entities
            
        
    
def convert_from_conllu_file(file_path, verbose=True):
    """Read a CoNLL-U file and build an ``sd.Text`` with BILOU-tagged words.

    Comment lines starting with ``# sent_id = `` start a new sentence and
    lines starting with ``# text = `` supply its text; the two may come in
    either order. Every line whose first character is a digit is treated as a
    tab-separated token line (columns used: 0 id, 1 form, 2 lemma, 3 POS,
    6 head, 7 dependency relation, 9 entity annotation).

    Args:
        file_path: Path of the CoNLL-U file (read as UTF-8).
        verbose: When true (the default), print one debug block per token
            with its entity tag, its entities and the entities still open.

    Returns:
        The populated ``sd.Text``: ``text_full`` is the space-joined sentence
        texts, ``sentences`` holds one ``sd.Sentence`` per sentence that has
        at least one token line, and ``coref_entities`` maps each coref id to
        its entity type.

    Raises:
        ValueError: If a token line appears before any ``# sent_id`` line.
    """
    text = sd.Text(text_full="", sentences=[], coref_entities={})
    open_entities = []
    sentence_text = ''
    sentence_id = None
    sentence = None

    with open(file_path, 'r', encoding='utf-8') as file:
        # Stream the file instead of loading every line up front.
        for line in file:
            # collect text
            if line.startswith(TEXT_MARKER):
                # rstrip instead of [:-1]: the last line may lack a newline.
                sentence_text = line[TEXT_MARKER_LEN:].rstrip('\n')
                text.text_full += sentence_text + " "
                continue

            # a new sentence id closes the sentence collected so far
            if line.startswith(SENTENCE_ID_MARKER):
                if sentence is not None:
                    text.sentences.append(sentence)
                    sentence = None
                sentence_id = int(line[(line.rfind('-') + 1):])
                continue

            # everything that is not a token line is skipped
            if not line[0].isdigit():
                continue

            # The Sentence is created on its first token, i.e. after ALL of its
            # comment lines were read. Creating it on the "# sent_id" line
            # would attach the previous sentence's text whenever "# sent_id"
            # precedes "# text".
            if sentence is None:
                if sentence_id is None:
                    raise ValueError(
                        f"token line found before any '{SENTENCE_ID_MARKER.strip()}' "
                        f"line in {file_path}")
                sentence = sd.Sentence(sentence_id=sentence_id,
                                       sentence_text=sentence_text, words=[])
                # Do not leak this text into a later sentence without "# text".
                sentence_text = ''

            parts = line.split("\t")
            word_id = parts[0]
            entity_tag, entity_matches = extract_entity_info(text=parts[9])
            # get BILOU entities for the word
            entities = convert_entity_matches(word_id=word_id,
                                              entity_matches=entity_matches,
                                              coref_entities=text.coref_entities,
                                              open_entities=open_entities)
            # collect word info
            word = sd.Word(word_id=word_id, word=parts[1], lemma_init=parts[2],
                           pos_tag=parts[3], dep_type=parts[7], dep_parent_id=parts[6],
                           entities=entities)

            if verbose:
                print(f'{sentence_id}_{word_id}\t{parts[1]}\t{entity_tag}\n'
                      f'\t\t[{" | ".join("(" + str(e) + ")" for e in entities)}]\n'
                      f'\t\t[{" | ".join("(" + str(e) + ")" for e in open_entities)}]'
                      )

            # append word to current sentence
            sentence.words.append(word)

    # the last sentence is not followed by another "# sent_id" line
    if sentence is not None:
        text.sentences.append(sentence)
    text.text_full = text.text_full.strip()
    return text

In [7]:
def text_to_labels(text: sd.Text, cls_tag='[CLS]', sep_tag='[SEP]'):
    """Render a converted text as label lines, one line per token.

    Each line is ``str(sd.LabelLine(...))``. For every sentence the output
    holds an optional start row, one row per word and an optional end row.

    Args:
        text: Converted text; ``text.coref_entities`` must contain the entity
            type of every coref id used by the words' entities.
        cls_tag: Token written (as word and lemma) on the row that precedes
            each sentence, with word id 0. ``None`` or ``''`` omits the row.
        sep_tag: Token written on the row that follows each sentence, with
            word id ``len(sentence.words) + 1``. ``None`` or ``''`` omits it.

    Returns:
        All label lines joined with ``'\\n'``.
    """
    def special_token_line(sentence_id, word_id, token) -> str:
        # Row for a pseudo-token: no POS/dependency info and no entities.
        special_word = sd.Word(word_id=word_id, word=token, lemma_init=token,
                               pos_tag='_', dep_type='_', dep_parent_id=-1,
                               entities=None)
        return str(sd.LabelLine(sentence_id=sentence_id, word=special_word,
                                max_level=max_level))

    lines = []

    for sentence in text.sentences:
        sentence_id = sentence.sentence_id

        # sentence start row; the configured tag is emitted, not a fixed '[CLS]'
        if cls_tag is not None and cls_tag != '':
            lines.append(special_token_line(sentence_id, 0, cls_tag))

        # one row per word
        for word in sentence.words:
            line = sd.LabelLine(sentence_id=sentence_id, word=word, max_level=max_level)
            for entity in word.entities:
                if entity.bilou_tag == 'O':
                    # words outside any entity carry coref id -1
                    line.entities.append((-1, 'O'))
                else:
                    # e.g. 'B' + '-' + 'person' -> 'B-person'
                    bilou_tag = entity.bilou_tag + '-' + text.coref_entities[entity.coref_id]
                    line.entities.append((entity.coref_id, bilou_tag))
            lines.append(str(line))

        # sentence end row; the configured tag is emitted, not a fixed '[SEP]'
        if sep_tag is not None and sep_tag != '':
            lines.append(special_token_line(sentence_id, len(sentence.words) + 1, sep_tag))

    return '\n'.join(lines)

In [8]:
# Sanity check of the bracket pattern on a tag that opens a single-word
# entity and closes an outer one on the same token.
entity_tag = 'Entity=(6-abstract-giv:act-cf2-1-ana)4)'
re.compile(r'(\(\d+-\w+)(?=-|\:)|(\d*?\))').findall(entity_tag)

[('(6-abstract', ''), ('', ')'), ('', '4)')]

In [13]:
# Convert the sample GUM file; the call also prints a debug block per token
# (entity tag, entities of the word, entities still open).
text = convert_from_conllu_file('../datasets/gum/dep/GUM_fiction_error.conllu')

1_1	When	_
		[(level: 0, coref_id: 0, bilou_tag: O)]
		[]
1_2	Tyler	(1-person-new-cf3-1-coref)

		[(level: 0, coref_id: 1, bilou_tag: U)]
		[]
1_3	was	_
		[(level: 0, coref_id: 0, bilou_tag: O)]
		[]
1_4	very	_
		[(level: 0, coref_id: 0, bilou_tag: O)]
		[]
1_5	young	_
		[(level: 0, coref_id: 0, bilou_tag: O)]
		[]
1_6	,	_
		[(level: 0, coref_id: 0, bilou_tag: O)]
		[]
1_7	his	(2-person-new-cf1-2-coref(1-person-giv:act-cf3-1-ana)

		[(level: 0, coref_id: 2, bilou_tag: B) | (level: 1, coref_id: 1, bilou_tag: U)]
		[(level: 0, coref_id: 2, bilou_tag: B)]
1_8	grandmother	2)
		[(level: 0, coref_id: 2, bilou_tag: L)]
		[]
1_9	was	_
		[(level: 0, coref_id: 0, bilou_tag: O)]
		[]
1_10	his	(2-person-giv:act-cf1-3-coref(1-person-giv:act-cf3-1-ana)

		[(level: 0, coref_id: 2, bilou_tag: B) | (level: 1, coref_id: 1, bilou_tag: U)]
		[(level: 0, coref_id: 2, bilou_tag: B)]
1_11	favorite	_
		[(level: 0, coref_id: 2, bilou_tag: I)]
		[(level: 0, coref_id: 2, bilou_tag: B)]
1_12	person	2)

		[(level:

In [14]:
# Render the converted text as label lines (default [CLS]/[SEP] rows included) and show them.
a = text_to_labels(text)
print(a)

1	0	[CLS]	[CLS]	_	_	-1	-100	O	-100	O	-100	O	-100	O	-100	O	-100	O
1	1	When	when	ADV	advmod	5	-1	O	-100	O	-100	O	-100	O	-100	O	-100	O
1	2	Tyler	Tyler	PROPN	nsubj	5	1	U-person	-100	O	-100	O	-100	O	-100	O	-100	O
1	3	was	be	AUX	cop	5	-1	O	-100	O	-100	O	-100	O	-100	O	-100	O
1	4	very	very	ADV	advmod	5	-1	O	-100	O	-100	O	-100	O	-100	O	-100	O
1	5	young	young	ADJ	advcl	12	-1	O	-100	O	-100	O	-100	O	-100	O	-100	O
1	6	,	,	PUNCT	punct	5	-1	O	-100	O	-100	O	-100	O	-100	O	-100	O
1	7	his	his	PRON	nmod:poss	8	2	B-person	1	U-person	-100	O	-100	O	-100	O	-100	O
1	8	grandmother	grandmother	NOUN	nsubj	12	2	L-person	-100	O	-100	O	-100	O	-100	O	-100	O
1	9	was	be	AUX	cop	12	-1	O	-100	O	-100	O	-100	O	-100	O	-100	O
1	10	his	his	PRON	nmod:poss	12	2	B-person	1	U-person	-100	O	-100	O	-100	O	-100	O
1	11	favorite	favorite	ADJ	amod	12	2	I-person	-100	O	-100	O	-100	O	-100	O	-100	O
1	12	person	person	NOUN	root	0	2	L-person	-100	O	-100	O	-100	O	-100	O	-100	O
1	13	in	in	ADP	case	15	-1	O	-100	O	-100	O	-100	O	-100	O	-100	O
1	

In [10]:
# coref id -> entity type, as collected while the file was converted.
text.coref_entities

{1: 'person',
 2: 'person',
 3: 'place',
 4: 'person',
 5: 'person',
 6: 'abstract',
 7: 'person',
 8: 'abstract',
 9: 'abstract',
 10: 'person',
 11: 'animal',
 12: 'person',
 13: 'abstract',
 14: 'abstract',
 15: 'abstract',
 16: 'abstract',
 17: 'time',
 18: 'place',
 19: 'event',
 20: 'object',
 21: 'object',
 22: 'abstract',
 23: 'abstract',
 24: 'person',
 25: 'abstract',
 26: 'abstract',
 27: 'time',
 28: 'abstract',
 29: 'abstract',
 30: 'abstract',
 31: 'abstract',
 32: 'abstract',
 33: 'abstract',
 34: 'time',
 35: 'event',
 36: 'abstract',
 37: 'event',
 38: 'abstract',
 39: 'person',
 40: 'abstract',
 41: 'object',
 42: 'place',
 43: 'object',
 44: 'abstract',
 45: 'object',
 46: 'place',
 47: 'abstract',
 48: 'object',
 49: 'abstract',
 50: 'time',
 51: 'abstract',
 52: 'abstract',
 53: 'abstract',
 54: 'abstract',
 55: 'abstract',
 56: 'abstract',
 57: 'time',
 58: 'person',
 59: 'place',
 60: 'place',
 61: 'abstract',
 62: 'time',
 63: 'abstract',
 64: 'abstract',
 65: '

In [1]:
import scene_desc as sd
import conllu_converter
import re

# Run the conversion through the conllu_converter module with max_level=3.
# NOTE(review): presumably writes the text, the label lines and the coref
# dictionary to the three target paths - confirm against conllu_converter.
converter = conllu_converter.ConlluFilesConverter(max_level=3) 
converter.convert_and_save_file(source_file_path='../datasets/gum/dep/GUM_fiction_error.conllu',
                                target_text_file_path='../datasets/gum_parsed/texts/1.txt',
                                target_labels_file_path='../datasets/gum_parsed/labels/1.txt',
                                coref_dict_file_path='../datasets/gum_parsed/coref_dict/1.txt')