In [1]:
from path import Path
def read_txt(file_path: Path) -> str:
    with open(file_path) as file:
        return file.read() 

In [2]:
def form_list_of_T_entity(text: str) -> list[list[str]]:
    splited_text = text.split('\n')
    filtered = []
    for line in splited_text:
        if len(line) > 1 and line[0] == 'T' and not ';' in line:  
            data = line.split('\t')[1].split(' ')
            data[1] = int(data[1])
            data[2] = int(data[2])
            filtered.append(data)
    return filtered


In [3]:
def filter_list_of_T_entity(data: list[list[str]]) -> list[list[str]]:
    return sorted(data, key=lambda entity: (entity[1], - entity[2]))

In [4]:
def filter_spans(data:  list[list[str]]) ->  list[list[str]]:
    span_arange = -1
    result = []
    for entity in data:
        if entity[1] > span_arange:
            result.append(entity)
            span_arange = entity[2]
    return result
def form_lables_list(file_path: Path):
    text = read_txt(file_path)
    lables = form_list_of_T_entity(text)
    lables = filter_list_of_T_entity(lables)
    return filter_spans(lables)

In [5]:
form_lables_list('test/1130.ann')


[['NATIONALITY', 0, 9],
 ['PROFESSION', 10, 16],
 ['PERSON', 17, 28],
 ['EVENT', 29, 38],
 ['ORGANIZATION', 40, 46],
 ['CITY', 49, 55],
 ['NATIONALITY', 58, 67],
 ['PROFESSION', 68, 74],
 ['PERSON', 75, 86],
 ['PROFESSION', 98, 141],
 ['CITY', 143, 149],
 ['AGE', 163, 172],
 ['PROFESSION', 173, 181],
 ['PROFESSION', 278, 304],
 ['PERSON', 305, 311],
 ['NATIONALITY', 320, 331],
 ['PROFESSION', 332, 343],
 ['PERSON', 344, 361],
 ['DATE', 378, 396],
 ['PERSON', 397, 403],
 ['ORGANIZATION', 416, 420],
 ['AWARD', 440, 477]]

In [6]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [9]:
def form_bio_mapping(mapping, text, labels):
    tokens = []
    bio_label = []
    for token in mapping:
        tokens.append(text[token[0]:token[1]])
        for lable in labels:
            if lable[1] == token[0]:
#                 print('BBBBB')
                bio_label.append('B-'+lable[0])
                break
            elif lable[1] < token[0] and lable[2] >= token[1]:
#                 print('IIII')
                bio_label.append('I-'+lable[0])
                break
            elif lable[1] > token[0]:
#                 print("OOO")
                bio_label.append('O')
                break
        else:
            bio_label.append('O')
    return tokens, mapping, bio_label
    
def form_span_from_text(file_path: Path):
    text = read_txt(file_path)
    offset_mapping = tokenizer(text,return_offsets_mapping = True, add_special_tokens=False).offset_mapping
    ann_file = file_path.split('.')[0] + '.ann'
    labels = form_lables_list(ann_file)
    tokens, mapping, bio_label = form_bio_mapping(offset_mapping, text, labels)
    return tokens, mapping, bio_label

    

In [10]:
tokens, mapping, bio_label = form_span_from_text('train/90693_text.txt')
print(len(tokens), len(mapping), len(bio_label))
bio_label

318 318 318


['B-NATIONALITY',
 'I-NATIONALITY',
 'B-PROFESSION',
 'O',
 'B-PENALTY',
 'I-PENALTY',
 'I-PENALTY',
 'O',
 'O',
 'O',
 'B-PERSON',
 'I-PERSON',
 'B-PERSON',
 'I-PERSON',
 'B-NATIONALITY',
 'I-NATIONALITY',
 'B-PROFESSION',
 'O',
 'O',
 'O',
 'O',
 'B-COUNTRY',
 'O',
 'O',
 'O',
 'O',
 'B-DATE',
 'I-DATE',
 'I-DATE',
 'I-DATE',
 'I-DATE',
 'I-DATE',
 'I-DATE',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'I-ORGANIZATION',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PENALTY',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PROFESSION',
 'I-PROFESSION',
 'I-PROFESSION',
 'O',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'B-PERSON',
 'I-PERSON',
 'O',
 'B-PERSON',
 'I-PERSON',
 'I-PERSON',
 'I-PERSON',
 'I-PERSON',
 'I-PERSON',
 'I-PERSON',
 'O',
 'B-AGE',
 'I-AGE',
 'I-AGE',
 'B-PROFESSION',
 'O',
 'B-CITY',
 'I-CITY',
 'O',
 'O',
 'O',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'I-ORGANIZATION',
 'O',
 'B-CITY',
 

In [11]:
import os
def form_data_sets(library: Path):
    names = os.listdir(os.getcwd()+'/'+library)
    token_seq = []
    spans = []
    lables = []
    for name in names:
        if name.split('.')[1] == 'txt':
            tmp_token_seq, tmp_spans, tmp_lables = form_span_from_text(os.getcwd()+'/'+library+'/'+name)
            token_seq.append(tmp_token_seq)
            spans.append(tmp_spans)
            print(lables)
            lables.append(lables)
            tmp_token_seq = tmp_spans = tmp_lables = []
    return token_seq, spans, lables

In [12]:
train_token_seq, train_spans_seq, train_label_seq = form_data_sets('train')
# dev_data_set = form_data_sets('dev')
# test_data_set = form_data_sets('test')


[]
[[...]]
[[...], [...]]
[[...], [...], [...]]
[[...], [...], [...], [...]]
[[...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], 

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [..

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]]
[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

Token indices sequence length is longer than the specified maximum sequence length for this model (2392 > 2048). Running this sequence through the model will result in indexing errors


[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

[[...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...], [...]

In [None]:
from collections import Counter, defaultdict, namedtuple
token2cnt = Counter([token for sentence in train_token_seq for token in sentence])

In [None]:
token2cnt.most_common(10)

In [None]:
from typing import Tuple, List, Dict, Any
def get_token2idx(
    token2cnt: Dict[str, int],
    min_count: int,
) -> Dict[str, int]:
    """
    Get mapping from tokens to indices to use with Embedding layer.
    """

    token2idx: Dict[str, int] = {}

    # YOUR CODE HERE
    token2idx = {'<PAD>': 0, '<UNK>': 1}
    
    token_len = len(token2idx)
    for token, count in token2cnt.items():
        if count >= min_count:
            token2idx[token] = token_len
            token_len += 1

    return token2idx



In [None]:
token2idx = get_token2idx(token2cnt, min_count=2)

In [None]:
def sort_labels_func(x: str) -> int:
    if x == "O":
        return 0
    elif x.startswith("B-"):
        return 1
    else:
        return 2
print(train_label_seq)
label_set = sorted(
    set(label for sentence in train_label_seq for label in sentence),
    key=lambda x: (sort_labels_func(x), x),
)
