In [1]:
import pandas as pd
import regex as re

import ahocorasick

import torch
from transformers import T5Tokenizer, MT5ForConditionalGeneration

from sklearn.model_selection import train_test_split   # not important for pipeline. only for test

from tqdm import tqdm




In [2]:
# String that get_set splits the decoded model output on.
SEP_TOKEN = '▁<extra_id_0>'
# Token limit used both when tokenizing input sentences and as max_length for generation.
SEQ_MAX_LENGTH = 150

# Previously used checkpoints, kept commented out for reference.
#USED_MODEL_NAME = './saved_model_/coint_rut5small_finetune_ttsseed14_7722'
#USED_MODEL_NAME = './coint_rut5small_finetune_6171-8031'
#USED_MODEL_NAME = './coint_rut5small_finetune_fulltrain_novalid'
# Checkpoint directory from which both the tokenizer and the model are loaded.
USED_MODEL_NAME = './coint_rut5small_finetune_fulltrain_novalid'

In [3]:
# Tokenizer and model are loaded from the same checkpoint directory.
# NOTE: the functions defined below capture `model` and `tokenizer` as default
# argument values at definition time, so this cell must run before them.
tokenizer = T5Tokenizer.from_pretrained (USED_MODEL_NAME)

# Prefer the GPU when one is available, otherwise stay on the CPU.
device = torch.device ('cuda' if torch.cuda.is_available () else 'cpu')
model = MT5ForConditionalGeneration.from_pretrained (USED_MODEL_NAME)
model.to (device)
print (model.device)

cuda:0


In [4]:
def raw_splitter (text, delimiters = ('...', '.', '?!', '?', '!')):
    """
    Split text into sentence-like chunks and record each chunk's character span.

    The text is first cut at every position that is preceded by a newline and
    any number of spaces. Each resulting piece is then cut after
    "<delimiter><space>" whenever the next character is an uppercase Latin or
    Cyrillic letter. Nothing is stripped or dropped, so the chunks concatenate
    back to `text` and the spans index directly into the original string.

    Args:
        text: the string to split.
        delimiters: sentence-ending strings; each one is matched literally.

    Returns:
        List of (chunk, (start, end)) tuples where `text [start:end] == chunk`.

    Note:
        `re` in this notebook is the third-party `regex` module; the patterns
        below use variable-length lookbehind.
    """
    # The alternation has to live in a group. Interpolated into a character class
    # (as before) the '|' joiner itself became a delimiter and multi-character
    # delimiters degraded to their individual characters.
    delimiters_pattern = '|'.join (map (re.escape, delimiters))
    sentence_boundary = f'(?<=(?:{delimiters_pattern}) )(?=[A-ZА-ЯЁ])'

    paragraphs = re.split (r'(?<=\n[ ]*)', text)

    sentences_with_indices = []
    current_start_index = 0

    for paragraph in paragraphs:

        for sentence in re.split (sentence_boundary, paragraph):

            # Chunks are contiguous, so each one starts where the previous one ended.
            end_index = current_start_index + len (sentence)
            sentences_with_indices.append ((sentence, (current_start_index, end_index)))
            current_start_index = end_index

    return sentences_with_indices


LABEL_PREFIX_TOKEN = '▁<extra_id_1>'
def get_set (tensor, tokenizer = tokenizer):
    """
    Decode one generated id sequence into the set of strings it contains.

    Ids 0, 1 and -100 are removed before decoding (presumably the pad,
    end-of-sequence and loss-ignore ids — confirm against the tokenizer). The
    decoded text is split on SEP_TOKEN and every piece is stripped.

    Args:
        tensor: tensor of token ids for a single sequence.
        tokenizer: object providing `decode`; defaults to the notebook-level
            tokenizer bound when this function was defined.

    Returns:
        Set of decoded pieces. The empty string is dropped when other pieces
        exist, the label prefix (LABEL_PREFIX_TOKEN without its first
        character) is always dropped, and `{''}` is returned when nothing is left.
    """
    keep_mask = (tensor != 0) & (tensor != 1) & (tensor != - 100)
    decoded = tokenizer.decode (tensor [keep_mask])

    terms = {chunk.strip () for chunk in decoded.split (SEP_TOKEN)}

    if len (terms) > 1:
        terms.discard ('')
    terms.discard (LABEL_PREFIX_TOKEN [1:])

    # Callers rely on a non-empty set; {''} stands for "no terms".
    return terms if terms else {''}



def one_finder (text, phrases):
    """
    Find occurrences of the given phrases in text that are not glued to letters.

    A match is kept only if the character immediately before it and the
    character immediately after it (when they exist) are not alphabetic.

    Args:
        text: string to search in.
        phrases: iterable of strings to look for; empty strings are ignored.

    Returns:
        List of (start, end, phrase) tuples with `end` exclusive, in the order
        the Aho-Corasick automaton reports the matches. Empty when there is
        nothing to search for.
    """
    # Empty strings carry no information, and pyahocorasick cannot search an
    # automaton that holds no keys, so return before building one.
    phrases = [phrase for phrase in phrases if phrase]
    if not phrases:
        return []

    automaton = ahocorasick.Automaton ()
    for phrase in phrases:
        automaton.add_word (phrase, phrase)
    automaton.make_automaton ()

    found = []
    # iter () reports the index of the LAST character of every match.
    for end_index, phrase in automaton.iter (text):
        start_index = end_index - len (phrase) + 1

        # Reject matches that continue a word on the left ...
        if start_index > 0 and text [start_index - 1].isalpha ():
            continue
        # ... or on the right.
        if end_index + 1 < len (text) and text [end_index + 1].isalpha ():
            continue

        found.append ((start_index, end_index + 1, phrase))

    return found

In [5]:

def predict_with_model (texts, model = model, tokenizer = tokenizer):
    """
    Predict term spans for every text, running the model one sentence at a time.

    Each text is split with raw_splitter, every chunk is passed through
    `model.generate`, the output is decoded into a set of strings by get_set,
    and those strings are located in the chunk with one_finder.

    Args:
        texts: iterable of strings.
        model: generation model; defaults to the notebook-level model.
        tokenizer: tokenizer used for BOTH encoding and decoding; defaults to
            the notebook-level tokenizer.

    Returns:
        One list per input text holding [start, end] character offsets
        (end exclusive) relative to the beginning of that text.
    """
    model.eval ()

    predictions = []

    for text in tqdm (texts):

        answers = []
        for sentence, (start, _end) in raw_splitter (text):

            sentence_tokenized = tokenizer (sentence, padding = 'max_length', truncation = True, max_length = SEQ_MAX_LENGTH, return_tensors = 'pt')

            with torch.no_grad ():

                input_ids = sentence_tokenized ['input_ids'].to (model.device)
                attention_mask = sentence_tokenized ['attention_mask'].to (model.device)

                out = model.generate (input_ids = input_ids, attention_mask = attention_mask, max_length = SEQ_MAX_LENGTH)

            # Decode with the tokenizer that was passed in; previously the
            # notebook-level default was used regardless of the argument.
            term_set = get_set (out [0], tokenizer)

            found = one_finder (sentence, term_set)
            # Shift chunk-relative offsets to offsets within the whole text.
            answers += [[item [0] + start, item [1] + start] for item in found]

        predictions.append (answers)

    return predictions





def predict_with_model_effective (texts, model = model, tokenizer = tokenizer, batch_size = 8):
    """
    Predict term spans for every text, generating for several sentences per call.

    Same pipeline as predict_with_model (raw_splitter -> generate -> get_set ->
    one_finder), but the chunks of one text are sent to `model.generate` in
    batches of at most `batch_size`.

    Args:
        texts: iterable of strings.
        model: generation model; defaults to the notebook-level model.
        tokenizer: tokenizer used for BOTH encoding and decoding; defaults to
            the notebook-level tokenizer.
        batch_size: maximum number of chunks per `generate` call.

    Returns:
        One list per input text holding [start, end] character offsets
        (end exclusive) relative to the beginning of that text.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError ('batch_size must be a positive integer')

    model.eval ()

    predictions = []

    for text in tqdm (texts):

        sentences_w_ind = raw_splitter (text)
        sentences = [sent for sent, _ in sentences_w_ind]

        # One slicing loop covers both short and long texts: a text with at most
        # `batch_size` chunks simply yields a single batch.
        out = []
        for batch_start in range (0, len (sentences), batch_size):
            batch_sentences = sentences [batch_start: batch_start + batch_size]

            sentences_tokenized = tokenizer (batch_sentences, padding = 'max_length', truncation = True, max_length = SEQ_MAX_LENGTH, return_tensors = 'pt')

            input_ids = sentences_tokenized ['input_ids'].to (model.device)
            attention_mask = sentences_tokenized ['attention_mask'].to (model.device)

            output = model.generate (input_ids = input_ids, attention_mask = attention_mask, max_length = SEQ_MAX_LENGTH)

            # Iterating the batch output yields one id sequence per chunk.
            out.extend (output)

        answers = []
        for (sentence, (start, _)), output in zip (sentences_w_ind, out):
            # Decode with the tokenizer that was passed in; previously the
            # notebook-level default was used regardless of the argument.
            term_set = get_set (output, tokenizer)

            found = one_finder (sentence, term_set)

            # Shift chunk-relative offsets to offsets within the whole text.
            answers += [[item [0] + start, item [1] + start] for item in found]

        predictions.append (answers)

    return predictions

In [6]:
def label_constructor (labels):
    """
    Strip the class name from every annotation, keeping only the span offsets.

    Args:
        labels: iterable with one entry per text; each entry is an iterable of
            (start, end, class) triples.

    Returns:
        List with one list per text containing [start, end] pairs in the
        original order.
    """
    return [[[start, end] for start, end, _cls in label] for label in labels]

In [7]:
# Read the JSON Lines file and keep only the 'text' and 'label' columns.
df = pd.read_json ('./test_data/test1_t12_full_v2.jsonl', lines = True)
df = df [['text', 'label']]
print (df.head ())

                                                text  \
0  АВТОМАТИЧЕСКИЙ АНАЛИЗ ТОНАЛЬНОСТИ ТЕКСТОВ НА О...   
1  InBASE: ТЕХНОЛОГИЯ ПОСТРОЕНИЯ ЕЯ-ИНТЕРФЕЙСОВ К...   
2  Выражение уважительности с помощью личных мест...   
3  ДА ЧЕРТ ЛИ В ДЕТАЛЯХ?.. МЕРА ДЛЯ ОЦЕНКИ СОВПАД...   
4  КОРПУСНАЯ ОЦЕНКА СОЧЕТАЕМОСТИ СЛОВ С ИСПОЛЬЗОВ...   

                                               label  
0  [[0, 33, specific], [22, 33, specific], [52, 7...  
1  [[0, 6, nomen], [19, 44, specific], [30, 44, s...  
2  [[0, 24, specific], [35, 53, specific], [42, 5...  
3  [[51, 70, specific], [61, 70, specific], [116,...  
4  [[0, 16, specific], [17, 34, specific], [30, 3...  


In [8]:
# Column views used as evaluation inputs and reference annotations.
val_data_txt = df ['text']
val_labels_txt = df ['label']

In [9]:
# Quick look at the text column.
print (val_data_txt)

0      АВТОМАТИЧЕСКИЙ АНАЛИЗ ТОНАЛЬНОСТИ ТЕКСТОВ НА О...
1      InBASE: ТЕХНОЛОГИЯ ПОСТРОЕНИЯ ЕЯ-ИНТЕРФЕЙСОВ К...
2      Выражение уважительности с помощью личных мест...
3      ДА ЧЕРТ ЛИ В ДЕТАЛЯХ?.. МЕРА ДЛЯ ОЦЕНКИ СОВПАД...
4      КОРПУСНАЯ ОЦЕНКА СОЧЕТАЕМОСТИ СЛОВ С ИСПОЛЬЗОВ...
                             ...                        
108    ВЛИЯНИЕ ОБЪЕМА ОПЕРАТИВНОЙ ПАМЯТИ НА ИНТЕРПРЕТ...
109    АНАЛИЗ ПАРАМЕТРОВ РЕЧЕВОГО СИГНАЛА СОЗДАЮЩИХ В...
110    СТАБИЛЬНОСТЬ ИСТОЧНИКОВ КАК ОДИН ИЗ ПАРАМЕТРОВ...
111    О ГРАММАТИКЕ КОНЦЕПТУАЛЬНЫХ ОТНОШЕНИЙ\nВ рамка...
112    МЕТОД КОНТЕКСТНОГО РАЗРЕШЕНИЯ ФУНКЦИОНАЛЬНОЙ О...
Name: text, Length: 113, dtype: object


In [10]:
def comparator (pred, labl):
    """
    Count exact-match agreement between predicted and reference spans.

    Both inputs are turned into sets of tuples, so duplicates collapse and a
    span counts as correct only when it is identical to a reference span.

    Args:
        pred: iterable of predicted spans (each convertible to a tuple).
        labl: iterable of reference spans (each convertible to a tuple).

    Returns:
        Tuple (true_positives, false_positives, false_negatives).
    """
    predicted_spans = {tuple (span) for span in pred}
    reference_spans = {tuple (span) for span in labl}

    matched = len (predicted_spans.intersection (reference_spans))

    # Whatever is not matched on either side is an error of that side.
    return matched, len (predicted_spans) - matched, len (reference_spans) - matched

def metricator (preds, labels):
    """
    Compute precision, recall and F1 from counts summed over all texts.

    For every index of `labels`, comparator is applied to the prediction and
    reference at that index; the resulting counts are accumulated before the
    ratios are taken.

    Args:
        preds: per-text predicted spans, indexable by position.
        labels: per-text reference spans, indexable by position; its length
            determines how many texts are scored.

    Returns:
        Tuple (precision, recall, f1_score); a value is 0 when its denominator
        is zero.
    """
    hits = spurious = missed = 0

    for position in range (len (labels)):
        tp, fp, fn = comparator (preds [position], labels [position])
        hits += tp
        spurious += fp
        missed += fn

    predicted_total = hits + spurious
    reference_total = hits + missed

    precision = hits / predicted_total if predicted_total > 0 else 0
    recall = hits / reference_total if reference_total > 0 else 0

    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    return precision, recall, f1_score

In [11]:
# Plain Python lists for the prediction and scoring functions; the class name
# is dropped from every annotation so only [start, end] pairs remain.
val_data_lst = val_data_txt.tolist ()
val_labels_lst = label_constructor (val_labels_txt)

# Disabled: `test_data_txt` / `test_labels_txt` are not defined at this point.
#test_data_lst = test_data_txt.tolist ()
#test_labels_lst = test_labels_txt.tolist ()



In [12]:
# Batched prediction over the whole evaluation set.
predictions = predict_with_model_effective (val_data_lst)

 54%|█████▍    | 61/113 [00:26<00:22,  2.34it/s]


KeyboardInterrupt: 

In [None]:
# (precision, recall, F1) of the predictions against the reference spans.
res = metricator (predictions, val_labels_lst)
res

### Leaderboard of my own models

name | metrics (precision, recall, F1) | notes
-----|---------|----------
./coint_rut5small_finetune_fulltrain_novalid | (0.7016524423461049, 0.7734187349879904, 0.7357897743501857) | 
./coint_rut5-base-absum_finetune_5982-7597 | (0.6875249103228378, 0.6905524419535628, 0.6890353505092869) | 
./coint_rut5small_finetune_6171-8031 | (0.6939553457977854, 0.7652121697357886, 0.7278438838648262) | 
./coint_rut5small_finetune_6077-7948 | (0.6774367259019924, 0.7554043234587671, 0.7142992334626669) |
./coint_rut5small_finetune_ttsseed14_7722 | (0.6761090326028861, 0.7596076861489192, 0.7154302950325194) | 
BSET | 0.7035753374680773, 0.7689393939393939, 0.7348066298342542 | 
rut5small_7332-7420 | 0.7447399761810242, 0.7510008006405124, 0.7478572852302173
rut5small_foldlearn_250ep_68-71 | (0.6639388009251023, 0.7469975980784628, 0.7030234529528115)
rut5small_fulltrain_dictpostfix_avg7209-7591 | (0.6214299405788768, 0.6489191353082466, 0.6348771173993929)
rut5small_fulltrain_folds_f1opt_avg--------- | (0.6287981093855503, 0.7455964771817454, 0.6822344322344321)
rut5small_fulltrain_folds_f1opt_avg-v2-7250-7633- | (0.6041174485318933, 0.7165732586068855, 0.6555575901849477)
rut5small_fulltrain_novalid_dictionary_postfix | (0.7344150298889838, 0.344275420336269, 0.4687925865358408) | no postfix
rut5small_fulltrain100ep_novalid_dictionary_postfix | (0.695127402771569, 0.3112489991993595, 0.4299737315083645) | no postfix
rut5small_posttrain_folds_f1opt_avg-7531-7816- | (0.6763404707131481, 0.7650120096076861, 0.717948717948718)

In [None]:
unmatched_counter = 0

def predict_with_model_effective_prints (texts, model = model, tokenizer = tokenizer, batch_size = 8):
    """
    Batched prediction that also reports generated strings not found in the text.

    Same pipeline as predict_with_model_effective. In addition, for every chunk
    it prints the chunk and the generated strings that one_finder could not
    locate in it, and adds their number to the module-level `unmatched_counter`.

    Args:
        texts: iterable of strings.
        model: generation model; defaults to the notebook-level model.
        tokenizer: tokenizer used for BOTH encoding and decoding; defaults to
            the notebook-level tokenizer.
        batch_size: maximum number of chunks per `generate` call.

    Returns:
        One list per input text holding [start, end] character offsets
        (end exclusive) relative to the beginning of that text.

    Raises:
        ValueError: if `batch_size` is smaller than 1.

    Side effects:
        Increments the global `unmatched_counter`; it is only reset when the
        assignment above this function is executed again.
    """
    global unmatched_counter

    if batch_size < 1:
        raise ValueError ('batch_size must be a positive integer')

    model.eval ()

    predictions = []

    for text in tqdm (texts):

        sentences_w_ind = raw_splitter (text)
        sentences = [sent for sent, _ in sentences_w_ind]

        # One slicing loop covers both short and long texts: a text with at most
        # `batch_size` chunks simply yields a single batch.
        out = []
        for batch_start in range (0, len (sentences), batch_size):
            batch_sentences = sentences [batch_start: batch_start + batch_size]

            sentences_tokenized = tokenizer (batch_sentences, padding = 'max_length', truncation = True, max_length = SEQ_MAX_LENGTH, return_tensors = 'pt')

            input_ids = sentences_tokenized ['input_ids'].to (model.device)
            attention_mask = sentences_tokenized ['attention_mask'].to (model.device)

            output = model.generate (input_ids = input_ids, attention_mask = attention_mask, max_length = SEQ_MAX_LENGTH)

            # Iterating the batch output yields one id sequence per chunk.
            out.extend (output)

        answers = []
        for (sentence, (start, _)), output in zip (sentences_w_ind, out):
            # Decode with the tokenizer that was passed in; previously the
            # notebook-level default was used regardless of the argument.
            term_set = get_set (output, tokenizer)

            found = one_finder (sentence, term_set)

            # Strings the model produced that were located in the chunk.
            mfoundar = set ([item [2] for item in found])
            # NOTE(review): when the model produces nothing, term_set is {''} and
            # that placeholder is counted as unmatched — confirm this is intended.
            unmatched_counter += len (term_set - mfoundar)
            print (f'>>>>В строке: {sentence}')
            print (f'  Не найдены: {[item for item in term_set - mfoundar]}')

            # Shift chunk-relative offsets to offsets within the whole text.
            answers += [[item [0] + start, item [1] + start] for item in found]

        predictions.append (answers)

    return predictions

In [14]:
# Same evaluation run, with per-chunk reporting of generated strings that were not found.
predictions = predict_with_model_effective_prints (val_data_lst)

  0%|          | 0/113 [00:00<?, ?it/s]


UnboundLocalError: local variable 'unmatched_counter' referenced before assignment

In [None]:
# Total accumulated by predict_with_model_effective_prints since the counter was last set to 0.
unmatched_counter

In [None]:
# Python 3 requires the call form; the bare `print 4` statement is a SyntaxError.
print (4)

In [None]:
# NOTE: rebinds `df`, replacing the frame loaded earlier in the notebook.
df = pd.read_json ('./test2_t12_v2.jsonl', lines = True)
print (df.head ())

test_data_txt = df ['text']

# Plain list of texts for the prediction functions.
test_data_lst = test_data_txt.tolist ()

In [None]:
# Sentence-by-sentence prediction over the second data set; the last line displays the result.
predictions = predict_with_model (test_data_lst)
predictions

In [None]:
# Spot check: print one text and then the substrings covered by its predicted spans.
idx = 2
print (test_data_lst [idx])
for item in predictions [idx]:
    # item is a [start, end] pair of character offsets into the text.
    print (f'{test_data_lst [idx] [item [0]: item [1]]}', end = ', ')

In [None]:
# Store the predicted spans in the 'label' column (one list per row of `df`).
df ['label'] = predictions

In [None]:
# Write the frame as JSON Lines; force_ascii = False keeps non-ASCII characters unescaped.
df.to_json ('res-digr-test2_t12_v2-moretrain.jsonl', orient = 'records', lines = True, force_ascii = False)

In [None]:


# Same call as the earlier prediction cell for `test_data_lst`; overwrites `predictions`.
predictions = predict_with_model (test_data_lst)

In [None]:
# NOTE(review): `test_labels_lst` is not assigned anywhere in the visible notebook
# (its only other mention is a commented-out line) — confirm where it comes from
# before running this cell on a fresh kernel.
res = metricator (predictions, test_labels_lst)
res

max_length | val metrics | test metrics
-----------|-------------|---------------
50  | (0.7762276785714286, 0.762609649122807, 0.7693584070796461) | (0.7506617257808365, 0.7611379495437467, 0.7558635394456289)
100 | (0.7714884696016772, 0.8070175438596491, 0.7888531618435156) | (0.7517552657973922, 0.8046162104133119, 0.7772880477054704)
150 | (0.7723958333333333, 0.8130482456140351, 0.7922008547008546) | (0.7521281922884326, 0.8062265163714439, 0.7782383419689118)
200 | (0.7725143154606976, 0.8135964912280702, 0.7925233644859813) | (0.7521281922884326, 0.8062265163714439, 0.7782383419689118)

params | val metrics | test metrics
-------|-------------|---------------
baseline | (0.7723958333333333, 0.8130482456140351, 0.7922008547008546) | (0.7521281922884326, 0.8062265163714439, 0.7782383419689118)
do_sample = True | (0.7603661820140011, 0.7741228070175439, 0.7671828307525129) | (0.745850622406639, 0.7718733225979603, 0.7586388815615933)
top_p = 0.95 | (0.7733405288720993, 0.7856359649122807, 0.779439760674463) | (0.751922091235264, 0.7874396135265701, 0.7692711064499212)
top_p = 0.9 | (0.7749057619816909, 0.7889254385964912, 0.7818527574028797) | (0.7537198563365829, 0.7885131508319914, 0.7707240293809026)
top_p = 0.85 | (0.7633262260127932, 0.7850877192982456, 0.774054054054054) | (0.7581060216160577, 0.7906602254428341, 0.7740409879138204)
top_p = 0.8 | (0.7774813233724653, 0.7987938596491229, 0.7879935100054083) | (0.7516472377090725, 0.7960279119699409, 0.7732012513034411)
 |  | 
top_k = 5 | (0.7621359223300971, 0.774671052631579, 0.768352365415987) | (0.748829953198128, 0.7729468599033816, 0.7606973058637084)
top_k = 10 | (0.7706868577609519, 0.78125, 0.7759324802613667) | (0.7476780185758514, 0.7777777777777778, 0.7624309392265193)

In [None]:
# Test metrics (precision, recall, F1) with SEQ_MAX_LENGTH = 150:
# (0.7521281922884326, 0.8062265163714439, 0.7782383419689118)

In [None]:
# Spot check of a single evaluation text: predict, score, and compare by eye.
idx = 5

print (val_data_lst [idx])

# The prediction function expects a list of texts, hence the one-element list.
pred_idx = predict_with_model ([val_data_lst [idx]])

# Metrics for this single text only.
print (metricator (pred_idx, [val_labels_lst [idx]]))

# First line: substrings covered by the predicted spans.
for item in pred_idx [0]:
    print (f'{val_data_lst [idx] [item [0]: item [1]]}', end = ', ')
print ()
# Second line: substrings covered by the reference spans.
for item in val_labels_lst [idx]:
    print (f'{val_data_lst [idx] [item [0]: item [1]]}', end = ', ')

In [None]:
def label_constructor (labels):
    """
    Reduce (start, end, class) annotations to [start, end] pairs, text by text.

    NOTE: a function with this name and the same behaviour is already defined
    earlier in this notebook; this definition shadows it.

    Args:
        labels: iterable with one entry per text; each entry is an iterable of
            (start, end, class) triples.

    Returns:
        List with one list of [start, end] pairs per text.
    """
    spans_per_text = []
    for annotation in labels:
        spans_per_text.append ([[begin, finish] for begin, finish, _category in annotation])
    return spans_per_text

In [None]:
# Alternative inputs, currently disabled.
#df = pd.read_json ('./test_data/dev.json', lines = True)

#df = pd.read_json ('train_t1_v1.jsonl', lines = True)
# NOTE: rebinds `df` again; unlike the other loads this one is read without lines = True.
df = pd.read_json ('cl-ruterm3-sample.json')
df = df [['text', 'label']]
print (df.head ())

In [None]:
# Plain lists of texts and annotations from the frame loaded above.
data_lst = df ['text'].tolist ()
labels_lst = df ['label'].tolist ()

# Drop the class name from every annotation, then display the result.
labels_lst = label_constructor (labels_lst)
labels_lst

In [None]:
# Sentence-by-sentence prediction over `data_lst`; overwrites `predictions`.
predictions = predict_with_model (data_lst)

In [None]:
# (precision, recall, F1) of the predictions against `labels_lst`.
res = metricator (predictions, labels_lst)
res