In [1]:
from RuNNE import RuNNEBuilder

builder = RuNNEBuilder()
builder.download_and_prepare()
dataset = builder.as_dataset()
df = dataset['train'].to_pandas().drop(columns='id')
df

Unnamed: 0,text,entities
0,Ким Чен Нама убили с помощью запрещённого химо...,"[0 12 PERSON, 56 72 ORGANIZATION, 64 72 COUNTR..."
1,Смена портретов на долларах\nГарриет Табмен\nК...,"[19 27 MONEY, 28 42 PERSON, 52 67 MONEY, 68 77..."
2,Новым генсеком ООН станет португалец Гутерреш\...,"[6 14 PROFESSION, 15 18 ORGANIZATION, 26 36 NA..."
3,Вахту принял\n\nУоррен Баффет назвал своего по...,"[14 27 PERSON, 199 209 DATE, 232 260 PROFESSIO..."
4,В Японии скончался старейший житель Земли — Дз...,"[2 8 COUNTRY, 79 96 DATE, 116 122 COUNTRY, 134..."
...,...,...
456,Электрик руководит социал-демократами Дюссельд...,"[0 8 PROFESSION, 68 72 ORGANIZATION, 73 101 TI..."
457,Мужчина женился на тёще\n\nРумынский мужчина у...,"[25 34 NATIONALITY, 129 138 PERSON, 154 161 CI..."
458,Названы лауреаты премии «World Press Photo»\nФ...,"[61 67 COUNTRY, 68 78 PERSON, 258 286 ORGANIZA..."
459,Первое поражение Серены Уильямс в финале Austr...,"[0 6 ORDINAL, 17 31 PERSON, 41 56 EVENT, 57 71..."


In [2]:
# load the model
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM

tokenizer = T5TokenizerFast.from_pretrained("UrukHan/t5-russian-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("UrukHan/t5-russian-summarization")

In [3]:
train = dataset['train']
test = dataset['test']
dev = dataset['dev']

# preprocess the dataset
max_input_length = 256
max_target_length = 256

def preprocess_function(examples):
    inputs = examples['text']
    _targets = examples['entities']
    targets = []
    for t in _targets:
        targets.append(','.join(t))
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    #model_inputs.drop
    return model_inputs

In [4]:
train_processed = train.map(preprocess_function, batched=True)
test_processed = test.map(preprocess_function, batched=True)
#dev_processed = dev.map(preprocess_function, batched=True)

In [5]:
train_processed

Dataset({
    features: ['id', 'text', 'entities', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 461
})

In [6]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import pandas as pd

batch_size = 4
args = Seq2SeqTrainingArguments(
    "RuNNE-Training",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=50,
    predict_with_generate=True,
    fp16=True, # set to True if you have CUDA, False if CUDA is not available
    generation_max_length=128,
    
)

# set up the data collator to pad the inputs and labels
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# set up the metrics for the training process.
from datasets import load_metric

metric = load_metric("sacrebleu") # using the metric from the example

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# set up the trainer itself
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_processed,
    eval_dataset=test_processed,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

2024-04-28 12:40:15.920100: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-04-28 12:40:15.920146: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-04-28 12:40:16.775576: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-04-28 12:40:16.775717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
  metric = load_metric("sacrebleu") # using the metric from the example
You can avoid this mes

In [7]:
# do the thing. That would take some time.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.663738,0.1385,81.2151
2,No log,1.466263,0.5475,123.1075
3,No log,1.39399,0.5552,124.3548
4,No log,1.338489,0.4484,121.2688
5,1.743700,1.298689,0.6641,123.4516
6,1.743700,1.267874,0.8825,124.914
7,1.743700,1.24144,0.7785,123.6022
8,1.743700,1.223199,0.6759,121.8065
9,1.293700,1.205301,0.9704,121.172
10,1.293700,1.196971,0.8016,121.0968


Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}
Non-default generation parameters: {'max_length': 64}


TrainOutput(global_step=5800, training_loss=1.170102354904701, metrics={'train_runtime': 7627.0328, 'train_samples_per_second': 3.022, 'train_steps_per_second': 0.76, 'total_flos': 7018239688704000.0, 'train_loss': 1.170102354904701, 'epoch': 50.0})

In [8]:
trainer.save_model('t5rs-RuNNE-128')

Non-default generation parameters: {'max_length': 64}


In [10]:
tokenizer_trained = T5TokenizerFast.from_pretrained("t5rs-RuNNE-128")
model_trained = AutoModelForSeq2SeqLM.from_pretrained("t5rs-RuNNE-128")

In [11]:
# test if it is working on a part of test dataset
from tqdm.notebook import trange
input = []
result = [] 
target = []
N = 10

for i in trange(N):
    input_ids = tokenizer_trained(test['text'][i], return_tensors="pt").input_ids
    outputs = model_trained.generate(input_ids, max_new_tokens = 128)
    input.append(test['text'][i])
    result.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    target.append(test['entities'][i])

pd.DataFrame.from_dict({'input': input, 'result': result, 'target': target}).head(N)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,input,result,target
0,FakTyrA анонсировал сингл «Психопат» и назначи...,"0 8 ORGANIATION,17 31 ORGANIATION,40 50 ORGANI...","[0 7 PERSON, 27 35 WORK_OF_ART, 90 98 DATE, 10..."
1,Умер создатель первого в мире индексного фонда...,"107 113 PERSON,114 137 DATE,140 164 ORGANIATIO...","[15 22 ORDINAL, 47 56 PERSON, 127 145 ORGANIZA..."
2,Художник Александр Шилов отмечает юбилей\n29 о...,"0 9 PERSON,35 50 PROFESSION,55 67 PERSON,68 76...","[0 8 PROFESSION, 9 24 PERSON, 41 61 DATE, 79 9..."
3,Полноценная встреча Трампа с Путиным не состои...,"0 5 PROFESSION,15 23 PERSON,45 54 PERSON,61 73...","[20 26 PERSON, 29 36 PERSON, 56 63 PERSON, 64 ..."
4,День памяти и скорби начался в Брянске с тради...,"2 8 CIT,27 38 FACILIT,48 56 FACILIT,68 78 FACI...","[0 20 EVENT, 31 38 CITY, 127 138 EVENT, 153 16..."
5,Умер Эдуард Лимонов\nЭдуард Лимонов\nВо вторни...,"0 12 PERSON,27 39 PROFESSION,50 61 PERSON,71 8...","[35 65 DATE, 72 86 PERSON, 97 103 AGE, 122 129..."
6,Полицейские во Франции убили страсбургского ст...,"0 8 NATIONALIT,16 23 ORGANIATION,39 49 CIT,60 ...","[15 22 COUNTRY, 29 43 CITY, 100 107 COUNTRY, 1..."
7,Литва празднует 100-летие независимости\n\nГер...,"0 9 COUNTR,15 23 DATE,35 46 EVENT,58 67 COUNTR...","[0 5 COUNTRY, 16 25 AGE, 46 51 COUNTRY, 52 83 ..."
8,В элитном доме на Никитском бульваре полностью...,"0 6 FACILIT,27 42 FACILIT,51 67 FACILIT,70 86 ...","[18 36 FACILITY, 71 83 PERSON, 84 101 FACILITY..."
9,Владимирский вице-губернатор покинул пост из-з...,"0 8 PROFESSION,10 22 PERSON,52 61 PERSON,65 73...","[0 12 STATE_OR_PROVINCE, 92 111 PERSON, 162 17..."
