In [5]:
import pandas as pd

model_type = 'small'
model_id= f"google/flan-t5-{model_type}"

In [2]:
import pandas as pd
from datasets import Dataset

# Загружаем наш CSV файл в DataFrame
train_df = pd.read_json("gazeta_train.jsonl", lines=True)
test_df = pd.read_json("gazeta_train.jsonl", lines=True)
val_df = pd.read_json("gazeta_val.jsonl", lines=True)

# Убедимся, что поля 'text' и 'summary' в правильном формате, например, нет null значений
train_df.dropna(subset=['text', 'summary'], inplace=True)
test_df.dropna(subset=['text', 'summary'], inplace=True)
val_df.dropna(subset=['text', 'summary'], inplace=True)

# Преобразуем DataFrame в Dataset библиотеки datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

In [3]:
from transformers import T5Tokenizer

# Инициализируем токенизатор для нашей модели
tokenizer = T5Tokenizer.from_pretrained(f"t5-{model_type}")

# Функция для токенизации наших пар 'text' и 'summary'
def tokenize_function(examples):
    # Энкодируем текст и суммари, обрезаем при необходимости до максимальной длины
    inputs = ["summarize: " + item for item in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Так как T5 использует префиксы, мы добавляем их для обучения задачи суммаризации
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Применяем функцию токенизации к нашему датасету
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/60964 [00:00<?, ? examples/s]



Map:   0%|          | 0/60964 [00:00<?, ? examples/s]

Map:   0%|          | 0/6369 [00:00<?, ? examples/s]

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [9]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /home/mikhail/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Предположим, что model_id и dataset_id уже определены
out_dir = "/media/mikhail/e0420deb-cbc8-4a2d-9ff7-7907e9bff3d9/flan_fine_tune_out"

# Определим аргументы для обучения
training_args = Seq2SeqTrainingArguments(
    output_dir=out_dir,  # Указание локальной директории
    per_device_train_batch_size=8,  # Можешь изменить в зависимости от памяти GPU
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_dir=f"{out_dir}/logs",  # Локальная директория для логов
    # Тактика журналирования и оценивания
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Тебе нужно будет определить compute_metrics, если это необходимо для твоей задачи
)

# Создадим экземпляр Trainer
trainer = Seq2SeqTrainer(
    model=model,  # Убедись, что модель загружена и подходит для обучения
    args=training_args,
    data_collator=data_collator,  # Инициализируй корректно
    train_dataset=tokenized_train,  # Убедись в доступности данных
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,  # Опционально, зависит от задачи
)

In [11]:
trainer.train()

  0%|          | 0/38105 [00:00<?, ?it/s]

{'loss': 1.8457, 'learning_rate': 4.93439181209815e-05, 'epoch': 0.07}
{'loss': 1.7376, 'learning_rate': 4.8687836241962995e-05, 'epoch': 0.13}
{'loss': 1.7088, 'learning_rate': 4.80317543629445e-05, 'epoch': 0.2}
{'loss': 1.6724, 'learning_rate': 4.7375672483926e-05, 'epoch': 0.26}
{'loss': 1.6417, 'learning_rate': 4.6719590604907493e-05, 'epoch': 0.33}
{'loss': 1.6232, 'learning_rate': 4.606350872588899e-05, 'epoch': 0.39}
{'loss': 1.6163, 'learning_rate': 4.540742684687049e-05, 'epoch': 0.46}
{'loss': 1.6037, 'learning_rate': 4.4751344967851985e-05, 'epoch': 0.52}
{'loss': 1.5916, 'learning_rate': 4.409526308883349e-05, 'epoch': 0.59}
{'loss': 1.5731, 'learning_rate': 4.343918120981499e-05, 'epoch': 0.66}
{'loss': 1.5734, 'learning_rate': 4.2783099330796484e-05, 'epoch': 0.72}
{'loss': 1.5515, 'learning_rate': 4.2127017451777984e-05, 'epoch': 0.79}
{'loss': 1.5464, 'learning_rate': 4.147093557275948e-05, 'epoch': 0.85}
{'loss': 1.5396, 'learning_rate': 4.0814853693740976e-05, 'epoch



  0%|          | 0/7621 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
trainer.evaluate()