In [2]:
!pip install transformers datasets peft accelerate evaluate rouge-score bert-score


Collecting evaluate
  Using cached evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu1

In [3]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset

df = pd.read_csv("/content/sample_data/merged_bbc_fmp2.csv")

dataset = Dataset.from_pandas(df)
data = dataset.train_test_split(test_size=0.1)


In [4]:
len(df)

4127

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

max_source = 512
max_target = 128

def preprocess(examples):
    inputs = ["summarize: " + t for t in examples["text"]]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=max_source)
    labels = tokenizer(examples["summary"], truncation=True, padding="max_length", max_length=max_target).input_ids
    model_inputs["labels"] = [
        [(lbl if lbl != tokenizer.pad_token_id else -100) for lbl in label]
        for label in labels
    ]
    return model_inputs

tokenized = data.map(preprocess, batched=True, remove_columns=data["train"].column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/3714 [00:00<?, ? examples/s]

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


In [7]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="t5-lora-output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-4,
    num_train_epochs=3,
    logging_steps=100,
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    fp16=True,
    predict_with_generate=True,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,

)


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,1.2932
200,1.1526
300,1.065
400,1.0312
500,0.994
600,0.9697
700,1.0032
800,0.9294
900,0.9665
1000,0.9125


TrainOutput(global_step=1395, training_loss=1.011670135169901, metrics={'train_runtime': 247.8954, 'train_samples_per_second': 44.946, 'train_steps_per_second': 5.627, 'total_flos': 1518072665997312.0, 'train_loss': 1.011670135169901, 'epoch': 3.0})

In [9]:
model.save_pretrained("t5-2000-summarizer")
tokenizer.save_pretrained("t5-2000-summarizer")

('t5-2000-summarizer/tokenizer_config.json',
 't5-2000-summarizer/special_tokens_map.json',
 't5-2000-summarizer/spiece.model',
 't5-2000-summarizer/added_tokens.json',
 't5-2000-summarizer/tokenizer.json')

In [10]:
import evaluate
rouge = evaluate.load("rouge")
preds = trainer.predict(tokenized["test"]).predictions
decoded = tokenizer.batch_decode(preds, skip_special_tokens=True)
refs = data["test"]["summary"]
print(rouge.compute(predictions=decoded, references=refs))

Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(0.2266784078966924), 'rouge2': np.float64(0.12936339659467372), 'rougeL': np.float64(0.20333923288003675), 'rougeLsum': np.float64(0.2032873179212622)}


In [11]:
from bert_score import score
P, R, F1 = score(decoded, refs, lang="en")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
for i in range(len(decoded)):
    print(f"Summary {i+1}:")
    print(f"  Precision: {P[i]:.3f} (How much info is correct)")
    print(f"  Recall:    {R[i]:.3f} (How much reference is covered)")
    print(f"  F1:        {F1[i]:.3f} (Balance)\n")

Summary 1:
  Precision: 0.925 (How much info is correct)
  Recall:    0.823 (How much reference is covered)
  F1:        0.871 (Balance)

Summary 2:
  Precision: 0.812 (How much info is correct)
  Recall:    0.817 (How much reference is covered)
  F1:        0.815 (Balance)

Summary 3:
  Precision: 0.753 (How much info is correct)
  Recall:    0.811 (How much reference is covered)
  F1:        0.781 (Balance)

Summary 4:
  Precision: 0.811 (How much info is correct)
  Recall:    0.789 (How much reference is covered)
  F1:        0.800 (Balance)

Summary 5:
  Precision: 0.851 (How much info is correct)
  Recall:    0.804 (How much reference is covered)
  F1:        0.827 (Balance)

Summary 6:
  Precision: 0.925 (How much info is correct)
  Recall:    0.878 (How much reference is covered)
  F1:        0.901 (Balance)

Summary 7:
  Precision: 0.963 (How much info is correct)
  Recall:    0.878 (How much reference is covered)
  F1:        0.918 (Balance)

Summary 8:
  Precision: 0.864 (How

In [13]:
baseline = [" ".join(text.split(".")[:3]) for text in refs]
P_base, R_base, F1_base = score(baseline, refs, lang="en")

if F1.mean() > F1_base.mean():
    print("Your model outperforms baseline!")
else:
    print("Try improving your model architecture.")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Try improving your model architecture.


In [16]:
from google.colab import files
import shutil

# # Создаем ZIP-архив папки
shutil.make_archive('/content/t5-2000-summarizer', 'zip', '/content/t5-2000-summarizer')
# shutil.make_archive('/content/t5-lora-summarizer', 'zip', '/content/t5-lora-summarizer')

# # Скачиваем архив
# files.download('/content/t5-lora-output.zip')
# files.download('/content/t5-lora-summarizer.zip')

'/content/t5-2000-summarizer.zip'