In [1]:
!pip install evaluate
!pip install py7zr
!pip install sacrebleu
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting py7zr
  Downloading py7zr-0.20.6-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodomex>=3.6.6 (from py7zr)
  Downloading pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting pyzstd>=0.14.4 (from py7zr)
  Downloading pyzstd-0.15.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (412 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.3/412.3 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?

# Initialization

In [2]:
import tqdm

tqdm.tqdm.pandas()

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

device = "cuda" if torch.cuda.is_available() else "cpu"
max_length = 512



Loading model and its tokenizer.

In [4]:
model_name = "google/pegasus-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.config.decoder_start_token_id = tokenizer.get_vocab()["<n>"]

Downloading (…)okenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Loading dataset

In [5]:
from datasets import load_dataset, DatasetDict

dataset = load_dataset('samsum')
dataset

Downloading builder script:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/770 [00:00<?, ?B/s]

Downloading and preparing dataset samsum/samsum (download: 2.81 MiB, generated: 10.04 MiB, post-processed: Unknown size, total: 12.85 MiB) to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/3f7dba43be72ab10ca66a2e0f8547b3590e96c2bd9f2cbb1f6bb1ec1f1488ba6...


Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Dataset samsum downloaded and prepared to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/3f7dba43be72ab10ca66a2e0f8547b3590e96c2bd9f2cbb1f6bb1ec1f1488ba6. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

# Data Preprocessing
Preprocess dataset:

In [6]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=64, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=['id', 'dialogue', 'summary']
    )

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
columns = ['input_ids', 'labels', 'attention_mask']
tokenized_dataset.set_format(type='torch', columns=columns)

Removing some validation data to make evaluation process faster

In [7]:
tokenized_dataset = tokenized_dataset.shuffle()
tokenized_dataset['validation'] = tokenized_dataset['validation'].shard(num_shards=4, index=0)
tokenized_dataset.shape

{'train': (14732, 3), 'test': (819, 3), 'validation': (205, 3)}

# Metrics
Metric function

In [8]:
import numpy as np
import evaluate

rouge = evaluate.load("rouge")
blue = evaluate.load('bleu')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    blue_score = blue.compute(predictions=decoded_preds, references=decoded_labels)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()} | {'bleu': blue_score['bleu']}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

# Training

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Initialize optimizer and scheduler. I use the same scheduler as in paper "Attention is all you need".

In [11]:
from torch.optim import Adam
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler

class Scheduler(_LRScheduler):
    def __init__(self, optimizer: Optimizer, dim_embed: int, warmup_steps: int, last_epoch: int=-1, verbose: bool=False) -> None:
        self.dim_embed = dim_embed
        self.warmup_steps = warmup_steps
        self.num_param_groups = len(optimizer.param_groups)
        super().__init__(optimizer, last_epoch, verbose)

    def get_lr(self) -> float:
        lr = calc_lr(self._step_count, self.dim_embed, self.warmup_steps)
        return [lr] * self.num_param_groups

def calc_lr(step, dim_embed, warmup_steps):
    return dim_embed**(-0.5) * min(step**(-0.5), step * warmup_steps**(-1.5))


optimizer = Adam(model.parameters(), lr=1e-9, betas=(0.9, 0.98))

train_batch_size = 4
val_batch_size = 1
num_epochs = 6

scheduler = Scheduler(
    optimizer,
    warmup_steps=0.1 * num_epochs * (tokenized_dataset['train'].num_rows / train_batch_size),
    dim_embed=1024
)

In [12]:
from transformers import GenerationConfig

gen_config = GenerationConfig(
    bos_token_id=model.config.decoder_start_token_id,
)

Setup training arguments and trainer:

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,

    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,

    evaluation_strategy="epoch",
    save_strategy="epoch",

    num_train_epochs=num_epochs,

    predict_with_generate=True,

    disable_tqdm=False,
    generation_config=gen_config,
    fp16=True,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)
)

Fine-tune the model:

In [14]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu
1,1.9764,1.726112,0.4931,0.2427,0.4085,0.407,64.0,0.166814
2,1.6136,1.64437,0.4866,0.24,0.3951,0.3943,64.0,0.157319
3,1.2659,1.658141,0.4877,0.2498,0.402,0.4019,64.0,0.176767


KeyboardInterrupt: 

# Results

In [15]:
import os
for dirname, _, filenames in os.walk('/kaggle/working/output/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/working/output/checkpoint-3683/scheduler.pt
/kaggle/working/output/checkpoint-3683/optimizer.pt
/kaggle/working/output/checkpoint-3683/trainer_state.json
/kaggle/working/output/checkpoint-3683/rng_state.pth
/kaggle/working/output/checkpoint-3683/pytorch_model.bin
/kaggle/working/output/checkpoint-3683/training_args.bin
/kaggle/working/output/checkpoint-3683/config.json
/kaggle/working/output/checkpoint-3683/generation_config.json
/kaggle/working/output/runs/Nov05_15-08-14_420d3b5a0eaa/events.out.tfevents.1699196899.420d3b5a0eaa.32.0
/kaggle/working/output/checkpoint-7366/scheduler.pt
/kaggle/working/output/checkpoint-7366/optimizer.pt
/kaggle/working/output/checkpoint-7366/trainer_state.json
/kaggle/working/output/checkpoint-7366/rng_state.pth
/kaggle/working/output/checkpoint-7366/pytorch_model.bin
/kaggle/working/output/checkpoint-7366/training_args.bin
/kaggle/working/output/checkpoint-7366/config.json
/kaggle/working/output/checkpoint-7366/generation_config.json
/kaggle/wor

In [44]:
from transformers import pipeline
pipe = pipeline('summarization', model='/kaggle/working/output/checkpoint-11049', tokenizer=tokenizer)

In [46]:
text = tokenized_dataset['test']['input_ids'][:10]

Prediction:

In [47]:
pipe(tokenizer.batch_decode(text))

Your max_length is set to 64, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 64, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)


[{'summary_text': "<n> Mazie and her friends are having a drink tonight. Lee can't stand some of her friends."},
 {'summary_text': "<n> There was a weird smell at Kasia's place last night. Clara suspects it was her boyfriend. Ron doesn't know what it was."},
 {'summary_text': "<n> The readings for the next session of Stephen's seminar are not in the print shop yet. Rita will go to the shop tomorrow morning and let them know if the texts are available."},
 {'summary_text': "<n> Marleen will be in half an hour to go to town. Barry has no idea what to buy Rita for Christmas. Marleen is at Kaiser's. Marleen and Barry will go for 50/50."},
 {'summary_text': '<n> Sam started the career mode in 18 months.'},
 {'summary_text': '<n> There was a big media fuss about a meet and greet with James Charles in Birmingham. There were 8000 people in the mall. A host from LBC tried to answer the question "Who is James Charles".'},
 {'summary_text': '<n> Patricia sent a file to Elle and Florence about Fai

Target:

In [48]:
tokenizer.batch_decode(tokenized_dataset['test']['labels'][:10])

["Mazie and her girlfriends are having a drink tonight. Lee doesn't like some of her friends, for example Sheryl. Lee had spilt a drink on her.</s>",
 "Clara and Ron are wondering what that weird smell at Kasia's place last night was.</s>",
 "Chae-yeong and Arthur inform Mariana that the readings for the next session of Stephen's seminar weren't available in the print shop on Monday. Rita decides to go and chcek if the situation's changed tomorrow and she'll let everyone know.</s>",
 "Marleen will be leaving in half an hour to go to town. She will get a present for Rita - silk kerchief with a yellow pattern from Kaiser's. It costs 39, and Barley will share the cost 50/50 with Marleen.</s>",
 'Sam started a career mode in FIFA.</s>',
 'There was a meet-and-greet with James Charles in Birmingham which gathered 8000 people.</s>',
 'Patricia is recommending a fair-trade brand to Elle and Florence.</s>',
 'Eva is at a party, while Olivia is taking care of her daughter, Linta. Eva will leave

Dialogue:

In [49]:
tokenizer.batch_decode(text)

["summarize: Mazie: Me and mah girls are having a drink tonight. Lee: Oh really? Where? Mazie: Wouldn't you like to know... Lee: Yes! So I can stay away! Mazie: That's cold. Lee: True, can't stand some of your friends. Mazie: OIC...who? Lee: Sheryl for one. She's a b-word that rhymes with witch. Mazie: You just saw her on a bad day. Lee: Don't think so! Mazie: You did spill a drink on her. Lee: Well... Mazie: You did! Lee: I said sorry! Mazie: LOL</s>",
 "summarize: Clara: Did you notice that weird smell at Kasia's place last night? Ron: YES!!!!!! I didn't want to say anything about it, though. I didn't want to be rude. Clara: I think it was her 21 cats roaming around lol Ron: lol don't say that, those cats were cute. Clara: so what? they can still smell Ron: i think it was her sleazy boyfriend Clara: lol you're bad Ron: jk Clara: in all honesty I don't know what it was. Ron: i guess we'll never know</s>",
 'summarize: Mariana: Hi, just a quick question. Do you know if the readings for

Conclusion: 
We evaluate our model using BLUE and ROUGE scores. Let's describe their pros and cons in general:

BLEU (Bilingual Evaluation Understudy) Score:
- Pros:
  - It's a fast and cost-effective way to measure the quality of Machine Translation output.
  - It's language-independent and correlates positively with human evaluation.
- Cons:
  - BLEU score measures syntactical matches rather than semantics.
  - It doesn't manage different words that have the same meaning.
  - It's not a percentage measure of accuracy.

ROUGE (Recall-Oriented Understudy for Gisting Evaluation) Score:
- Pros:
  - It correlates positively with human evaluation, it’s inexpensive to compute and language-independent.
  - It measures the longest matching sequence of words using LCS (Longest Common Subsequence), which does not require consecutive matches but in-sequence matches that reflect sentence-level word order.
- Cons:
  - ROUGE does not manage different words that have the same meaning, as it measures syntactical matches rather than semantics.
  - It focuses on recall, which means it measures how much the words (and/or n-grams) in the human references appear in the candidate model outputs. This might not be ideal in scenarios where precision is also important.

We can clearly see all the pros and cons of these metrics on generated results of our model. 
BLUE score is low due to the fact that model can't clearly generate grammar and sometimes describe semantic with not exact words, but synonyms.
ROUGE score is higher, because there are a lot of cases when model generates exact subsequences as in target. For example:
   - Target: **Mazie and her girlfriends are having a drink tonight**. Lee doesn't like **some of her friends**, for example Sheryl. Lee had spilt a drink on her.
   - Prediction: **Mazie and her friends are having a drink tonight**. Lee can't **stand some of her friends**.


# LoRa attempt

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
max_length = 512

model_name = "google/pegasus-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.config.decoder_start_token_id = tokenizer.get_vocab()["<n>"]

dataset = load_dataset('samsum')

prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=64, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=['id', 'dialogue', 'summary']
)

columns = ['input_ids', 'labels', 'attention_mask']
tokenized_dataset.set_format(type='torch', columns=columns)

In [None]:
rouge = evaluate.load("rouge")
blue = evaluate.load('bleu')


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    blue_score = blue.compute(predictions=decoded_preds, references=decoded_labels)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()} | {'bleu': blue_score['bleu']}

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def print_trainable_parameters(model):
    """ 
    Prints the number of trainable parameters in the model. 
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", 'fc1', 'fc2'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
from transformers import AdamW, get_cosine_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=0.0001)

train_batch_size = 4
val_batch_size = 1
num_epochs = 4

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1 * num_epochs * (tokenized_dataset['train'].num_rows / train_batch_size),
    num_training_steps=num_epochs * (tokenized_dataset['train'].num_rows / train_batch_size)
)

In [None]:
from transformers import GenerationConfig

gen_config = GenerationConfig(bos_token_id=model.config.decoder_start_token_id)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,

    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,

    evaluation_strategy="epoch",
    save_strategy="epoch",

    num_train_epochs=num_epochs,

    predict_with_generate=True,

    disable_tqdm=False,
    generation_config=gen_config,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)
)

In [None]:
trainer.train()

Model was successfully trained, but I can't find the way how load it from checkpoint and make predictions on test data. 