In [18]:
from datasets import load_dataset

ds = load_dataset("d0rj/samsum-ru")

Downloading readme: 100%|██████████| 2.21k/2.21k [00:00<00:00, 8.01MB/s]
Downloading data: 100%|██████████| 8.60M/8.60M [00:00<00:00, 8.99MB/s]
Downloading data: 100%|██████████| 472k/472k [00:00<00:00, 1.58MB/s]
Downloading data: 100%|██████████| 484k/484k [00:00<00:00, 2.29MB/s]
Generating train split: 100%|██████████| 14731/14731 [00:00<00:00, 200078.02 examples/s]
Generating validation split: 100%|██████████| 818/818 [00:00<00:00, 399225.12 examples/s]
Generating test split: 100%|██████████| 819/819 [00:00<00:00, 338303.62 examples/s]


In [19]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})

In [1]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.43.3
Uninstalling transformers-4.43.3:
  Successfully uninstalled transformers-4.43.3
Found existing installation: accelerate 0.33.0
Uninstalling accelerate-0.33.0:
  Successfully uninstalled accelerate-0.33.0
Collecting transformers
  Using cached transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
Collecting accelerate
  Using cached accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Using cached transformers-4.43.3-py3-none-any.whl (9.4 MB)
Using cached accelerate-0.33.0-py3-none-any.whl (315 kB)
Installing collected packages: accelerate, transformers
Successfully installed accelerate-0.33.0 transformers-4.43.3


In [2]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amoghagadde/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [4]:
import torch
torch.cuda.empty_cache()

In [5]:
model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
%pip install wget

Note: you may need to restart the kernel to use updated packages.


In [7]:
#dowload & unzip data

# !wget https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
# !curl -L -o summarizer-data.zip https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip
# !unzip summarizer-data.zip

In [8]:
dataset_samsum = load_from_disk('samsum_dataset')
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [9]:
split_lengths = [len(dataset_samsum[split])for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue:")

print(dataset_samsum["test"][1]["dialogue"])

print("\nSummary:")

print(dataset_samsum["test"][1]["summary"])

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [10]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 128, truncation = True )
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }
    

In [11]:
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)

In [12]:
dataset_samsum_pt["train"]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [13]:
# Training

from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [15]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    eval_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=4,
    fp16=False
) 

In [16]:

trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["test"], 
                  eval_dataset=dataset_samsum_pt["validation"]
                  )

In [17]:
# trainer.train()
torch.mps.empty_cache()

trainer.train()

  0%|          | 0/204 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 6.20 GB, other allocations: 2.57 GB, max allowed: 9.07 GB). Tried to allocate 375.40 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).