In [1]:
import tqdm

# Hook tqdm into pandas (adds the `progress_apply` family of methods).
# NOTE(review): no visible cell uses pandas progress bars — confirm this is still needed.
tqdm.tqdm.pandas()

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, TextDataset, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Prefer the GPU whenever torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fixed sequence length used for tokenization and padding throughout the notebook.
max_length = 1024

In [3]:
# Model / tokenizer checkpoint; other variants such as "gpt2-medium" or
# "gpt2-large" can be substituted here.
model_name = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
# GPT-2 has no dedicated pad token, so EOS is reused for padding. No new token
# is added to the vocabulary, so the embedding matrix is left at its original size.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

# Collator configured to pad every batch out to `max_length`.
# NOTE(review): the Seq2SeqTrainer built later in this notebook is not given
# this collator — confirm whether it is meant to be passed as `data_collator=`.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='max_length',
    max_length=max_length
)

In [8]:
from datasets import load_dataset

# Fetch the SAMSum corpus; the result is a DatasetDict with
# train / test / validation splits (see the output below).
dataset = load_dataset(path='samsum')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [12]:
# Prompt template wrapped around every dialogue.
prefix = "summarize the following document:\n\n"
suffix = '\n\nsummary:'


def preprocess_function(examples):
    """Tokenize a batch of rows into fixed-length prompt ids and label ids.

    Args:
        examples: Batch dict as passed by `dataset.map(..., batched=True)`.
            Reads the 'dialogue' and 'summary' columns (lists of strings).

    Returns:
        The same dict with two columns added, each a list of
        `max_length`-long lists of ints:

        - 'input_ids': tokens of ``prefix + dialogue + suffix``, truncated and
          padded to `max_length`.
        - 'labels': tokens of the summary, truncated to `max_length` and
          right-padded with -100.

    NOTE(review): 'labels' occupies the same positions as the prompt tokens in
    'input_ids' rather than following them, and no attention mask is kept.
    Confirm this layout is what the downstream causal-LM loss and generation
    expect.
    """
    prompts = [prefix + doc + suffix for doc in examples['dialogue']]
    examples['input_ids'] = tokenizer(
        prompts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )['input_ids']

    # `text_target=` already requests target-side tokenization, so the
    # deprecated `as_target_tokenizer()` context manager is not needed.
    # `truncation=True` guarantees no label row exceeds `max_length`.
    label_ids = tokenizer(
        text_target=examples['summary'],
        max_length=max_length,
        truncation=True,
    )['input_ids']

    # Pad each row independently with the ignore index (-100); unlike indexing
    # the first row, this also works when the batch is empty.
    examples['labels'] = [
        ids + [-100] * (max_length - len(ids)) for ids in label_ids
    ]
    return examples

In [5]:

# Tokenize every split in batches, then drop the raw text columns so that only
# the 'input_ids' and 'labels' features remain.
tokenized_dataset = dataset.map(preprocess_function, batched=True).remove_columns(
    ['id', 'dialogue', 'summary']
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 818
    })
})

In [6]:
# Sanity check: length of the first training example's label row.
len(tokenized_dataset['train'][0]['labels'])

1024

In [7]:
# Shuffle with a fixed seed so the training subset selected below is the same
# on every run.
tokenized_dataset = tokenized_dataset.shuffle(seed=42)
# Keep shard 0 of 10 of the training split to shorten experiments.
# NOTE: re-running this cell shards the already-sharded split again; re-run the
# tokenization cell first to get back to the full training set.
tokenized_dataset['train'] = tokenized_dataset['train'].shard(num_shards=10, index=0)
tokenized_dataset.shape

{'train': (1474, 2), 'test': (819, 2), 'validation': (818, 2)}

In [None]:
import numpy as np
import sacrebleu
def compute_metrics(eval_pred):
    """Compute corpus-level BLEU between decoded predictions and references.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays supplied
            by the trainer. Entries equal to -100 are treated as padding in
            both arrays.

    Returns:
        dict with a single key 'bleu' holding the sacreBLEU corpus score.

    NOTE(review): with a decoder-only model the generated sequences presumably
    still contain the prompt tokens, which would lower BLEU against the bare
    summary — confirm and strip the prompt if so.
    """
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple; the token ids are the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id, so swap it for the pad token in both arrays
    # before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score (single reference stream).
    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])

    return {
        'bleu': bleu.score,
    }

In [0]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,

    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,

    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",

    num_train_epochs=3,

    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate via generation so that compute_metrics receives token ids.
    predict_with_generate=True,
    # Mixed precision is only enabled when a CUDA device was detected;
    # hard-coding True makes argument construction fail on CPU-only machines.
    fp16=(device == "cuda"),
    disable_tqdm=False,
    logging_steps=1,
)

# Initialize the Trainer
# NOTE(review): neither the tokenizer nor the `data_collator` built earlier is
# passed here — confirm the default collation is intended.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
)

In [11]:

# Start training. Per the training arguments, evaluation and checkpointing run
# once per epoch. The captured output below ends in a KeyboardInterrupt, i.e.
# this run was stopped manually.
trainer.train()

Epoch,Training Loss,Validation Loss


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` t

KeyboardInterrupt: 

In [19]:
from transformers import pipeline
# Text-generation pipeline built from the 'gpt2-medium' checkpoint name
# (not from the `model` object configured for training above).
pipe = pipeline(task='text-generation', model='gpt2-medium')

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
# Prompt in the same prefix/suffix format used for preprocessing, built from the
# second training dialogue.
text = f"{prefix}{dataset['train']['dialogue'][1]}{suffix}"

In [20]:
# Alternative prompt: the same dialogue followed by a "TL;DR:" cue.
text = f"{dataset['train']['dialogue'][1]}\nTL;DR:\n"

In [21]:
# Generate a continuation of the prompt; the returned 'generated_text' includes
# the prompt itself (see the output below).
pipe(text, clean_up_tokenization_spaces=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great\nTL;DR:\nOliver: The next generation'}]

In [22]:
# Show the exact prompt string that was passed to the pipeline.
text

'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great\nTL;DR:\n'