In [1]:
import tqdm

tqdm.tqdm.pandas()

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, TextDataset, DataCollatorForSeq2Seq, Seq2SeqTrainer, \
    Seq2SeqTrainingArguments

device = "cuda" if torch.cuda.is_available() else "cpu"
max_length = 1024

In [3]:
# Set your model and tokenizer name
model_name = "gpt2"  # You can use other variants like "gpt2-medium", "gpt2-large" etc
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = model.config.eos_token_id
# model.resize_token_embeddings(len(tokenizer))

ValueError: Unrecognized configuration class <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> for this kind of AutoModel: AutoModelForSeq2SeqLM.
Model type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, SwitchTransformersConfig, T5Config, UMT5Config, XLMProphetNetConfig.

In [4]:
from datasets import load_dataset, DatasetDict

dataset = load_dataset('samsum')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [5]:
prefix = "summarize the following document:\n\n"
suffix = '\nTL;DR:\n'


def preprocess_function(examples):
    prompt = [doc + suffix for doc in examples['dialogue']]
    model_input = tokenizer(
        text=prompt, text_target=prompt, text_pair=examples['summary'], text_pair_target=examples['summary'],
        max_length=max_length, truncation=True
        )

    return model_input


In [17]:
def get_features(examples):
    prompt = [doc + suffix for doc in examples['dialogue']]
    model_input = tokenizer(
        text=prompt, text_target=examples['summary'], max_length=max_length, truncation=True
    )
    labels = [[-100] * len(sample[0]) + sample[1] for sample in zip(model_input['input_ids'], model_input['labels'])]
    model_input['input_ids'] = [sample[0] + sample[1] for sample in zip(model_input['input_ids'], model_input['labels'])]
    model_input['labels'] = labels
    model_input['attention_mask'] = [[1] * sample.count(-100) + [0] * (len(sample) - sample.count(-100)) for sample in model_input['labels']]

    return model_input
tokenized_train_dataset = dataset['train'].map(get_features, batched=True, remove_columns=['id', 'dialogue', 'summary'])
tokenized_valid_dataset = dataset['validation'].map(get_features, batched=True, remove_columns=['id', 'dialogue', 'summary'])

tokenized_dataset = DatasetDict({
    'train': tokenized_train_dataset,
    'validation': tokenized_valid_dataset
})
len(tokenized_dataset['train']['input_ids'][9]), len(tokenized_dataset['train']['labels'][9])

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

(283, 283)

In [14]:
def preprocess_function(examples):
    prompt = [doc + suffix for doc in examples['dialogue']]
    model_input = tokenizer(
        text=prompt, text_target=examples['summary'], max_length=max_length, truncation=True
    )
    return model_input

In [9]:
(tokenized_dataset['validation']['labels'][9])

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

In [18]:
tokenized_dataset = tokenized_dataset.shuffle()
tokenized_dataset['train'] = tokenized_dataset['train'].shard(num_shards=100, index=0)
tokenized_dataset.shape

{'train': (148, 3), 'validation': (818, 3)}

In [0]:
import numpy as np
import sacrebleu


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, :labels.shape[1]]
    predictions = np.where(labels != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels tensor with tokenizer.pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score
    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])

    return {
        'bleu': bleu.score,
    }

In [0]:
columns = ['input_ids', 'labels', 'attention_mask']
tokenized_dataset.set_format(type='torch', columns=columns)

In [0]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [0]:

from transformers import AdamW, get_cosine_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=0.001)

train_batch_size = 2
val_batch_size = 1
num_epochs = 10

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1 * num_epochs * (tokenized_dataset['train'].num_rows / train_batch_size),
    num_training_steps=num_epochs * (tokenized_dataset['train'].num_rows / val_batch_size)
)

In [0]:
from transformers import GenerationConfig

gen_config = GenerationConfig(max_new_tokens=20, max_length=-1)

In [0]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,

    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,

    evaluation_strategy="epoch",
    save_strategy="epoch",

    num_train_epochs=num_epochs,

    predict_with_generate=True,
    fp16=True,
    disable_tqdm=False,
    generation_config=gen_config,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)
)

In [19]:

# Start training
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from transformers import pipeline

pipe = pipeline('text-generation', model='gpt2-medium')

In [None]:
text = prefix + dataset['train']['dialogue'][1] + suffix

In [None]:
text = dataset['train']['dialogue'][1] + "\nTL;DR:\n"

In [None]:
pipe(text, clean_up_tokenization_spaces=True)

In [None]:
text