In [None]:
%pip install --upgrade \
    torch torchvision \
    --index-url https://download.pytorch.org/whl/cu118


In [None]:
!pip install -U transformers accelerate


In [None]:
%pip install --upgrade transformers datasets tokenizers torch

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer


In [None]:
# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Padding to a fixed length needs a pad token; when the tokenizer has none,
# register its end-of-sequence token as the pad token.
if tokenizer.pad_token is None:
    pad_token_spec = {"pad_token": tokenizer.eos_token}
    tokenizer.add_special_tokens(pad_token_spec)

In [None]:
def tokenize_function(examples, max_length=32):
    """Tokenize text for causal-LM training and build padding-aware labels.

    Args:
        examples: Mapping with a ``"text"`` entry: either a list of strings (the
            batched form used by ``dataset.map(..., batched=True)``) or a single string.
        max_length: Fixed length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry. Labels equal
        ``input_ids`` on real tokens and -100 on padded positions, so padding is
        ignored by the loss. A text that yields no real tokens (e.g. a blank line)
        ends up fully masked and contributes nothing to the loss.
    """
    tokenized_batch = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )

    def _mask_padding(input_ids, attention_mask):
        # Keep the token id where the attention mask is set, otherwise use -100.
        return [
            token_id if keep else -100
            for token_id, keep in zip(input_ids, attention_mask)
        ]

    input_ids = tokenized_batch["input_ids"]
    attention_mask = tokenized_batch["attention_mask"]
    if isinstance(examples["text"], str):
        # Single (non-batched) example: the tokenizer returns flat lists.
        tokenized_batch["labels"] = _mask_padding(input_ids, attention_mask)
    else:
        tokenized_batch["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    return tokenized_batch


In [None]:
# Drop rows whose text is empty or whitespace-only before tokenizing: such rows
# would become sequences made purely of padding and carry no training signal.
tokenized = dataset.filter(
    lambda example: example["text"].strip() != ""
).map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)


In [None]:
!pip install evaluate

In [None]:
# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): `expandable_segments:True` is presumably here to reduce memory
# fragmentation; it should be set before the first CUDA allocation to take effect.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
import torch

# memory_summary() needs an initialised CUDA context and raises on machines
# without a usable GPU, so only call it when CUDA is actually available.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary())
else:
    print("CUDA is not available; skipping torch.cuda.memory_summary().")


In [None]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Load the pretrained "gpt2" checkpoint with its language-modelling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Quick sanity output showing which torch build the kernel picked up.
torch_version = torch.__version__
print(" Imported OK, torch version:", torch_version)


In [None]:

# Training configuration.
# Evaluation and checkpointing run once per epoch. With the 1000-example subset
# selected later in this notebook (batch size 1 x 8 accumulation steps on a single
# device) an epoch is only ~125 optimizer steps, so a 500-step interval would never
# trigger and load_best_model_at_end would have no checkpoint to restore.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8 per device
    fp16=True,
    gradient_checkpointing=True,
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    # Move accumulated evaluation outputs to the CPU every few batches instead of
    # holding the full-vocabulary logits of the whole eval set on the GPU.
    eval_accumulation_steps=8,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="perplexity",
    greater_is_better=False  # lower perplexity is better
)

In [None]:
import evaluate

In [None]:
# NOTE(review): this loads the "perplexity" metric from the `evaluate` library, but
# the object is not referenced by any other cell visible in this notebook
# (compute_metrics derives perplexity from the cross-entropy loss itself).
# Confirm whether this load is still needed.
perplexity = evaluate.load("perplexity")
import math
def compute_metrics(eval_pred, top_k=5):
    """Compute next-token perplexity and top-k accuracy from evaluation logits.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is a NumPy array
            of logits whose last axis is the vocabulary, or a tuple whose first
            element is that array. ``labels`` is the matching integer NumPy array;
            positions equal to -100 are ignored.
        top_k: Number of highest-scoring tokens considered a hit. Values larger
            than the vocabulary size are clamped to the vocabulary size.

    Returns:
        Dict with ``"perplexity"`` (exp of the mean cross-entropy over non-ignored
        positions, ``inf`` if the exponential overflows) and
        ``f"top_{top_k}_accuracy"``. Both values are NaN when every label is -100.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    logits = torch.from_numpy(predictions)
    labels = torch.from_numpy(labels)

    # Position t predicts token t + 1: drop the last logit and the first label.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous().long()
    vocab_size = shift_logits.size(-1)

    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(
        # Upcast so half-precision logits are reduced in float32.
        shift_logits.view(-1, vocab_size).float(),
        shift_labels.view(-1)
    )
    try:
        ppl = math.exp(loss.item())
    except OverflowError:
        # math.exp raises for very large losses; report infinite perplexity instead.
        ppl = float("inf")

    # topk() raises if k exceeds the size of the vocabulary axis.
    k = min(top_k, vocab_size)
    topk_inds = shift_logits.topk(k, dim=-1).indices
    topk_hits = topk_inds.eq(shift_labels.unsqueeze(-1)).any(dim=-1)
    valid_mask = shift_labels != -100
    topk_acc = topk_hits[valid_mask].float().mean().item()

    return {
        "perplexity": ppl,
        f"top_{top_k}_accuracy": topk_acc
    }


In [None]:
from transformers import DataCollatorForLanguageModeling

# mlm=False turns off masked-language-model masking in the collator.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:

# Subset sizes used to keep this run small.
small_train_size = 1000  # training examples
small_val_size = 250     # validation examples
small_test_size = 250    # test examples

# Shuffle each split with a fixed seed, then keep the first `size` rows.
small_train, small_val, small_test = (
    tokenized[split_name].shuffle(seed=42).select(range(size))
    for split_name, size in (
        ("train", small_train_size),
        ("validation", small_val_size),
        ("test", small_test_size),
    )
)

In [None]:

# Build the Trainer.
# The causal-LM collator created earlier in this notebook is passed explicitly:
# it rebuilds the labels from input_ids and sets padded positions to -100, which is
# what compute_metrics (ignore_index=-100) and the model's loss expect. Without it,
# padding tokens are scored as ordinary prediction targets.
# NOTE(review): recent transformers releases rename the `tokenizer` argument to
# `processing_class`; switch if the installed version warns about or rejects it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

**Note:** If the saved outputs are not visible and you run the whole notebook, the training cell below may prompt for a Weights & Biases (wandb) API key. The person running the notebook must supply their own key.

In [None]:
# Run training with the Trainer configured above; the returned training summary
# is displayed as the cell output.
trainer.train()

In [None]:
# Evaluate on the test subset and report the two metrics from compute_metrics
# (the Trainer prefixes metric names with "eval_").
eval_results = trainer.evaluate(eval_dataset=small_test)

print(f"Perplexity: {eval_results['eval_perplexity']:.2f}")
print(f"Top-5 Accuracy: {eval_results['eval_top_5_accuracy']:.4f}")

In [None]:
# Evaluate on the test subset again, print the complete metrics dictionary,
# then the top-5 accuracy on its own line.
eval_results = trainer.evaluate(eval_dataset=small_test)

print(eval_results)
print(f"Top-5 Accuracy: {eval_results['eval_top_5_accuracy']:.4f}")