In [None]:
# Pin fsspec to exactly 2023.9.0 and force a reinstall even if some version is already present.
# NOTE(review): presumably a compatibility pin for `datasets` — confirm it is still required.
%pip install fsspec==2023.9.0 --force-reinstall

In [None]:
pip install transformers datasets torch

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import pipeline
from datasets import load_dataset
import torch
from transformers import Trainer, TrainingArguments

In [None]:
# Tokenizer first: reuse the end-of-sequence token as the padding token so
# that a pad token is defined.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Pretrained 'gpt2' language-model weights that will be fine-tuned below.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
# Load the 'wikitext-2-raw-v1' configuration of the 'wikitext' dataset.
db = load_dataset('wikitext', 'wikitext-2-raw-v1')

In [None]:
# Convenience handles on the train and test splits of the loaded dataset.
train_db, test_db = db['train'], db['test']

In [None]:
# Display one training record (index 644) to inspect the raw data.
train_db[644]

In [None]:
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` entry of a batch.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        The tokenizer's output for those texts, with attention masks requested.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, return_attention_mask=True)
    return encoded

# Tokenize the dataset in batches; the raw "text" column is removed so only
# the tokenizer's outputs remain.
tokenized_datasets = db.map(tokenize_function, batched=True, remove_columns=["text"])

In [None]:
from itertools import chain

# Number of tokens in each training block produced by group_texts.
block_size = 1024


def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    All ``input_ids`` sequences (and, in parallel, all ``attention_mask``
    sequences) in the batch are joined end to end and split into consecutive
    chunks of ``block_size`` tokens. Trailing tokens that do not fill a whole
    block are dropped, so a batch with fewer than ``block_size`` tokens
    produces no rows at all.

    Args:
        examples: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries, each a list of per-text lists.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
        lists of blocks; ``labels`` holds independent copies of the
        ``input_ids`` blocks.
    """
    # chain.from_iterable flattens in linear time, whereas
    # sum(list_of_lists, []) re-copies the growing accumulator on every
    # addition (quadratic in the number of tokens).
    concatenated_input_ids = list(chain.from_iterable(examples["input_ids"]))
    concatenated_attention_mask = list(chain.from_iterable(examples["attention_mask"]))

    # Largest multiple of block_size that fits; the remainder is discarded.
    total_length = (len(concatenated_input_ids) // block_size) * block_size
    block_starts = range(0, total_length, block_size)

    result = {
        "input_ids": [
            concatenated_input_ids[start: start + block_size]
            for start in block_starts
        ],
        "attention_mask": [
            concatenated_attention_mask[start: start + block_size]
            for start in block_starts
        ],
    }
    # Copy each block so labels never alias the input_ids lists.
    result["labels"] = [list(block) for block in result["input_ids"]]
    return result

# Regroup the tokenized texts into fixed-size blocks, processing 1000 rows
# per call to group_texts.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000)

In [None]:
# Display one grouped training block (index 43) to inspect the result of group_texts.
lm_datasets['train'][43]

In [None]:
# fp16 mixed precision needs a CUDA device; requesting it unconditionally
# makes the setup fail on a CPU-only machine, so enable it only when a GPU
# is actually available.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./finetuned_gpt2_wikitext2",  # checkpoints are written here
    eval_strategy='epoch',                    # evaluate at the end of every epoch
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    logging_steps=100,
    save_steps=500,
    fp16=use_fp16,
)

# Train on the grouped "train" split and evaluate on "validation".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# Persist the model and its tokenizer side by side in one directory.
save_dir = "./finetuned_gpt2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
import math
# Run the Trainer's evaluation and report perplexity, computed as
# exp(eval_loss) from the "eval_loss" entry of the returned metrics.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

# Grouped "test" split used for the top-k accuracy measurement.
eval_dataset = lm_datasets["test"]

# mlm=False: collate for causal (not masked) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# One block per batch, assembled by the collator above.
eval_loader = DataLoader(eval_dataset, batch_size=1, collate_fn=data_collator)
import torch
from tqdm import tqdm

def evaluate_topk_accuracy(model, dataloader, k=5, ignore_index=-100):
    """Return the fraction of next-token targets found in the model's top-k predictions.

    For every batch the logits at position ``t`` are compared with the label
    at position ``t + 1``. Targets equal to ``ignore_index`` are excluded from
    both the numerator and the denominator.

    Args:
        model: Callable model exposing ``eval()``, ``device`` and returning an
            object with a ``logits`` tensor of shape (batch, seq, vocab).
        dataloader: Iterable of batches with ``"input_ids"`` and ``"labels"``
            tensors; an ``"attention_mask"`` entry is forwarded to the model
            when present.
        k: Number of highest-scoring tokens counted as a hit. Values larger
            than the vocabulary size are clamped to it.
        ignore_index: Label value marking positions that must not be scored.

    Returns:
        float in [0, 1]; ``0.0`` when there is no scorable target at all
        (e.g. an empty dataloader).
    """
    model.eval()
    device = model.device
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Evaluating Top-{k} Accuracy"):
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            # Forward the attention mask only when the batch provides one, so
            # padded positions do not influence the predictions.
            extra_inputs = {}
            if "attention_mask" in batch:
                extra_inputs["attention_mask"] = batch["attention_mask"].to(device)

            logits = model(input_ids, **extra_inputs).logits

            # Shift logits & labels for next-token prediction.
            shifted_logits = logits[:, :-1, :]
            shifted_labels = labels[:, 1:]

            # The slices above are non-contiguous for batch sizes > 1, where
            # .view() raises; .reshape() copies only when it has to.
            flat_logits = shifted_logits.reshape(-1, shifted_logits.size(-1))
            flat_labels = shifted_labels.reshape(-1)

            # Drop ignored targets so they are not counted as misses.
            scored = flat_labels != ignore_index
            if not scored.any():
                continue
            targets = flat_labels[scored]
            candidates = flat_logits[scored]

            # torch.topk rejects k larger than the last dimension.
            top_k = min(k, candidates.size(-1))
            topk_preds = torch.topk(candidates, top_k, dim=-1).indices  # (tokens, k)

            correct += (topk_preds == targets.unsqueeze(-1)).any(dim=-1).sum().item()
            total += targets.numel()

    return correct / total if total else 0.0
# Compute top-5 next-token accuracy of `model` over eval_loader and print it as a percentage.
topk = 5
topk_acc = evaluate_topk_accuracy(model, eval_loader, k=topk)
print(f"Top-{topk} Accuracy after fine-tuning: {topk_acc:.2%}")