In [4]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from tqdm import tqdm

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Avoid pad token issues
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
!nvidia-smi      # switching to 4t - gpu

Wed Jul  9 13:29:22 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P0             27W /   70W |     644MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train_texts = dataset["train"]["text"]

# Filter out empty lines
train_texts = [t for t in train_texts if len(t.strip()) > 0]

# Tokenize
tokenized = [tokenizer.encode(t, return_tensors="pt").squeeze(0) for t in train_texts if len(t) > 10]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [8]:
class GPTDataset(Dataset):
    def __init__(self, tokenized_texts, block_size=64):
        self.samples = []
        for text in tokenized_texts:
            for i in range(0, len(text) - block_size, block_size):
                self.samples.append(text[i:i+block_size])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return sample, sample  # input and target are same

train_dataset = GPTDataset(tokenized)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=lambda batch: pad_sequence([x[0] for x in batch], batch_first=True, padding_value=tokenizer.pad_token_id))

In [9]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(1):
    loop = tqdm(train_loader, desc="Training")
    for batch in loop:
        batch = batch.to(device)
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loop.set_postfix(loss=loss.item())


Training: 100%|██████████| 14062/14062 [23:18<00:00, 10.05it/s, loss=3.79]


In [10]:
model.eval()
prompt = "The future of AI is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The future of AI is not clear. The AI researchers have been working on AI for a long time, and have been working


In [11]:
import math

In [12]:
def evaluate_perplexity(model, eval_loader):
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(eval_loader, desc="Evaluating"):
            batch = batch.to(device)
            outputs = model(input_ids=batch, labels=batch)
            loss = outputs.loss
            total_loss += loss.item() * batch.size(0)
            total_tokens += batch.size(0)

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity

# Example usage
perplexity = evaluate_perplexity(model, train_loader)  # or test_loader
print(f"Perplexity: {perplexity:.2f}")

Evaluating: 100%|██████████| 14062/14062 [04:06<00:00, 56.96it/s]

Perplexity: 23.54





In [14]:
def evaluate_perplexity(model, eval_loader):
    import math
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for i, batch in enumerate(tqdm(eval_loader, desc="Evaluating")):
            batch = batch.to(device)
            outputs = model(input_ids=batch, labels=batch)
            loss = outputs.loss
            total_loss += loss.item() * batch.size(0)
            total_tokens += batch.size(0)

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity

# Usage:
perplexity = evaluate_perplexity(model, train_loader)
print(f"Perplexity (small eval): {perplexity:.2f}")

Evaluating: 100%|██████████| 14062/14062 [04:16<00:00, 54.79it/s]

Perplexity (small eval): 23.54





In [16]:
def top_k_accuracy(model, loader, k=5):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for i, batch in enumerate(tqdm(loader, desc="Top-k Accuracy")):
            batch = batch.to(device)
            outputs = model(batch)
            logits = outputs.logits

            shift_logits = logits[:, :-1, :]
            shift_labels = batch[:, 1:]

            top_k_preds = torch.topk(shift_logits, k, dim=-1).indices
            match = (top_k_preds == shift_labels.unsqueeze(-1)).any(dim=-1)
            correct += match.sum().item()
            total += match.numel()

    accuracy = correct / total
    return accuracy

# Usage:
acc = top_k_accuracy(model, train_loader, k=5)
print(f"Top-5 Accuracy (small eval): {acc:.2%}")

Top-k Accuracy: 100%|██████████| 14062/14062 [04:15<00:00, 54.93it/s]

Top-5 Accuracy (small eval): 63.30%



