In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer
import os
import torch

# Fetch the raw (untokenized) WikiText-2 corpus; all available splits are returned together.
print("📥 Downloading WikiText-2...")
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

# Load the GPT-2 tokenizer and register '[PAD]' as its padding token so that
# examples can later be padded to a fixed length.
# NOTE: if '[PAD]' is not already in the vocabulary this adds a new token id,
# so any model consuming these ids must have an embedding table large enough
# to include it.
print("🔧 Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Intentionally left as the cell's final expression: the notebook displays the
# call's return value (the number of tokens that were added).
tokenizer.add_special_tokens(dict(pad_token="[PAD]"))

📥 Downloading WikiText-2...
🔧 Loading tokenizer...


1

In [6]:
# Every example is padded or truncated to exactly this many tokens.
block_size = 128


def tokenize(example):
    """Tokenize the ``"text"`` field into fixed-length encodings.

    Text that encodes to more than ``block_size`` tokens is cut off at
    ``block_size`` (the remainder is discarded); shorter text is padded up to
    ``block_size``.

    Args:
        example: Mapping with a ``"text"`` entry. The ``map`` call in this
            notebook uses ``batched=True``, so it receives a batch of texts.

    Returns:
        The output of the module-level ``tokenizer`` for that text.
    """
    encoding_options = {
        "max_length": block_size,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer(example["text"], **encoding_options)

# Tokenize the dataset.
# The raw WikiText-2 splits contain many blank lines (separators between
# headings and paragraphs). Because `tokenize` pads every example to a fixed
# length, each blank line would turn into a row made up solely of padding
# tokens, which carries no training signal. Drop blank / whitespace-only lines
# first; `dataset` itself is left untouched so this cell stays re-runnable.
print("🔠 Tokenizing dataset...")
tokenized_datasets = dataset.filter(
    lambda example: example["text"].strip() != ""
).map(
    tokenize,
    batched=True,
    # The raw text is no longer needed once it has been encoded.
    remove_columns=["text"],
)

🔠 Tokenizing dataset...


In [7]:

# Turn the token ids of the train and validation splits into int64 tensors.
# Each row was padded/truncated to the same length during tokenization, so the
# nested lists are rectangular.
print("🔄 Converting to PyTorch tensors...")
train_input_ids, val_input_ids = (
    torch.tensor(tokenized_datasets[split]["input_ids"], dtype=torch.long)
    for split in ("train", "validation")
)

# Write both tensors to ./data (created if missing) for the training step.
print("💾 Saving tokenized tensors to ./data")
os.makedirs("./data", exist_ok=True)
for tensor, destination in (
    (train_input_ids, "./data/wikitext2_train_tensor.pt"),
    (val_input_ids, "./data/wikitext2_val_tensor.pt"),
):
    torch.save(tensor, destination)

print("✅ All done. Tensors are ready for training.")

🔄 Converting to PyTorch tensors...
💾 Saving tokenized tensors to ./data
✅ All done. Tensors are ready for training.
