In [7]:
# Step 1: Install required libraries
!pip install --upgrade transformers torch datasets

# Step 2: Import necessary modules
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import Dataset
from torch.utils.data import DataLoader, Dataset as TorchDataset
from torch.optim import AdamW
from transformers import get_scheduler
import torch

# Step 3: Prepare your dataset (example with dummy data)
data = {
    "prompt": [
        "What is the capital of France?",
        "Explain quantum computing in simple terms.",
        "Who wrote the play Romeo and Juliet?",
        "What is the speed of light?",
    ],
    "completion": [
        "The capital of France is Paris.",
        "Quantum computing uses qubits to perform calculations.",
        "William Shakespeare wrote Romeo and Juliet.",
        "The speed of light is approximately 299,792 kilometers per second.",
    ],
}

# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_dict(data)

# Step 4: Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set padding token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

# Define a function to tokenize both prompt and completion
def tokenize_function(examples):
    inputs = [f"{prompt} {completion}" for prompt, completion in zip(examples["prompt"], examples["completion"])]
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=50, return_special_tokens_mask=True)

# Tokenize the dataset
tokenized_datasets = hf_dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "completion"])

# Step 5: Split dataset into train and test
split_dataset = tokenized_datasets.train_test_split(test_size=0.25, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

# Step 6: Create custom Dataset class and DataLoaders
class CustomTextDataset(TorchDataset):
    def __init__(self, tokenized_dataset):
        self.encodings = tokenized_dataset[:]

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

train_custom_dataset = CustomTextDataset(train_dataset)
test_custom_dataset = CustomTextDataset(test_dataset)

train_loader = DataLoader(train_custom_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_custom_dataset, batch_size=2)

# Step 7: Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.config.pad_token_id = tokenizer.pad_token_id

# Step 8: Set up optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 100
total_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Step 9: Training loop (simplified)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.train()
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Training
    total_train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Training loss: {avg_train_loss:.4f}")

    # Evaluation
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
            loss = outputs.loss
            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(test_loader)
    print(f"Evaluation loss: {avg_eval_loss:.4f}")

    model.train()



Map:   0%|          | 0/4 [00:00<?, ? examples/s]


Epoch 1/100
Training loss: 8.0272
Evaluation loss: 3.0073

Epoch 2/100
Training loss: 2.0695
Evaluation loss: 1.0596

Epoch 3/100
Training loss: 0.8281
Evaluation loss: 0.9044

Epoch 4/100
Training loss: 0.7848
Evaluation loss: 0.8509

Epoch 5/100
Training loss: 0.6123
Evaluation loss: 0.8148

Epoch 6/100
Training loss: 0.4986
Evaluation loss: 0.7853

Epoch 7/100
Training loss: 0.4284
Evaluation loss: 0.7580

Epoch 8/100
Training loss: 0.3947
Evaluation loss: 0.7329

Epoch 9/100
Training loss: 0.2913
Evaluation loss: 0.7074

Epoch 10/100
Training loss: 0.3122
Evaluation loss: 0.6857

Epoch 11/100
Training loss: 0.2180
Evaluation loss: 0.6680

Epoch 12/100
Training loss: 0.1663
Evaluation loss: 0.6572

Epoch 13/100
Training loss: 0.1446
Evaluation loss: 0.6511

Epoch 14/100
Training loss: 0.1877
Evaluation loss: 0.6491

Epoch 15/100
Training loss: 0.0880
Evaluation loss: 0.6491

Epoch 16/100
Training loss: 0.0871
Evaluation loss: 0.6527

Epoch 17/100
Training loss: 0.0769
Evaluation lo