In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)

: 

In [None]:
# Step 1: Load the dataset
# This example fine-tunes on the OpenWebText corpus. Only the "train" split is
# requested here; the held-out portion is carved out of it in the next step.
from datasets import load_dataset

raw_dataset = load_dataset(path="openwebtext", split="train")

In [None]:
# Step 2: Hold out 10% of the examples for evaluation and keep 90% for training.
# The fixed seed makes the shuffle/split reproducible across runs.
train_test_split = raw_dataset.train_test_split(seed=42, test_size=0.1)
raw_train_dataset, raw_test_dataset = train_test_split["train"], train_test_split["test"]

In [None]:
# Step 3: Load the tokenizer
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    TrainingArguments,
)

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Reuse the end-of-sequence token as the padding token so that the fixed-length
# padding requested during tokenization has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Step 4: Define the tokenization function
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch into fixed-length sequences.

    Args:
        examples: A batch mapping that provides the raw strings under the
            ``"text"`` key (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch when
        asked to truncate and pad every sequence to exactly 128 tokens.
    """
    tokenizer_options = {
        "padding": "max_length",  # pad short sequences up to max_length
        "truncation": True,       # cut long sequences down to max_length
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Step 5: Tokenize each split
# Both splits go through the same batched, 4-process mapping; the training split
# is processed first, then the test split.
tokenized_train_dataset, tokenized_test_dataset = (
    raw_split.map(preprocess_function, batched=True, num_proc=4)
    for raw_split in (raw_train_dataset, raw_test_dataset)
)

In [None]:
# Step 6: Load the pre-trained GPT-2 model with its language-modeling head
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Keep the embedding matrix in sync with the tokenizer's vocabulary size
# (matters only if tokens were added to the tokenizer).
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

In [None]:
# Step 7: Define Training Arguments
# `load_best_model_at_end=True` requires the checkpoint-saving strategy to match
# the evaluation strategy; otherwise TrainingArguments raises a ValueError at
# construction time. Evaluation runs once per epoch, so checkpoints are saved
# once per epoch as well (a step-based `save_steps` would be ignored here).
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save model checkpoints
    evaluation_strategy="epoch",     # Evaluate the model at the end of each epoch
    save_strategy="epoch",           # Save at the same cadence as evaluation
    learning_rate=5e-5,              # Learning rate
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=2,   # Batch size per GPU/TPU
    per_device_eval_batch_size=2,    # Batch size for evaluation
    gradient_accumulation_steps=4,   # Simulate larger batch sizes (2 * 4 per device)
    logging_dir="./logs",            # Directory for logs
    logging_steps=10,                # Log every 10 steps
    save_total_limit=2,              # Limit how many checkpoints are kept on disk
    load_best_model_at_end=True,     # Reload the best checkpoint (by eval metric) after training
)

In [None]:
# Step 8: Define the evaluation metric (perplexity) that can be passed to the trainer
import math
import numpy as np

def compute_metrics(eval_pred):
    """Compute causal-language-model perplexity from evaluation outputs.

    Perplexity is ``exp`` of the mean negative log-likelihood that the model
    assigns to each target token. For a causal LM the logits at position ``t``
    predict the token at position ``t + 1``, so logits and labels are shifted
    by one before being compared.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            shape ``(..., seq_len, vocab_size)`` (or a tuple whose first item
            is that array); ``labels`` is array-like with shape
            ``(..., seq_len)``. Label positions equal to ``-100`` are ignored.

    Returns:
        ``{"perplexity": float}``. The value is ``nan`` when no label position
        is scored and ``inf`` if the exponent overflows.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits; the logits come first.
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)
    labels = np.asarray(labels)

    # Align predictions with their targets: position t predicts token t + 1.
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    # -100 marks positions (e.g. padding) that must not contribute to the loss.
    keep = shift_labels != -100
    n_tokens = int(keep.sum())
    if n_tokens == 0:
        return {"perplexity": float("nan")}

    # Numerically stable log-softmax over the vocabulary axis.
    shifted = shift_logits - shift_logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))

    # Replace ignored labels with a valid index so the gather cannot go out of
    # bounds; those positions are dropped again by the `keep` mask below.
    safe_labels = np.where(keep, shift_labels, 0).astype(np.int64)
    token_log_probs = np.take_along_axis(log_probs, safe_labels[..., None], axis=-1)[..., 0]

    mean_nll = -float(token_log_probs[keep].sum()) / n_tokens
    try:
        perplexity = math.exp(mean_nll)
    except OverflowError:
        perplexity = float("inf")
    return {"perplexity": perplexity}

In [None]:
# Step 8.1: Trainer Initialization
# The tokenized datasets contain input_ids/attention_mask but no `labels`, and the
# model only returns a loss when labels are present. With mlm=False this collator
# builds causal-LM labels from input_ids and sets padding positions to -100 so
# they are ignored by the loss. Because the pad token is the EOS token here,
# genuine EOS tokens are masked out of the loss as well.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,  # 90% split tokenized in Step 5
    eval_dataset=tokenized_test_dataset,    # 10% held-out split tokenized in Step 5
    tokenizer=tokenizer,                    # Saved alongside checkpoints
    data_collator=data_collator,            # Supplies `labels` for the LM loss
    compute_metrics=None,                   # Replace with compute_metrics if desired
)

In [None]:
# Step 9: Start the training process
# Runs the fine-tuning loop on the `trainer` object created in the previous cell.
# The call's return value is the last expression in the cell, so the notebook
# displays it as the cell output.

trainer.train()

In [None]:
# Step 10: Save the fine-tuned model
# Weights/config and tokenizer files go to the same directory so the checkpoint
# can later be loaded from that single path.
model.save_pretrained(save_directory="./fine_tuned_gpt2")
tokenizer.save_pretrained(save_directory="./fine_tuned_gpt2")

In [None]:
# Step 11: Test the fine-tuned model

# Build a text-generation pipeline from the checkpoint saved in ./fine_tuned_gpt2,
# reusing the tokenizer object that is already in memory.
generator = pipeline(
    task="text-generation",
    model="./fine_tuned_gpt2",
    tokenizer=tokenizer,
)

# Generate one continuation of the prompt with max_length=50 and print the result.
text = generator("Once upon a time", num_return_sequences=1, max_length=50)
print(text)