# Load GPT-2 Model & Tokenizer

This step involves installing necessary libraries, and then loading the pre-trained GPT-2 model and its corresponding tokenizer from the Hugging Face transformers library. A padding token is also added to the tokenizer.

In [None]:
!pip install --upgrade transformers huggingface_hub fsspec datasets

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset, Dataset
import time

# Fetch the pre-trained GPT-2 tokenizer and language-model weights by name.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 has no padding token by default, so the end-of-sequence token is
# reused for padding (a common workaround).
tokenizer.pad_token = tokenizer.eos_token

print("GPT-2 model and tokenizer loaded successfully.")

# Loading a dataset

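This step checks that datasets can be loaded from the Hugging Face Hub: it first streams the `emotion` dataset as a diagnostic, and then loads the training split of `wikitext-2-raw-v1`. Any loading failure is reported instead of stopping the notebook.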

In [None]:
# Diagnostic step 1: stream an unrelated dataset. If this fails too, the
# problem is not specific to wikitext.
try:
    test_dataset = load_dataset("emotion", streaming=True, split="train")
    print("Successfully loaded a different dataset (emotion).")
    # To confirm the stream really yields data, iterate over `test_dataset`
    # and print the first handful of examples before breaking out.
except Exception as err:
    print(f"Failed to load the 'emotion' dataset: {err}")

# Diagnostic step 2: load the wikitext-2 training split and show its summary.
try:
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    print("Successfully loaded wikitext-2-raw-v1 dataset.")
    print(dataset)
except Exception as err:
    print(f"Failed to load the wikitext-2-raw-v1 dataset after updates: {err}")

# Prepare custom dataset

A custom dataset is defined here as a simple list of strings. It is then converted into a format compatible with the Hugging Face `datasets` library, tokenized, and grouped into fixed-size blocks.

In [None]:
# A tiny in-memory corpus: each string is one training example.
custom_texts = [
    "This is the first sentence of my custom dataset.",
    "Here is another example sentence for fine-tuning GPT-2.",
    "The third sentence continues the pattern of example data.",
    "More text to help the model learn from custom data.",
]

# `Dataset.from_dict` takes a mapping of column name -> list of values, so
# the sentences are placed in a single "text" column.
custom_data = dict(text=custom_texts)
custom_dataset = Dataset.from_dict(custom_data)

print("Custom dataset created successfully:")
print(custom_dataset)

# You might need to tokenize this dataset before training.
# The tokenization would depend on the specific task (e.g., text generation, classification).
# For language modeling, you would typically concatenate the texts and then tokenize.

def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the module-level tokenizer.

    Each sequence is truncated and padded to ``tokenizer.model_max_length``,
    so every returned example has the same length. The global ``tokenizer``
    must be defined before this function is called.
    """
    fixed_length = tokenizer.model_max_length
    texts = examples["text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=fixed_length,
    )


# Tokenize batch by batch; the raw "text" column is dropped so that only the
# tokenizer outputs remain in the dataset.
tokenized_custom_dataset = custom_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

print("\nTokenized custom dataset:")
print(tokenized_custom_dataset)

# If you intend to use this for language modeling, you'll likely need to
# prepare the data further, e.g., group and chunk texts.
# Example (this is a common pattern for causal language modeling):
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized sequences and re-split it into blocks.

    Args:
        examples: Batch mapping of column name -> list of per-example lists.
            Only the "input_ids" and "attention_mask" columns are used; any
            other column is ignored.
        block_size: Length of every output block. When None (the default),
            the module-level ``tokenizer.model_max_length`` is used
            (typically 1024 for gpt2).

    Returns:
        A dict holding the re-chunked "input_ids"/"attention_mask" columns
        plus a "labels" column that is an independent copy of "input_ids".
        Tokens after the last full block are dropped, so a batch shorter
        than one block produces empty columns.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    if block_size is None:
        block_size = tokenizer.model_max_length
    if block_size < 1:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    # Flatten each retained column in a single linear pass. The previous
    # sum(list_of_lists, []) re-copied the accumulator for every example,
    # which is quadratic in the number of tokens.
    concatenated = {
        column: [token for sequence in examples[column] for token in sequence]
        for column in examples.keys()
        if column in ("input_ids", "attention_mask")
    }

    # The length of the first retained column drives the chunking.
    total_length = len(list(concatenated.values())[0])
    # Round down to a whole number of blocks; the small remainder is dropped.
    usable_length = (total_length // block_size) * block_size

    result = {
        column: [
            tokens[start:start + block_size]
            for start in range(0, usable_length, block_size)
        ]
        for column, tokens in concatenated.items()
    }
    # Copy each block so that later edits to the labels cannot leak into
    # input_ids (a shallow copy of the outer list would share the blocks).
    result["labels"] = [block.copy() for block in result["input_ids"]]
    return result

# Re-chunk the tokenized examples into fixed-size blocks and attach the
# "labels" column produced by group_texts.
lm_dataset = tokenized_custom_dataset.map(
    group_texts,
    batched=True,
)

print("\nLanguage model prepared custom dataset:")
print(lm_dataset)

# Prepare Data for Language Modeling:
The tokenized custom dataset is further processed for causal language modeling. This involves concatenating the tokenized texts and splitting them into fixed-size chunks, which is standard practice when training language models.
# Fine-tune the GPT-2 Model:
This is the core training step. It involves defining the training arguments and setting up a `Trainer` object from the transformers library with the pre-trained model to be fine-tuned, the training arguments, the prepared dataset, and a data collator. The `trainer.train()` method is then called to perform the fine-tuning.
# Save the Fine-tuned Model:
After fine-tuning is complete, the trained model and its tokenizer are saved to a specified directory.

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, GPT2LMHeadModel, GPT2Tokenizer
import time

# Base checkpoint name, repeated here in case it was not defined earlier.
model_name = "gpt2"

# Load a fresh tokenizer and model; both must exist before the data collator
# and the Trainer are created.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
model = GPT2LMHeadModel.from_pretrained(model_name)

# Causal language modeling (GPT-2) uses mlm=False.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./gpt2_finetuned",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # Schedule and optimisation.
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Checkpointing and logging cadence (counted in update steps).
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=200,
    # No evaluation during training.
    eval_strategy="no",
)

# `lm_dataset` must already be defined by the data preparation cell.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_dataset,
)

# Fine-tune the model and report how long it took.
print("\nStarting fine-tuning...")
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments during training
# (time.time() follows the wall clock and can jump).
start_time = time.perf_counter()
trainer.train()
end_time = time.perf_counter()
print(f"Fine-tuning completed in {end_time - start_time:.2f} seconds.")

# Save the fine-tuned model and its tokenizer into the same directory so
# both can be reloaded from a single path.
trainer.save_model("./gpt2_finetuned_custom_data")
tokenizer.save_pretrained("./gpt2_finetuned_custom_data")

print("Fine-tuned model saved to ./gpt2_finetuned_custom_data")

# Load Fine-tuned Model and Generate Text:
This step involves loading the saved fine-tuned model and tokenizer. A function is defined to take a prompt and use the loaded model to generate new text based on the prompt. Different generation strategies like greedy search and beam search are demonstrated.

In [None]:
# Reload the tokenizer and model from the directory written after fine-tuning.
fine_tuned_model_path = "./gpt2_finetuned_custom_data"
tokenizer = GPT2Tokenizer.from_pretrained(fine_tuned_model_path)
model = GPT2LMHeadModel.from_pretrained(fine_tuned_model_path)

# Fall back to EOS for padding when the saved tokenizer config has no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Fine-tuned model and tokenizer loaded from {fine_tuned_model_path}")

# Function to generate text
def generate_text(prompt, max_length=50, num_return_sequences=1, temperature=1.0, top_k=50, top_p=0.95, do_sample=False):
    """Generate continuations of ``prompt`` and print them.

    Uses the module-level ``model`` and ``tokenizer``.

    With ``do_sample=False`` (the default) decoding is deterministic: greedy
    search for a single sequence, beam search with one beam per requested
    sequence otherwise. ``temperature``, ``top_k`` and ``top_p`` are sampling
    parameters and are only forwarded when ``do_sample=True``; previously
    they were accepted but had no effect on the output.

    Args:
        prompt: Text the generation starts from.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences: Number of sequences to generate and print.
        temperature: Sampling temperature (used only when ``do_sample``).
        top_k: Top-k filtering size (used only when ``do_sample``).
        top_p: Nucleus sampling threshold (used only when ``do_sample``).
        do_sample: Sample from the model instead of greedy/beam search.
    """
    # Calling the tokenizer directly replaces the deprecated encode_plus()
    # and returns the attention mask alongside the input ids.
    encoded_inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,  # no effect for a single prompt; kept for parity
        truncation=True,  # truncate if the prompt is too long
        max_length=tokenizer.model_max_length,
    )

    generation_kwargs = {
        "input_ids": encoded_inputs["input_ids"],
        "attention_mask": encoded_inputs["attention_mask"],
        "max_length": max_length,
        "num_return_sequences": num_return_sequences,
        "pad_token_id": tokenizer.eos_token_id,  # use EOS for padding
        "no_repeat_ngram_size": 2,  # prevent repeated bigrams
    }

    if do_sample:
        # Sampling can draw several distinct sequences without beams.
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
        )
    else:
        # Beam search needs at least as many beams as returned sequences.
        num_beams = max(num_return_sequences, 1)
        generation_kwargs["num_beams"] = num_beams
        if num_beams > 1:
            # early_stopping only applies to beam-based decoding; sending it
            # with a single beam triggers a configuration warning.
            generation_kwargs["early_stopping"] = True

    output_sequences = model.generate(**generation_kwargs)

    # Decode and print the generated text.
    print("\nGenerated Text:")
    for index, generated_sequence in enumerate(output_sequences, start=1):
        text = tokenizer.decode(
            generated_sequence.tolist(),
            clean_up_tokenization_spaces=True,
        )
        print(f"Sequence {index}: {text.strip()}")

# Demo 1: three continuations of one prompt. Requesting several sequences
# makes generate_text use beam search.
prompt = "This is a sentence to start the generation."
generate_text(prompt, num_return_sequences=3, max_length=100)

# Demo 2: a single continuation, which means one beam, i.e. greedy search.
prompt = "Here's another prompt:"
generate_text(prompt, max_length=80)