# <center> LoRA for GPT2 </center>

In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import math
import torch
import transformers
import datasets
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    default_data_collator,
    set_seed,
)
set_seed(42)
from peft import (
    LoraConfig,
    get_peft_model
)
import evaluate
import matplotlib.pyplot as plt

In [2]:
# Probe CUDA once, report the result, and derive the torch device from it
cuda_ready = torch.cuda.is_available()
print("GPU is available!" if cuda_ready else "GPU is not available.")

device = torch.device("cuda") if cuda_ready else torch.device("cpu")
print("Training on:", device)

GPU is available!
Training on: cuda


### 1. Load Model and Tokenizer

In [3]:
# Choose the base GPT-2 checkpoint; the tokenizer and the model are both loaded from this id
base_model_name_or_path = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(base_model_name_or_path)

# GPT-2 does not have a pad_token, so we reuse eos_token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Load GPT-2 with its language-modeling head
model = GPT2LMHeadModel.from_pretrained(
    base_model_name_or_path,   # Checkpoint id chosen above
    load_in_8bit=False,        # int8 quantization stays off
    torch_dtype=torch.float32, # Load the weights in full precision (float32)
    device_map="auto",         # Delegate device placement to the library (this is not a precision setting)
)

### 2. Prepare LoRA Configuration

In [4]:
lora_config = LoraConfig(
    # --- what gets adapted ---
    task_type="CAUSAL_LM",       # causal language modeling
    target_modules=["c_attn"],   # only modules named "c_attn" receive LoRA adapters
    fan_in_fan_out=True,         # per the PEFT docs: set for layers that store weights as (fan_in, fan_out), e.g. GPT-2's Conv1D
    # --- adapter size and regularisation ---
    r=8,                         # rank of the low-rank update matrices
    lora_alpha=32,               # LoRA scaling factor
    lora_dropout=0.05,           # dropout probability on the LoRA path
    bias="none",                 # bias option passed to PEFT ("none")
)

In [5]:
# Wrap the GPT-2 model into a PEFT LoRA model.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would pass an already-wrapped model back into get_peft_model. Reload the base
# model (or restart the kernel) before re-running.
model = get_peft_model(model, lora_config)

# Print trainable parameters to verify LoRA configuration
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


### 3. Load the Alpaca Dataset

In [6]:
# Load the Alpaca dataset from Hugging Face directly
dataset = load_dataset("tatsu-lab/alpaca")
print(dataset)

# instruction = a prompt that tells the model what task to perform or what question to answer
# input = additional context related to the instruction (may be an empty string)
# output = the reference response to the instruction and optional input
# text = instruction, input, and output rendered into a single prompt template

# Check the first example in the dataset
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


In [7]:
# One approach is to merge the prompt as "<instruction>\n[input]" and use output as the training target
def format_alpaca(example):
    """Turn one Alpaca row into a {"prompt", "output"} pair.

    The prompt always starts with an "Instruction: ..." line and ends with
    "Answer: ". An "Input: ..." line is inserted between them only when the
    row's `input` field contains something other than whitespace.

    Args:
        example: Mapping with string values under "instruction", "input"
            and "output".

    Returns:
        dict with the assembled "prompt" and the untouched "output" text.
    """
    extra_context = example["input"]

    # Assemble the prompt line by line; the optional input sits in the middle
    segments = [f"Instruction: {example['instruction']}"]
    if extra_context.strip():
        segments.append(f"Input: {extra_context}")
    segments.append("Answer: ")

    return {"prompt": "\n".join(segments), "output": example["output"]}

In [8]:
# Apply data formatting and remove the original columns, leaving only 'prompt' and 'output'.
# NOTE(review): `dataset` is overwritten here, so this cell cannot be re-run on its own:
# format_alpaca reads 'instruction' and 'input', which this step removes. Re-run the
# dataset loading cell first.
dataset = dataset.map(format_alpaca, remove_columns=dataset["train"].column_names)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['output', 'prompt'],
        num_rows: 52002
    })
})


### 4. Data Preprocessing and Splitting

In [9]:
# For GPT-2 causal LM training, merge prompt + output and then tokenize
def tokenize_function(example):
    """Tokenize a batch of prompt/output pairs as single concatenated strings.

    Args:
        example: Batch mapping whose "prompt" and "output" entries are
            equally ordered sequences of strings.

    Returns:
        Whatever the module-level `tokenizer` returns for the concatenated
        texts, truncated to at most 512 tokens per text.
    """
    merged_texts = []
    for prompt_text, answer_text in zip(example["prompt"], example["output"]):
        # Causal-LM training sees prompt and answer as one continuous sequence
        merged_texts.append(prompt_text + answer_text)

    return tokenizer(merged_texts, truncation=True, max_length=512)

# Tokenize the dataset; batched=True means tokenize_function receives columns as lists.
# No remove_columns is passed, so the 'prompt' and 'output' text columns presumably
# remain next to the tokenizer outputs (verify against the Trainer's column handling).
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [10]:
# GPT-2 is trained as a causal LM, so we use DataCollatorForLanguageModeling
# with mlm=False (no masked-LM objective).
# NOTE(review): pad_token was set to eos_token when the tokenizer was loaded, so padding
# and end-of-text share one id. Confirm how the collator treats those positions in labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
# Fixed, contiguous split of the tokenized "train" rows (no shuffling):
#   rows [0, 8000)      -> train_dataset
#   rows [8000, 10000)  -> eval_dataset
# Larger alternative kept for reference: rows [0, 40000) for training and
# rows [40000, 50000) for evaluation.
tokenized_train = tokenized_dataset["train"]
train_dataset = tokenized_train.select(range(0, 8000))
eval_dataset = tokenized_train.select(range(8000, 10000))

### 5. Training Arguments

In [12]:
# Hyper-parameters for the LoRA fine-tuning run (also reused for evaluation-only Trainers).
training_args = TrainingArguments(
    output_dir="lora-gpt2-alpaca",  # Directory for checkpoints and logs
    overwrite_output_dir=True,      # Overwrite the content of the output directory
    num_train_epochs=1,             # Number of training epochs
    per_device_train_batch_size=2,  # Batch size for training
    per_device_eval_batch_size=2,   # Batch size for evaluation
    eval_strategy="steps",          # Evaluate every `eval_steps` steps
    eval_steps=100,                 # Evaluation interval (optimizer steps)
    logging_steps=50,               # Logging interval (optimizer steps)
    save_steps=200,                 # Checkpoint interval (optimizer steps)
    save_total_limit=1,             # Keep at most one checkpoint on disk
    learning_rate=1e-4,             # Learning rate
    warmup_steps=100,               # Warmup steps
    # Request fp16 mixed precision only when a CUDA GPU is present, so the
    # notebook's CPU fallback does not ask for half precision on CPU.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step per device
)

### 6. Define Trainer and Start Training

In [13]:
# Build the Trainer that fine-tunes the LoRA-wrapped model
trainer = Trainer(
    model=model,                  # PEFT-wrapped GPT-2 returned by get_peft_model
    args=training_args,           # Hyper-parameters defined in the previous section
    train_dataset=train_dataset,  # Training slice of the tokenized data
    eval_dataset=eval_dataset,    # Held-out slice used for periodic evaluation
    processing_class=tokenizer,   # Tokenizer handed to the Trainer
    data_collator=data_collator,  # Causal-LM collator (mlm=False)
)

In [14]:
# Start training with the settings from training_args.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt; let it run to
# completion before relying on the save/evaluation cells that follow.
trainer.train()

  0%|          | 0/500 [00:00<?, ?it/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


KeyboardInterrupt: 

### 7. Save the Final Model with LoRA Weights

In [150]:
# Save the trainer's current model to its own directory, separate from the checkpoint dir.
# Presumably only the LoRA adapter weights are written, since `model` is PEFT-wrapped (TODO confirm).
trainer.save_model("lora-gpt2-alpaca-final")

### 8. Define Perplexity evaluation functions

In [151]:
def evaluate_and_compute_perplexity(trainer, eval_dataset):
    """
    Evaluate the given Trainer on eval_dataset and compute perplexity from eval_loss.

    Args:
        trainer: Object exposing ``evaluate(eval_dataset=...)`` that returns a
            dict of metrics (e.g. a ``transformers.Trainer``).
        eval_dataset: Dataset forwarded unchanged to ``trainer.evaluate``.

    Returns:
        float: ``exp(eval_loss)``. ``float("inf")`` is returned when the
        metrics contain no ``eval_loss`` or when the loss is too large for
        ``math.exp`` to represent.
    """
    eval_results = trainer.evaluate(eval_dataset=eval_dataset)
    eval_loss = eval_results.get("eval_loss")

    if eval_loss is None:
        # Formatting None with ":.4f" raises TypeError, so report the missing
        # loss explicitly instead of falling through to the numeric print.
        print("eval_loss = n/a, perplexity = inf")
        return float("inf")

    try:
        # Perplexity is the exponential of the mean cross-entropy loss
        perplexity = math.exp(eval_loss)
    except OverflowError:
        # A diverged run can report a loss beyond exp()'s float range
        perplexity = float("inf")

    print(f"eval_loss = {eval_loss:.4f}, perplexity = {perplexity:.4f}")

    return perplexity

In [152]:
# Load a fresh copy of the base checkpoint; get_peft_model is not applied to it,
# so it serves as the un-adapted baseline.
base_model = GPT2LMHeadModel.from_pretrained(
    base_model_name_or_path,
    torch_dtype=torch.float32,
    device_map="auto",
)

# Evaluation-only Trainer: no train_dataset is supplied. It reuses training_args,
# the collator and the tokenizer so the eval setup matches the LoRA trainer's.
base_trainer = Trainer(
    model=base_model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    eval_dataset=eval_dataset,
)

# Evaluate base GPT-2 model on eval_dataset and compute perplexity
base_perplexity = evaluate_and_compute_perplexity(base_trainer, eval_dataset)
print(f"Base GPT-2 model perplexity: {base_perplexity:.4f}\n")

  0%|          | 0/1000 [00:00<?, ?it/s]

eval_loss = 3.2037, perplexity = 24.6231
Base GPT-2 model perplexity: 24.6231



In [153]:
# Evaluate the trained model on eval_dataset and compute perplexity.
# NOTE(review): the recorded output shows the LoRA model (perplexity ~1728) doing far
# worse than the untouched base model (~24.6), and the training cell's saved output ends
# in KeyboardInterrupt. Treat this number as suspect until training has been re-run to
# completion from a fresh kernel.
final_perplexity = evaluate_and_compute_perplexity(trainer, eval_dataset)
print(f"Trained model perplexity: {final_perplexity:.4f}\n")

  0%|          | 0/1000 [00:00<?, ?it/s]

eval_loss = 7.4547, perplexity = 1727.9251
Trained model perplexity: 1727.9251

