# LoRA on GPT2 model

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling 
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## 1. Load tokenizer and model
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token ## for padding
model = AutoModelForCausalLM.from_pretrained(model_name)

In [6]:
## Apply LoRA to model
lora_config = LoraConfig (
    r=8,  # LoRA rank
    lora_alpha=16,
    target_modules=["c_attn"], # Target attention layers in GPT-2
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364




In [None]:
#3. dataset (Toy)
data = [
    {"text": "Question: What is the capital of India?\nAnswer: New Delhi"},
    {"text": "Question: Who wrote Romeo and Juliet?\nAnswer: William Shakespeare"},
    {"text": "Question: What's the boiling point of water?\nAnswer: 100°C"},
]

from datasets import Dataset
dataset = Dataset.from_list(data)

def tokenize_function(ex):
    return tokenizer(ex["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 3/3 [00:00<00:00, 161.68 examples/s]


In [8]:
tokenized_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 3
})

In [9]:
#4. Training setup
training_args = TrainingArguments (
    output_dir="./lora-gpt2",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=1,
    save_steps=5,
    save_total_limit=2,
    fp16=True,
    logging_dir='./logs',
    report_to="none"
)

trainer = Trainer(
    model = model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False) 
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
#5. Train
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,3.4268
2,3.814
3,4.3651
4,3.3729
5,3.5433
6,3.1584
7,3.5505
8,3.2411
9,3.5892
10,3.4535




TrainOutput(global_step=10, training_loss=3.5514732122421266, metrics={'train_runtime': 12.0766, 'train_samples_per_second': 1.242, 'train_steps_per_second': 0.828, 'total_flos': 983242506240.0, 'train_loss': 3.5514732122421266, 'epoch': 5.0})

In [11]:
#6. save the model
model.save_pretrained(".lora-adapter")

# INFERENCE

In [None]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = PeftModel.from_pretrained(base_model, ".lora-adapter")


In [20]:
prompt = "Question: Who wrote Romeo and Juliet?"

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    temperature=0.7,         # controls randomness (lower = more conservative)
    top_k=50,                # only sample from top 50 words
    top_p=0.9,               # nucleus sampling (cumulative prob cutoff)
    repetition_penalty=1.5,  # discourage repeating same phrases
    do_sample=True           # use sampling instead of greedy decoding
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: Who wrote Romeo and Juliet?
There are several reasons why it is possible to write a romantic novel. Firstly, there's the fact that Shakespeare knew how long he wanted his work done; secondly people don't like short stories in which characters talk about their lives but have no interest
