In [2]:
# Fine-tune GPT with LoRA (PEFT) - Colab-ready
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    pipeline
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

# ---------- User settings (edit if needed) ----------
MODEL = "gpt2"                # base model; change to gpt2-medium etc. if you have memory
TRAIN_FILE = "Training.txt"   # upload train.txt to Colab /content/
OUTPUT_DIR = "/content/gpt_lora_out"
BATCH_SIZE = 8               # per-device batch size (lower if OOM)
EPOCHS = 3
LEARNING_RATE = 2e-4
BLOCK_SIZE = 128             # max token length
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.1
SEED = 42
# ----------------------------------------------------

# reproducibility
torch.manual_seed(SEED)

# check GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# 1) load dataset (plain text: one example per line)
if not os.path.exists(TRAIN_FILE):
    raise FileNotFoundError(f"Training file not found: {TRAIN_FILE}. Upload train.txt to Colab /content/")

ds = load_dataset("text", data_files={"train": TRAIN_FILE})
print("Loaded dataset. Examples:", len(ds["train"]))

# 2) tokenizer & base model
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
# ensure pad token exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

model = AutoModelForCausalLM.from_pretrained(MODEL)
# resize embeddings if tokenizer changed
model.resize_token_embeddings(len(tokenizer))

# 3) apply LoRA (PEFT)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
)
model = get_peft_model(model, peft_config)
print("Applied LoRA. Trainable params (PEFT):", sum(p.numel() for p in model.parameters() if p.requires_grad))

# 4) tokenize dataset
def tokenize_batch(examples):
    out = tokenizer(examples["text"], truncation=True, max_length=BLOCK_SIZE, padding="max_length")
    out["labels"] = out["input_ids"].copy()
    return out

tokenized = ds["train"].map(tokenize_batch, batched=True, remove_columns=["text"])
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("Tokenized dataset sample:", tokenized[0]["input_ids"][:10])

# 5) data collator and training args
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_total_limit=3,
    save_strategy="epoch",
    report_to="none",   # disable wandb by default
    remove_unused_columns=False,
)

# 6) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)

# 7) Train
trainer.train()

# 8) Save LoRA adapters + tokenizer
os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)       # saves adapters + base model config required by peft
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved fine-tuned model to", OUTPUT_DIR)

# 9) Quick generation check (loads model with adapters)
# Load the base model and then the PEFT adapters
# Load the tokenizer used during training from the output directory
tokenizer_for_inference = AutoTokenizer.from_pretrained(OUTPUT_DIR)
base_model = AutoModelForCausalLM.from_pretrained(MODEL)
# Resize the base model's embeddings to match the tokenizer used during training
base_model.resize_token_embeddings(len(tokenizer_for_inference))
model_to_infer = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
model_to_infer = model_to_infer.merge_and_unload() # Merge LoRA weights for inference

# Use pipeline with the combined model
gen_device = 0 if torch.cuda.is_available() else -1
gen_pipe = pipeline("text-generation", model=model_to_infer, tokenizer=tokenizer_for_inference, device=gen_device)
prompt = "Write a short, friendly email saying you will be late to the meeting because"
print("Prompt:", prompt)
out = gen_pipe(prompt, max_length=120, do_sample=True, top_k=50, top_p=0.95, temperature=0.8, num_return_sequences=1)
print("\n=== Generated ===\n", out[0]["generated_text"])

Device: cpu
Loaded dataset. Examples: 10
Applied LoRA. Trainable params (PEFT): 589824




Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Tokenized dataset sample: tensor([16594,   257,   387, 28643,   546, 40048,    13,  8407,  1657, 42123])




Step,Training Loss




Saved fine-tuned model to /content/gpt_lora_out


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: Write a short, friendly email saying you will be late to the meeting because

=== Generated ===
 Write a short, friendly email saying you will be late to the meeting because there is a lot of work to be done and that the meeting will be cancelled.

Please note that any delay in posting a short email will result in an immediate cancellation and may result in your email being deleted.

You may contact us directly or email us at:

Shenzhen Hao University

Yen-Shan Hao University

Tel: +886-917-879-3660

Fax: +886-917-879-3660

Email: hao@yenghao.edu.cn

For more information on this topic, please see the Resources section of the Hao University website.


In [4]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

OUTPUT_DIR = "/content/gpt_lora_out"  # where the fine-tuned model was saved
MODEL = "gpt2" # specify the base model used for training

# pick GPU if available
device = 0 if torch.cuda.is_available() else -1

# Load the tokenizer used during training from the output directory
tokenizer_for_inference = AutoTokenizer.from_pretrained(OUTPUT_DIR)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(MODEL)

# Resize the base model's embeddings to match the tokenizer used during training
base_model.resize_token_embeddings(len(tokenizer_for_inference))

# Load the PEFT adapters and merge them into the base model
model_to_infer = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
model_to_infer = model_to_infer.merge_and_unload() # Merge LoRA weights for inference

# load pipeline with fine-tuned model
gen = pipeline("text-generation", model=model_to_infer, tokenizer=tokenizer_for_inference, device=gen_device) # Use gen_device from previous cell or define here

# try some prompts
prompts = [
    "Write a haiku about stars.",
    "Draft a polite email apologizing for being late.",
    "Explain binary search simply.",
    "Translate to a friendly tone: The meeting is cancelled."
]

for p in prompts:
    out = gen(p, max_length=80, do_sample=True, top_k=50, top_p=0.95, temperature=0.8)[0]["generated_text"]
    print("\nPrompt:", p)
    print("Generated:", out)

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Prompt: Write a haiku about stars.
Generated: Write a haiku about stars. "The stars are on the other side of the universe. We all have an idea about what it means to be in the other side of the universe."

Now, I am not saying that there is not possibility to be in the other side of the universe. There is. I am saying that we are all in the other side of the universe. I am saying that there is a space-time continuum where the universe is not parallel and there is no space-time continuum. The universe is in the other side of the universe and the universe is not parallel. In other words, there is no time-space continuum and there is no time-time continuum. If you are in the other side of the universe, there is no time-time continuum or space-time continuum.

For me, I was asked a great question by a friend about the time-space continuum. She said that the universe is in the other side of the universe and the universe is not parallel and there is no time-space continuum. I am also saying

Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Prompt: Draft a polite email apologizing for being late.
Generated: Draft a polite email apologizing for being late.

"I never have had any issues with you," she wrote. "I had a phone call from a guy who asked me if he could send me a message asking if I should send him a copy of my book, and I told him to. Then he started asking if I could help him figure out what he should do.

"I tried to explain that I was just going to send him a thank you email but they weren't going to let him do it, so I tried to work it out with him and get him to sign the thank you.

"I was told to make sure he read my book, but I didn't know how to do it, so I wrote the book for him. I was told to go back to the restaurant and get ready to send him the book. I didn't think there was any chance of him getting it and had to leave the restaurant. I had to tell him that I wanted him to read the book, so I told him that I would read it the next day, but I didn't want him to miss his opportunity to finish his boo

Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Prompt: Explain binary search simply.
Generated: Explain binary search simply. If a string is not found, the search is not complete, but can be completed. If the string contains a'', a search will be performed.

' : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '4'] : search [1, '2', '3', '

Prompt: Translate to a friendly tone: The meeting is cancelled.
Generated: Translate to a friendly tone: The meeting is cancelled. We are in a quiet room. The police are waiting for us. We need your help. Please take the time to read what happened. If you hear anything like this, please call the police.

After the meeting, the police wil