In [4]:
!pip install -q sentencepiece

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [4]:
!pip install -q protobuf

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
import os
import torch
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    TrainingArguments, Trainer, BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset

In [2]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16  # ok even if training args use fp16
)

model_name = "meta-llama/Llama-2-7b-chat-hf"  # same model you specified

In [3]:
# ---------- Model & tokenizer ----------
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# IMPORTANT: For LLaMA-2, use EOS as pad to avoid adding new tokens (and device-side asserts).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

# Make the model aware of the pad id
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# ---------- LoRA ----------
lora_config = LoraConfig(
    r=8,                  # slightly larger rank is typical; keep small if VRAM tight
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # same as your code
    task_type="CAUSAL_LM",
)
# Prepare for k-bit training (handles layer norms, gradients, etc.)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


In [5]:
# ---------- Data ----------
MAX_LENGTH = 256

# Keep IMDB (works fine for LM). Optionally, switch to wikitext by uncommenting the next two lines:
# dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")
# text_field = "text"
dataset = load_dataset("imdb", split="train[:5000]")  # smaller slice to fit 7B+LoRA memory
text_field = "text"

def tokenize_function(examples):
    out = tokenizer(
        examples[text_field],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",  # consistent shape for fp16/4-bit training
    )
    # For causal LM, labels are the input_ids (shift is done inside model)
    out["labels"] = out["input_ids"].copy()
    return out

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
# ---------- Training ----------
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,     # lower batch for 7B + 4-bit + LoRA on common GPUs
    gradient_accumulation_steps=8,     # effective batch = 16
    save_strategy="epoch",
    logging_steps=20,
    num_train_epochs=1,                # start small; increase once stable
    fp16=True,                         # mixed precision; ok with 4-bit quant
    bf16=False,                        # keep False unless your GPU supports bf16 well
    learning_rate=2e-4,                # typical LoRA LR
    weight_decay=0.0,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
20,3.1171
40,2.3022
60,2.221
80,2.1381
100,2.0952
120,2.1122
140,2.0728
160,2.1049
180,2.0805
200,2.1008


TrainOutput(global_step=313, training_loss=2.184054377741707, metrics={'train_runtime': 1110.7351, 'train_samples_per_second': 4.502, 'train_steps_per_second': 0.282, 'total_flos': 5.07766112256e+16, 'train_loss': 2.184054377741707, 'epoch': 1.0})

In [7]:
# ---------- Save locally ----------
model.save_pretrained("./llama2-imdb-lora")
tokenizer.save_pretrained("./llama2-imdb-lora")

print("✅ Training complete and model saved at ./llama2-imdb-lora")

✅ Training complete and model saved at ./llama2-imdb-lora


In [None]:
# Reload model for inference
from peft import PeftModel
from transformers import pipeline

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)
ft_model = PeftModel.from_pretrained(base_model, "./llama2-imdb-lora")
ft_model.eval()

pipe = pipeline(
    "text-generation",
    model=ft_model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

prompt = "The movie was absolutely wonderful because"
outputs = pipe(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)
print(outputs[0]["generated_text"])

In [None]:
# side by side comparison

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

# ----------------- Load tokenizer -----------------
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

# ----------------- Load base model -----------------
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)
base_model.eval()

pipe_base = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# ----------------- Load fine-tuned model -----------------
ft_model = PeftModel.from_pretrained(base_model, "./llama2-imdb-lora")
ft_model.eval()

pipe_ft = pipeline(
    "text-generation",
    model=ft_model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# ----------------- Test prompts -----------------
prompts = [
    "The movie was absolutely wonderful because",
    "The film was terrible and I would not recommend it because",
]

for prompt in prompts:
    print("="*80)
    print(f"Prompt: {prompt}")
    print("\n--- Base Model Output ---")
    out_base = pipe_base(prompt, max_new_tokens=80, do_sample=True, temperature=0.7)
    print(out_base[0]["generated_text"])

    print("\n--- Fine-tuned Model Output ---")
    out_ft = pipe_ft(prompt, max_new_tokens=80, do_sample=True, temperature=0.7)
    print(out_ft[0]["generated_text"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0
Device set to use cuda:0


Prompt: The movie was absolutely wonderful because

--- Base Model Output ---
The movie was absolutely wonderful because of the people in it. I have never seen such a talented group of actors, with such a good story, in a movie so bad. The plot was so bad that it was good. The acting was so good that it was bad. The acting was so good that it was good. The acting was so bad that it was good. I can't believe they made this movie.

--- Fine-tuned Model Output ---
The movie was absolutely wonderful because of the cast. I loved the chemistry between the leads, the supporting cast was perfect, and the story was really interesting and funny. However, the direction was terrible. The director should have cut out the horrible exposition at the beginning and the end and just focused on the relationship between the two leads. The story was told very well and it was so funny, but the ex
Prompt: The film was terrible and I would not recommend it because

--- Base Model Output ---
The film was terri