In [12]:
# Step 1: Install required libs
!pip install -q transformers datasets peft accelerate bitsandbytes

# Step 2: Imports
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig, get_peft_model,
    TaskType, prepare_model_for_kbit_training
)

# Step 3: Toy dataset
# Four short workplace dialogues; summaries[i] is the reference summary for
# dialogues[i].
dialogues = [
    "Alice: Hey, are we still on for the meeting at 3?\nBob: Yes, but let's make it 3:30.",
    "Manager: Please send the sales report by EOD.\nAnalyst: Sure, working on it now.",
    "Team Lead: Can you update the client on the new timeline?\nDev: Yes, will do it by lunch.",
    "CEO: We should plan for next quarter’s growth.\nCFO: I suggest revisiting the budget first."
]
summaries = [
    "Meeting postponed to 3:30.",
    "Sales report needed by EOD.",
    "Dev will update the client before lunch.",
    "CFO suggests revisiting budget before planning growth."
]

# Fixed seed so the 2/2 train/test split -- and therefore the example picked
# for the inference demo at the end of the cell -- is the same on every run.
SPLIT_SEED = 42
data = Dataset.from_dict({"dialogue": dialogues, "summary": summaries}).train_test_split(
    test_size=0.5, seed=SPLIT_SEED
)

# Step 4: Tokenization
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess(example):
    """Tokenize dialogues and their summaries into model inputs and labels.

    Accepts either a single example or a batch (the form ``Dataset.map``
    passes when ``batched=True``); ``example["dialogue"]`` and
    ``example["summary"]`` are forwarded to the tokenizer unchanged.

    Sequences are truncated (128 tokens for dialogues, 64 for summaries) but
    intentionally NOT padded here. Padding labels to a fixed length with the
    tokenizer's pad id makes every padding position count as a real target,
    so the loss is dominated by "predict <pad>". Leaving the sequences
    unpadded lets ``DataCollatorForSeq2Seq`` pad each batch dynamically and
    fill label padding with -100, which the loss ignores.

    Returns:
        The tokenizer output for the dialogues with an added ``"labels"``
        entry holding the summary token ids.
    """
    inputs = tokenizer(example["dialogue"], truncation=True, max_length=128)
    labels = tokenizer(example["summary"], truncation=True, max_length=64)
    inputs["labels"] = labels["input_ids"]
    return inputs


# Drop the raw text columns so only tokenized features remain.
tokenized = data.map(preprocess, batched=True, remove_columns=["dialogue", "summary"])

# Step 5: Quantization settings and base-model loading
# 8-bit bitsandbytes quantization, passed to from_pretrained through
# `quantization_config`. The fp32 CPU-offload switch is turned on as well.
bnb_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
    load_in_8bit=True,
)

# Load the quantized checkpoint and hand it straight to PEFT's k-bit
# training preparation helper. The device map's empty-string key assigns the
# whole model to the first CUDA device.
model = prepare_model_for_kbit_training(
    AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        device_map={"": "cuda:0"},
        quantization_config=bnb_config,
    )
)

# Step 6: LoRA adapter configuration
# Rank-8 adapters (alpha 16, dropout 0.05, no bias terms) attached to the
# modules named "q" and "v", configured for a sequence-to-sequence LM task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared base model with the adapters, then report how many
# parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Step 7: Trainer setup
import inspect

# The pip install at the top of the cell is unpinned, and two keyword names
# used here were renamed in newer transformers releases:
#   evaluation_strategy -> eval_strategy      (training arguments)
#   tokenizer           -> processing_class   (trainer)
# Pick whichever spelling the installed version accepts so the cell works on
# both sides of the rename.
_args_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
)

_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./lora-results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_strategy="epoch",
    learning_rate=5e-5,
    logging_dir="./logs",
    report_to="none",
    fp16=True,
    # Evaluate once per epoch, matching the checkpoint schedule above.
    **{_eval_strategy_key: "epoch"},
)

# Pads each batch at collation time; the model is passed so the collator can
# use it when preparing batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

# Step 8: Train
trainer.train()

# Step 9: Inference example
# Summarize the first held-out dialogue with the fine-tuned model.
example = data["test"][0]["dialogue"]

# Switch to inference mode so dropout (including the 0.05 LoRA dropout) is
# not applied while generating.
model.eval()

# Move the inputs to wherever the model actually lives instead of assuming a
# device name.
inputs = tokenizer(example, return_tensors="pt", padding=True, truncation=True).to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print("DIALOGUE:\n", example)
print("\nPREDICTED SUMMARY:\n", tokenizer.decode(output[0], skip_special_tokens=True))


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
