# LoRA Fine-tuning with Multi-Dataset Combination

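This notebook fine-tunes a Seq2Seq model (`t5-small`) with LoRA adapters on two Hugging Face datasets — a fitness chat dataset and a code-instruction dataset. Each instruction is given a task prefix, and the two datasets are then concatenated into a single training set.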

In [None]:
!pip install torch transformers datasets peft -q

## 1) Imports & LoRA Config

In [None]:
import os, shutil
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model

# Checkpoint shared by the tokenizer and the base model.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRA hyperparameters: rank-16 adapters (alpha 32, dropout 0.05) injected
# into the modules named "q" and "v"; bias terms are left untouched.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Load the pretrained Seq2Seq weights and wrap them with the LoRA adapters.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
    lora_config,
)

## 2) Load and Combine Datasets

In [None]:
def _pick_column(column_names, candidates):
    """Return the first name from *candidates* that exists in *column_names*.

    Raises:
        KeyError: if none of the candidate names is present.
    """
    for name in candidates:
        if name in column_names:
            return name
    raise KeyError(
        f"none of {list(candidates)} found in dataset columns {list(column_names)}"
    )


def _as_instruction_output(dataset, prefix):
    """Project *dataset* onto exactly two columns: ``instruction`` and ``output``.

    The source/target columns are looked up by name so that datasets using
    different schemas can be combined. *prefix* is prepended to every
    instruction so the model can tell the tasks apart. All original columns
    are dropped; `concatenate_datasets` needs both datasets to expose the
    same features.
    """
    source_col = _pick_column(
        dataset.column_names, ("instruction", "query", "prompt")
    )
    target_col = _pick_column(
        dataset.column_names, ("output", "answer", "completion")
    )
    return dataset.map(
        lambda row: {
            "instruction": prefix + row[source_col],
            "output": row[target_col],
        },
        remove_columns=dataset.column_names,
    )


dataset1 = load_dataset("chibbss/fitness-chat-prompt-completion-dataset")["train"]
dataset2 = load_dataset("m-a-p/CodeFeedback-Filtered-Instruction")["train"]

# Add task prefixes and normalize both datasets to the same two columns.
# NOTE(review): the CodeFeedback dataset card lists `query`/`answer` columns
# rather than `instruction`/`output` — confirm against the dataset viewer.
dataset1 = _as_instruction_output(dataset1, "fitness task: ")
dataset2 = _as_instruction_output(dataset2, "code task: ")

combined_dataset = concatenate_datasets([dataset1, dataset2])

## 3) Tokenization

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples for Seq2Seq training.

    Encodes ``examples["instruction"]`` as the model inputs and
    ``examples["output"]`` as the targets, both truncated to 512 tokens.
    The target token ids are stored under the ``"labels"`` key of the
    returned encoding.
    """
    tokenize_kwargs = {"max_length": 512, "truncation": True}
    encoded_inputs = tokenizer(examples["instruction"], **tokenize_kwargs)
    encoded_targets = tokenizer(examples["output"], **tokenize_kwargs)
    encoded_inputs["labels"] = encoded_targets["input_ids"]
    return encoded_inputs


# Tokenize the whole combined dataset in batches.
tokenized_dataset = combined_dataset.map(
    preprocess_function,
    batched=True,
)

## 4) Training Arguments

In [None]:
# Hyperparameters and bookkeeping for the Seq2Seq trainer.
# `resume_from_checkpoint` is intentionally not set here: the Trainer does not
# read it from TrainingArguments. Resuming has to be requested through
# `trainer.train(resume_from_checkpoint=...)`.
training_args = Seq2SeqTrainingArguments(
    output_dir="lora-multitask-output",
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=5,  # keep at most five checkpoints on disk
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=50,
)

## 5) Train Model

In [None]:
from transformers.trainer_utils import get_last_checkpoint

# Pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Hold out a shuffled evaluation split. Evaluating on the first rows of the
# training set would leak training data into the metric and, because the
# datasets are concatenated in order, would only ever cover the first task.
eval_size = min(200, max(1, len(tokenized_dataset) // 10))
split = tokenized_dataset.train_test_split(
    test_size=eval_size,
    shuffle=True,
    seed=42,  # fixed seed keeps the split stable across restarts
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Resume from the newest checkpoint in the output directory when one exists;
# otherwise start from scratch. The directory check avoids an error on the
# first run, before any checkpoint has been written.
checkpoint_dir = training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
)

trainer.train(resume_from_checkpoint=last_checkpoint)

## 6) Save Model

In [None]:
# Write the LoRA-wrapped model first, then the tokenizer, into one directory
# so both can be reloaded from the same path.
save_dir = "lora-multitask-model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)