<a href="https://colab.research.google.com/github/Danny2173/RAGproject/blob/main/3_Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
# Install the Hugging Face libraries this notebook imports below
# (-q keeps pip's output short)
%pip install -q transformers datasets peft

# Mount Google Drive at /content/drive; the dataset is read from, and the
# fine-tuned T5 adapter is written to, paths under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

# Imports
import os, gc, json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel


## Loading the Generated Question-Answer Dataset

In [None]:
# Location of the generated question-answer examples on the mounted Drive
load_path = "/content/drive/MyDrive/expanded_dataset.json"

# Read the file as UTF-8 text and parse it as JSON in one go
with open(load_path, mode="r", encoding="utf-8") as dataset_file:
    expanded_data = json.loads(dataset_file.read())

print(f"Loaded {len(expanded_data)} examples.")


## Fine-tuning (BART)

In [None]:
# Environment setup: configure the CUDA caching allocator's maximum split size,
# then release whatever memory is no longer referenced before loading the model.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
gc.collect()
torch.cuda.empty_cache()

# Use the GPU when one is visible, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Data Preparation
def flatten_text(text):
    """Return *text* on a single line with whitespace normalized.

    Every run of whitespace (spaces, newlines, tabs, ...) is collapsed to one
    space and leading/trailing whitespace is removed. A single
    ``replace('  ', ' ')`` pass is not enough for this: it leaves a double
    space behind whenever three or more spaces/newlines occur in a row.

    Args:
        text: The string to flatten.

    Returns:
        The flattened string; ``""`` when *text* is empty or all whitespace.
    """
    # str.split() with no argument splits on whitespace runs and drops empties
    return " ".join(text.split())

# Keep only the three fields used for training; the context is flattened
# onto a single line, question and answer are passed through unchanged
reformatted_dataset = [
    dict(
        question=item["question"],
        context=flatten_text(item["context"]),
        answer=item["answer"],
    )
    for item in expanded_data
]

# Wrap the list of records in a Hugging Face Dataset
dataset = Dataset.from_list(reformatted_dataset)

# Load the BART tokenizer and model, then move the model to the chosen device
model_name = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)


def tokenize_bart(example):
    """Tokenize one example into padded model inputs and loss-masked labels.

    Intended for ``Dataset.map(..., batched=False)``: *example* is a single
    row, not a batch.

    Args:
        example: A mapping with string fields ``context``, ``question`` and
            ``answer``.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 512 tokens)
        with an extra ``labels`` entry: the answer's token ids
        (padded/truncated to 128) where every padding id is replaced by -100.
    """
    # NOTE(review): the question comes after the context, so a context long
    # enough to hit the 512-token limit may truncate the question away —
    # confirm the contexts are short enough or that this order is intended.
    input_text = f"Context: {example['context']} Question: {example['question']}"

    model_inputs = tokenizer(
        input_text,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=example["answer"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # -100 marks positions the loss should skip, so padding does not
    # contribute to training.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        token_id if token_id != pad_id else -100
        for token_id in labels["input_ids"]
    ]

    return model_inputs


# Tokenize row by row and drop the raw text columns afterwards
tokenized_dataset = dataset.map(
    tokenize_bart,
    batched=False,
    remove_columns=dataset.column_names,
)

# LoRA configuration: rank-8 adapters on BART's query and value projections
config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report the trainable parameters
model = get_peft_model(model, config)
model.print_trainable_parameters()

# Training hyper-parameters: 3 epochs, no evaluation, no external reporting
training_args = TrainingArguments(
    output_dir="./bart-RAG",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    eval_strategy="no",
    save_total_limit=1,
    logging_steps=10,
    report_to="none",
)

# NOTE(review): newer transformers releases name Trainer's `tokenizer`
# argument `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Run the fine-tuning loop
trainer.train()


## Fine-tuning (T5-Large)

In [None]:
# Environmental setup - ensure sufficient memory
#
# gc.collect() and torch.cuda.empty_cache() can only reclaim memory that
# nothing references any more. If an earlier fine-tuning cell was run, its
# `model` and `trainer` are still bound and keep the previous weights alive,
# so drop those names first. Both are re-assigned later in this cell, and
# pop(..., None) is a no-op when the cell runs in a fresh session.
for _stale_name in ("trainer", "model"):
    globals().pop(_stale_name, None)

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
gc.collect()
torch.cuda.empty_cache()

# Data Preparation

def flatten_text(text):
    """Return *text* on a single line with whitespace normalized.

    Every run of whitespace (spaces, newlines, tabs, ...) is collapsed to one
    space and leading/trailing whitespace is removed. A single
    ``replace('  ', ' ')`` pass is not enough for this: it leaves a double
    space behind whenever three or more spaces/newlines occur in a row.

    Args:
        text: The string to flatten.

    Returns:
        The flattened string; ``""`` when *text* is empty or all whitespace.
    """
    # str.split() with no argument splits on whitespace runs and drops empties
    return " ".join(text.split())

# Build the T5-style text pairs: a "question: ... context: ..." prompt as the
# input and the reference answer as the output
reformatted_dataset = [
    dict(
        input=f"question: {item['question']} context: {flatten_text(item['context'])}",
        output=item["answer"],
    )
    for item in expanded_data
]

# Wrap the list of records in a Hugging Face Dataset
dataset = Dataset.from_list(reformatted_dataset)


# Load the T5 tokenizer; at this point `model` only holds the checkpoint name
model = "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model)

def tokenize(example):
    """Tokenize inputs and targets, masking label padding with -100.

    Works for both a single row and a batch (``Dataset.map(batched=True)``),
    where ``example["input"]`` / ``example["output"]`` are lists of strings.
    Without the batch case, whole sequences would be compared against the pad
    id, nothing would be replaced, and padding would stay in the labels.

    Args:
        example: A mapping with ``input`` and ``output`` entries, each either
            a string (single row) or a list of strings (batch).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with an extra ``labels`` entry: target token ids (padded/truncated to
        128) where every padding id is replaced by -100. ``labels`` is a flat
        list for a single row and a list of lists for a batch.
    """
    # Tokenize input
    model_inputs = tokenizer(
        example["input"],
        max_length=512,
        padding="max_length",
        truncation=True
    )
    # Tokenize target (output); `text_target=` is the supported replacement
    # for the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=example["output"],
        max_length=128,
        padding="max_length",
        truncation=True
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 marks positions the loss should skip
        return [tok if tok != pad_id else -100 for tok in token_ids]

    if isinstance(example["output"], str):
        # Single row: input_ids is one flat list of token ids
        model_inputs["labels"] = _mask_padding(labels["input_ids"])
    else:
        # Batch: input_ids holds one list of token ids per example
        model_inputs["labels"] = [
            _mask_padding(token_ids) for token_ids in labels["input_ids"]
        ]
    return model_inputs

# Tokenize the dataset; the mapping function receives batches of rows
tokenized = dataset.map(tokenize, batched=True)

# Load the base model (`model` still holds the checkpoint name here)
model = AutoModelForSeq2SeqLM.from_pretrained(model)

# LoRA configuration: rank-8 adapters on T5's query ("q") and value ("v") projections
config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report the trainable parameters
model = get_peft_model(model, config)
model.print_trainable_parameters()

# Training hyper-parameters: 3 epochs, no evaluation, no external reporting
training_args = TrainingArguments(
    output_dir="./t5-RAG",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    eval_strategy="no",
    save_total_limit=1,
    logging_steps=10,
    report_to="none",
)

# NOTE(review): newer transformers releases name Trainer's `tokenizer`
# argument `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
)

# Run the fine-tuning loop
trainer.train()

## Saving the Fine-tuned T5-Large Model Parameters

In [None]:
# Write the trained model and its tokenizer to the same Drive folder so they
# can be reloaded together
save_dir = "/content/drive/MyDrive/t5-lora-final"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
# Reload the saved tokenizer, then attach the saved PEFT weights to a freshly
# loaded t5-large base model
adapter_dir = "/content/drive/MyDrive/t5-lora-final"
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
base_model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
model = PeftModel.from_pretrained(base_model, adapter_dir)