<a href="https://colab.research.google.com/github/Danny2173/RAGproject/blob/main/3_Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install nbstripout
!nbstripout /content/drive/MyDrive/3_Fine_tuning.ipynb



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, gc, torch

from datasets import Dataset
from transformers import AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer
import json

In [None]:
load_path = "/content/drive/MyDrive/expanded_dataset.json"

# Load the JSON file
with open(load_path, "r", encoding="utf-8") as f:
    expanded_data = json.load(f)

print(f"Loaded {len(expanded_data)} examples.")


Loaded 5121 examples.


In [None]:
# Environmental setup - ensure sufficient memory
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
gc.collect()
torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Preparation
def flatten_text(text):
    return text.replace('\n', ' ').replace('  ', ' ').strip()

# Creating input/output format
reformatted_dataset = [
    {
        "question": item["question"],
        "context": flatten_text(item["context"]),
        "answer": item["answer"]
    }
    for item in expanded_data
]

# Converting list to dataset
dataset = Dataset.from_list(reformatted_dataset)

# Tokenizing
model_name = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


def tokenize_bart(example):
    input_text = f"Context: {example['context']} Question: {example['question']}"

    model_inputs = tokenizer(
        input_text,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["answer"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )

    model_inputs["labels"] = [
        label if label != tokenizer.pad_token_id else -100
        for label in labels["input_ids"]
    ]

    return model_inputs


tokenized_dataset = dataset.map(tokenize_bart, batched=False, remove_columns=dataset.column_names)

# Setting up LoRA model
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

# Setting training arguments
training_args = TrainingArguments(
    output_dir="./bart-RAG",
    eval_strategy="no",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_steps=10,
    logging_dir="./logs",
    report_to="none",
)

# Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)


trainer.train()


Map:   0%|          | 0/5121 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 1,179,648 || all params: 407,471,104 || trainable%: 0.2895


Step,Training Loss
10,1.935
20,1.888
30,1.6678
40,1.8924
50,1.5534
60,1.3942
70,1.4281
80,1.1892
90,1.2777
100,1.0586


TrainOutput(global_step=3843, training_loss=0.5770015273279056, metrics={'train_runtime': 2865.4231, 'train_samples_per_second': 5.362, 'train_steps_per_second': 1.341, 'total_flos': 1.6702287648915456e+16, 'train_loss': 0.5770015273279056, 'epoch': 3.0})

In [None]:
# Environmental setup - ensure sufficient memory

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
gc.collect()
torch.cuda.empty_cache()

# Data Preparation

def flatten_text(text):
    return text.replace('\n', ' ').replace('  ', ' ').strip()

# Creating input/output format
reformatted_dataset = [
    {
        "input": f"question: {item['question']} context: {flatten_text(item['context'])}",
        "output": item["answer"]
    }
    for item in expanded_data
]

# Converting list to dataset
dataset = Dataset.from_list(reformatted_dataset)


# Tokenizing
model = "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model)

def tokenize(example):
    # Tokenize input
    model_inputs = tokenizer(
        example["input"],
        max_length=512,
        padding="max_length",
        truncation=True
    )
    # Tokenize target (output)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["output"],
            max_length=128,
            padding="max_length",
            truncation=True
        )

    # Transforming padded token positions
    model_inputs["labels"] = [
        l if l != tokenizer.pad_token_id else -100 for l in labels["input_ids"]
    ]
    return model_inputs

tokenized = dataset.map(tokenize, batched=True)

# Setting up LoRA model

model = AutoModelForSeq2SeqLM.from_pretrained(model)

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

# Setting training arguments

training_args = TrainingArguments(
    output_dir="./t5-RAG",
    eval_strategy="no",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_steps=10,
    logging_dir="./logs",
    report_to="none",
)

# Training

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/5121 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 2,359,296 || all params: 740,027,392 || trainable%: 0.3188


  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,19.996
20,18.8488
30,18.7921
40,18.3705
50,17.6605
60,16.3084
70,14.9604
80,12.9432
90,11.2303
100,10.0888


TrainOutput(global_step=3843, training_loss=0.615073929044253, metrics={'train_runtime': 5612.9924, 'train_samples_per_second': 2.737, 'train_steps_per_second': 0.685, 'total_flos': 3.3372968904032256e+16, 'train_loss': 0.615073929044253, 'epoch': 3.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/t5-lora-final")
tokenizer.save_pretrained("/content/drive/MyDrive/t5-lora-final")


('/content/drive/MyDrive/bart-lora-final/tokenizer_config.json',
 '/content/drive/MyDrive/bart-lora-final/special_tokens_map.json',
 '/content/drive/MyDrive/bart-lora-final/vocab.json',
 '/content/drive/MyDrive/bart-lora-final/merges.txt',
 '/content/drive/MyDrive/bart-lora-final/added_tokens.json',
 '/content/drive/MyDrive/bart-lora-final/tokenizer.json')

In [None]:
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/t5-lora-final")
base_model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/t5-lora-final")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]