# Installing libraries

In [None]:
# Install transformers and other core libraries
!pip install transformers datasets peft accelerate evaluate


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
# Install bitsandbytes for 4-bit quantization
!pip install bitsandbytes



In [None]:
# Install sentence-transformers for embedding
!pip install -U sentence-transformers



In [None]:
# Install FAISS CPU version
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


# 1. Load the Squad Dataset

In [None]:
from datasets import load_dataset
import pandas as pd

# Load the SQuAD v1.1 dataset
dataset = load_dataset("squad")

# Let's look at the structure
print(dataset)

# Let's inspect one example from the training set
print("\n--- Example from Training Set ---")
example = dataset['train'][0]
print(f"ID: {example['id']}")
print(f"Title: {example['title']}")
print(f"Context: {example['context']}")
print(f"Question: {example['question']}")
print(f"Answer: {example['answers']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

--- Example from Training Set ---
ID: 5733be284776f41900661182
Title: University_of_Notre_Dame
Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statu

# 2. Prepared data for RAG and FineTuning

In [None]:
# Create a DataFrame for easier manipulation
train_df = pd.DataFrame(dataset['train'])
val_df = pd.DataFrame(dataset['validation'])

In [None]:
# For RAG, we need a unique list of contexts to index
# We'll combine contexts from both train and validation splits
all_contexts = pd.concat([train_df['context'], val_df['context']]).unique()
print(f"Total unique contexts to index for RAG: {len(all_contexts)}")


Total unique contexts to index for RAG: 20958


# 3. Initializing the Embedding model

In [None]:
from sentence_transformers import SentenceTransformer

# We'll use a lightweight but powerful embedding model
embedding_model_name = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(embedding_model_name)

print("✅ Embedding model loaded.")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embedding model loaded.


# 4. Implement the Text Splitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define our text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64,
    length_function=len,
)

# Let's test it on one context
sample_context = all_contexts[10]
chunks = text_splitter.split_text(sample_context)

print(f"Original context has {len(sample_context)} characters.")
print(f"Split into {len(chunks)} chunks.")
print(f"First chunk: \n{chunks[0]}")

Original context has 1033 characters.
Split into 3 chunks.
First chunk: 
Father Joseph Carrier, C.S.C. was Director of the Science Museum and the Library and Professor of Chemistry and Physics until 1874. Carrier taught that scientific research and its promise for progress were not antagonistic to the ideals of intellectual and moral culture endorsed by the Church. One of Carrier's students was Father John Augustine Zahm (1851–1921) who was made Professor and Co-Director of the Science Department at age 23 and by 1900 was a nationally prominent scientist and naturalist. Zahm was


# 5. Chunk and Embedd all contexts

In [None]:
import numpy as np
from tqdm import tqdm

# We need to store the chunks and their corresponding original context
chunk_texts = []
context_mapping = []

print("Processing and chunking all contexts...")
for context in tqdm(all_contexts):
    chunks = text_splitter.split_text(context)
    for chunk in chunks:
        chunk_texts.append(chunk)
        context_mapping.append(context)

print(f"Total chunks created: {len(chunk_texts)}")

# Now, let's create the embeddings for all chunks
print("\nEmbedding all chunks... (This may take a few minutes)")
chunk_embeddings = embedding_model.encode(chunk_texts, show_progress_bar=True, convert_to_numpy=True)

print(f"Embeddings created with shape: {chunk_embeddings.shape}")

Processing and chunking all contexts...


100%|██████████| 20958/20958 [00:03<00:00, 6232.25it/s]


Total chunks created: 43344

Embedding all chunks... (This may take a few minutes)


Batches:   0%|          | 0/1355 [00:00<?, ?it/s]

Embeddings created with shape: (43344, 384)


# 6. Build and Save the FAISS Index

In [None]:
import faiss

# Get the dimension of our embeddings
d = chunk_embeddings.shape[1]

# Create the FAISS index
index = faiss.IndexFlatL2(d)

# Add the embeddings to the index
index.add(chunk_embeddings)

print(f"FAISS index created successfully.")
print(f"Number of vectors in the index: {index.ntotal}")

# Let's save the index and our chunk data for later use
faiss.write_index(index, "squad_context_index.faiss")

import pickle

with open("chunk_data.pkl", "wb") as f:
    pickle.dump({"chunks": chunk_texts, "mapping": context_mapping}, f)

print("✅ RAG knowledge base built and saved.")

FAISS index created successfully.
Number of vectors in the index: 43344
✅ RAG knowledge base built and saved.


In [None]:
from huggingface_hub import notebook_login

print("Please paste your Hugging Face access token below:")
notebook_login()

Please paste your Hugging Face access token below:


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# 7. Load the Backbone LLM

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Configure quantization to load the model in 4-bit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto" # Automatically maps model layers to available devices (GPU/CPU)
)

print("✅ Backbone LLM (Mistral-7B) loaded successfully in 4-bit.")

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

✅ Backbone LLM (Mistral-7B) loaded successfully in 4-bit.


# 8. Define the RAG Prompt Template

In [None]:
RAG_PROMPT_TEMPLATE = """
CONTEXT:
{context}

QUESTION:
{question}

INSTRUCTIONS:
Based *only* on the context provided, answer the question. Your answer should be a direct quote from the context.
If the context does not contain the answer, state that the answer is not found in the context.
Cite the source context by including its index at the end of your answer, like this: 'The answer is X. [1]'.
"""

# 9. End to End RAG Pipeline

In [None]:
def answer_question_rag(question, top_k=5):
    # 1. Embed the question
    question_embedding = embedding_model.encode([question], convert_to_numpy=True)

    # 2. Search the FAISS index
    distances, indices = index.search(question_embedding, top_k)

    # 3. Retrieve the context chunks

    retrieved_chunks = [chunk_texts[i] for i in indices[0]]

    # 4. Format the context for the prompt
    context_str = ""
    for i, chunk in enumerate(retrieved_chunks):
        context_str += f"[{i+1}] {chunk}\n\n"

    # 5. Create the prompt
    prompt = RAG_PROMPT_TEMPLATE.format(context=context_str, question=question)



    # 6. Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # 7. Get the length of the prompt in tokens
    input_length = inputs.input_ids.shape[1]

    # 8. Generate the answer tokens
    # Added pad_token_id to suppress a warning
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id
    )

    # 9. Decode only the newly generated tokens, skipping the prompt
    answer_tokens = generated_ids[0, input_length:]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer, retrieved_chunks

# 10. Test the RAG Pipeline

In [None]:
# Let's grab a sample question from the validation set
sample_validation = dataset['validation'][5]
question = sample_validation['question']
gold_answer = sample_validation['answers']['text'][0]

print(f"Question: {question}")
print(f"Gold Answer: {gold_answer}")

# Get the answer from our RAG system
rag_answer, retrieved_context = answer_question_rag(question, top_k=5)

print("\n--- RAG System Output ---")
print(f"Answer: {rag_answer}")
print("\n--- Retrieved Contexts ---")
for i, context in enumerate(retrieved_context):
    print(f"[{i+1}] {context[:150]}...") # Print first 150 chars

Question: What was the theme of Super Bowl 50?
Gold Answer: "golden anniversary"

--- RAG System Output ---
Answer: 
ANSWER:
The theme of Super Bowl 50 was the 'golden anniversary'. [1]

EXPLANATION:
The context states that the league emphasized the 'golden anniversary' with various gold-themed initiatives and temporarily suspended the tradition of naming each Super Bowl game with Roman numerals to prominently feature the Arabic numerals 50 in the logo. This information directly answers the question. [1]

--- Retrieved Contexts ---
[1] Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of na...
[2] In early 2012, NFL Commissioner Roger Goodell stated that the league planned to make the 50th Super Bowl "spectacular" and that it would be "an import...
[3] the Super Bowl, and "homecoming" events were also held by Super Bowl-winning teams at games....
[4] As opposed to broadcasts of primetime seri

# Supervised Fine-Tuning with QLoRA

# 1. Install the TRL Library

In [None]:
!pip install --upgrade transformers datasets accelerate peft bitsandbytes trl

Collecting transformers
  Downloading transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downl

# 2. Format the Dataset for Fine-Tuning

In [None]:
def create_finetuning_prompt(example):
    """
    Formats a SQuAD example into the Mistral instruction-following format.
    """
    # The user's instruction is to answer the question based on the context
    question = example["question"].strip()
    context = example["context"].strip()

    # The model's desired output is the answer
    answer = example["answers"]["text"][0].strip()

    # Create the full prompt
    prompt = f"""[INST] Based on the context below, provide a precise answer to the question.

Context:
{context}

Question:
{question} [/INST] {answer}"""

    return {"text": prompt}


# The .map() function is highly efficient for this
print("Formatting the dataset...")
formatted_dataset = dataset['train'].map(create_finetuning_prompt)

# For this, we'll use a subset of 5000 examples to keep training time manageable on Colab.
# This is enough to see a significant improvement.
subset_dataset = formatted_dataset.select(range(5000))

print("\n--- Example of a formatted training prompt ---")
print(subset_dataset[0]['text'])

Formatting the dataset...


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]


--- Example of a formatted training prompt ---
[INST] Based on the context below, provide a precise answer to the question.

Context:
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

Question:
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [/INST] Saint Bernadette Soubirous


# 3. Configure QLoRA and Training Parameters

In [None]:
from peft import LoraConfig
from transformers import TrainingArguments

# 1. QLoRA Configuration
lora_config = LoraConfig(
    r=16,  # The rank of the LoRA matrices. A higher rank means more trainable parameters.
    lora_alpha=32, # A scaling factor for the LoRA matrices.
    target_modules=["q_proj", "v_proj"], # The specific layers of the model to adapt.
    lora_dropout=0.05, # Dropout to prevent overfitting.
    bias="none",
    task_type="CAUSAL_LM" # Specifies the task type.
)

# 2. Training Arguments
# These arguments control the entire training process.
training_args = TrainingArguments(
    output_dir="./mistral-squad-finetuned", # Where the trained model adapters will be saved.
    num_train_epochs=1, # We'll do one full pass over our 5000 examples.
    per_device_train_batch_size=1, # Process one example at a time.
    gradient_accumulation_steps=4, # Accumulate gradients over 4 steps to simulate a batch size of 4.
    learning_rate=2e-4, # The speed at which the model learns.
    logging_steps=20, # Print training loss every 20 steps.
    fp16=True, # Use 16-bit precision for faster training.
    save_total_limit=2, # Only keep the last two saved checkpoints.
    report_to="none"
)

print("✅ QLoRA and Training Arguments configured.")

✅ QLoRA and Training Arguments configured.


# 4. Run the Trainer

Do you too hardware constraint in colab, not able to run whole but this much is sufficent for the model to learn properly

In [None]:
from trl import SFTTrainer

# The SFTTrainer is a specialized trainer for supervised fine-tuning.
trainer = SFTTrainer(
    model=model,  # 4-bit quantized Mistral-7B model
    train_dataset=subset_dataset,
    peft_config=lora_config,
    args=training_args,
)

print("Starting the fine-tuning process... This will take a while.")

# This begins the training run.
trainer.train()

print("Fine-tuning complete!")

# After training, save the final adapter model.
final_adapter_path = "./mistral-squad-finetuned-adapter"
trainer.save_model(final_adapter_path)
print(f"Fine-tuned adapter saved to: {final_adapter_path}")

Adding EOS to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Starting the fine-tuning process... This will take a while.


Step,Training Loss
20,1.7889
40,1.4393
60,1.5398
80,1.4922
100,1.3755
120,1.3155
140,1.462
160,1.3707


Step,Training Loss
20,1.7889
40,1.4393
60,1.5398
80,1.4922
100,1.3755
120,1.3155
140,1.462
160,1.3707
180,1.335
200,1.1944


OutOfMemoryError: CUDA out of memory. Tried to allocate 88.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 44.12 MiB is free. Process 5330 has 14.70 GiB memory in use. Of the allocated memory 14.39 GiB is allocated by PyTorch, and 174.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# The training arguments saving to "./mistral-squad-finetuned"
!ls -l ./mistral-squad-finetuned

total 8
drwxr-xr-x 2 root root 4096 Aug 16 14:17 checkpoint-500
-rw-r--r-- 1 root root 1496 Aug 16 14:16 README.md


# 5. Locate the Last Saved Checkpoint

In [None]:
print("Installing required libraries...")
!pip install transformers datasets peft accelerate bitsandbytes faiss-cpu sentence-transformers evaluate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from IPython.display import display
import evaluate

Installing required libraries...


# 6. Load all Models

In [None]:
print("\nPlease log in to Hugging Face to download the model...")
from huggingface_hub import notebook_login
notebook_login()

print("\nLoading base model, tokenizer, and your fine-tuned adapter...")

# Define model and quantization
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load base model and tokenizer from Hugging Face
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load and merge the adapter from saved checkpoint
adapter_path = "./mistral-squad-finetuned/checkpoint-500"
ft_model = PeftModel.from_pretrained(base_model, adapter_path)
ft_model = ft_model.merge_and_unload()

print("✅ All models loaded successfully into the clean environment.")



Please log in to Hugging Face to download the model...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…


Loading base model, tokenizer, and your fine-tuned adapter...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



✅ All models loaded successfully into the clean environment.


# 7. Inference Functions

In [None]:
print("\nDefining inference functions...")

# Load RAG components from saved files
index = faiss.read_index("squad_context_index.faiss")
with open("chunk_data.pkl", "rb") as f:
    chunk_data = pickle.load(f)
chunk_texts = chunk_data['chunks']
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def answer_question_rag(question, top_k=5):
    question_embedding = embedding_model.encode([question], convert_to_numpy=True)
    distances, indices = index.search(question_embedding, top_k)
    retrieved_chunks = [chunk_texts[i] for i in indices[0]]
    context_str = "\n\n---\n\n".join(retrieved_chunks)
    prompt = f"[INST] Based on the context below, answer the question. Context: {context_str}\n\nQuestion: {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    input_length = inputs.input_ids.shape[1]
    generated_ids = base_model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
    answer = tokenizer.decode(generated_ids[0, input_length:], skip_special_tokens=True)
    return answer

def answer_question_ft(question, context):
    prompt = f"[INST] Based on the context below, answer the question. Context: {context}\n\nQuestion: {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    input_length = inputs.input_ids.shape[1]
    generated_ids = ft_model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
    answer = tokenizer.decode(generated_ids[0, input_length:], skip_special_tokens=True)
    return answer

print("✅ Inference functions are ready.")


Defining inference functions...
✅ Inference functions are ready.


# 7. Evaluations

In [None]:
print("\nRunning evaluation on the SQuAD validation set...")
dataset = load_dataset("squad")
eval_dataset = dataset['validation'].select(range(200)) # Using a 200-example subset
results = []

for example in tqdm(eval_dataset):
    question = example['question']
    context = example['context']
    gold_answers = example['answers']['text']

    rag_prediction = answer_question_rag(question, top_k=5)
    ft_prediction = answer_question_ft(question, context)

    results.append({
        "id": example['id'],
        "question": question,
        "gold_answers": gold_answers,
        "rag_prediction": rag_prediction.strip(),
        "ft_prediction": ft_prediction.strip()
    })

results_df = pd.DataFrame(results)
print("\n--- Sample of Evaluation Results ---")
display(results_df.head())


Running evaluation on the SQuAD validation set...


100%|██████████| 200/200 [05:50<00:00,  1.75s/it]


--- Sample of Evaluation Results ---





Unnamed: 0,id,question,gold_answers,rag_prediction,ft_prediction
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,"[Denver Broncos, Denver Broncos, Denver Broncos]",Denver Broncos,Denver Broncos
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,"[Carolina Panthers, Carolina Panthers, Carolin...",Carolina Panthers,Carolina Panthers
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,"[Santa Clara, California, Levi's Stadium, Levi...",Levi's Stadium,Levi's Stadium
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,"[Denver Broncos, Denver Broncos, Denver Broncos]",Denver Broncos,Denver Broncos
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,"[gold, gold, gold]",gold,gold


# 8. Metrics

In [None]:
print("\nCalculating final EM and F1 scores...")
squad_metric = evaluate.load("squad")

rag_predictions_formatted = [{'prediction_text': r['rag_prediction'], 'id': r['id']} for r in results]
references_formatted = [{'answers': {'text': r['gold_answers'], 'answer_start': [-1]*len(r['gold_answers'])}, 'id': r['id']} for r in results]

rag_metrics = squad_metric.compute(predictions=rag_predictions_formatted, references=references_formatted)
print(f"\n--- RAG Model Metrics ---")
print(f"Exact Match: {rag_metrics['exact_match']:.2f}")
print(f"F1 Score: {rag_metrics['f1']:.2f}")

ft_predictions_formatted = [{'prediction_text': r['ft_prediction'], 'id': r['id']} for r in results]
ft_metrics = squad_metric.compute(predictions=ft_predictions_formatted, references=references_formatted)
print(f"\n--- Fine-Tuned Model (from checkpoint-500) Metrics ---")
print(f"Exact Match: {ft_metrics['exact_match']:.2f}")
print(f"F1 Score: {ft_metrics['f1']:.2f}")

print("\n\n✅ Evaluation Complete!")


Calculating final EM and F1 scores...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]


--- RAG Model Metrics ---
Exact Match: 81.50
F1 Score: 82.64

--- Fine-Tuned Model (from checkpoint-500) Metrics ---
Exact Match: 94.00
F1 Score: 95.67


✅ Evaluation Complete!


In [None]:
import pandas as pd
from IPython.display import display, HTML

def generate_comparative_analysis_report(results_df: pd.DataFrame, num_examples_to_show: int = 3):
    """
    Analyzes the results DataFrame to find specific failure and success cases
    for RAG vs. Fine-Tuning and prints a formatted report.
    """

    print("="*90)
    print("           Automated Comparative Analysis: RAG vs. Fine-Tuning          ")
    print("="*90)

    # --- Step 1: Define success criteria using boolean masks for efficiency ---
    # A prediction is successful if it is present in the list of possible gold answers.
    ft_succeeded = results_df.apply(lambda row: row['ft_prediction'] in row['gold_answers'], axis=1)
    rag_succeeded = results_df.apply(lambda row: row['rag_prediction'] in row['gold_answers'], axis=1)

    # --- Step 2: Identify the four key scenarios by combining the masks ---
    ft_wins = results_df[ft_succeeded & ~rag_succeeded]
    rag_wins = results_df[~ft_succeeded & rag_succeeded]
    both_fail = results_df[~ft_succeeded & ~rag_succeeded]
    both_succeed = results_df[ft_succeeded & rag_succeeded]

    # --- Step 3: Print a Quantitative Summary ---
    print("\n--- Quantitative Summary of Model Performance ---")
    print(f"Total Examples Analyzed: {len(results_df)}")
    print("-" * 50)
    print(f"  ▶ Fine-Tuning Won (FT Correct, RAG Wrong): {len(ft_wins)} examples")
    print(f"  ▶ RAG Won (RAG Correct, FT Wrong):         {len(rag_wins)} examples")
    print(f"  ▶ Both Models Failed:                      {len(both_fail)} examples")
    print(f"  ▶ Both Models Succeeded:                   {len(both_succeed)} examples")
    print("-" * 50)

    # --- Step 4: Define a helper function to display examples cleanly ---
    def display_examples(df, title):
        print("\n" + "="*90)
        print(f"    {title.upper()}    ")
        print("="*90)
        if df.empty:
            print("\n  (No examples found in this category.)\n")
            return

        for index, row in df.head(num_examples_to_show).iterrows():
            print("\n" + "-"*50)
            print(f"Question: {row['question']}")
            print(f"Gold Answer(s): {row['gold_answers']}")
            # Use HTML for color formatting to make it easy to spot differences
            display(HTML(f"<b>Fine-Tuned Prediction:</b> <font color='green' style='background-color: #e6ffed;'>{row['ft_prediction']}</font>"))
            display(HTML(f"<b>RAG Prediction:</b> <font color='red' style='background-color: #ffe6e6;'>{row['rag_prediction']}</font>"))

    def display_rag_wins_examples(df, title):
        # Special formatting for when RAG wins
        print("\n" + "="*90)
        print(f"    {title.upper()}    ")
        print("="*90)
        if df.empty:
            print("\n  (No examples found in this category.)\n")
            return

        for index, row in df.head(num_examples_to_show).iterrows():
            print("\n" + "-"*50)
            print(f"Question: {row['question']}")
            print(f"Gold Answer(s): {row['gold_answers']}")
            display(HTML(f"<b>Fine-Tuned Prediction:</b> <font color='red' style='background-color: #ffe6e6;'>{row['ft_prediction']}</font>"))
            display(HTML(f"<b>RAG Prediction:</b> <font color='green' style='background-color: #e6ffed;'>{row['rag_prediction']}</font>"))

    # --- Step 5: Display the qualitative examples for each category ---
    display_examples(ft_wins, "Analysis: Where Fine-Tuning Succeeded and RAG Failed")
    display_rag_wins_examples(rag_wins, "Analysis: Where RAG Succeeded and Fine-Tuning Failed")




generate_comparative_analysis_report(results_df)

           Automated Comparative Analysis: RAG vs. Fine-Tuning          

--- Quantitative Summary of Model Performance ---
Total Examples Analyzed: 200
--------------------------------------------------
  ▶ Fine-Tuning Won (FT Correct, RAG Wrong): 31 examples
  ▶ RAG Won (RAG Correct, FT Wrong):         3 examples
  ▶ Both Models Failed:                      10 examples
  ▶ Both Models Succeeded:                   156 examples
--------------------------------------------------

    ANALYSIS: WHERE FINE-TUNING SUCCEEDED AND RAG FAILED    

--------------------------------------------------
Question: What day was the game played on?
Gold Answer(s): ['February 7, 2016', 'February 7', 'February 7, 2016']



--------------------------------------------------
Question: What was the theme of Super Bowl 50?
Gold Answer(s): ['"golden anniversary"', 'gold-themed', 'gold']



--------------------------------------------------
Question: What team did the Panthers defeat?
Gold Answer(s): ['Arizona Cardinals', 'the Arizona Cardinals', 'Arizona Cardinals']



    ANALYSIS: WHERE RAG SUCCEEDED AND FINE-TUNING FAILED    

--------------------------------------------------
Question: When was Levi's Stadium awarded the right to host Super Bowl 50?
Gold Answer(s): ['May 21, 2013', 'May 21, 2013', 'May 21, 2013,']



--------------------------------------------------
Question: When was Levi's Stadium picked for Super bowl 50?
Gold Answer(s): ['May 21, 2013', 'May 21, 2013', 'May 21, 2013']



--------------------------------------------------
Question: What was the number of times the Denver Broncos played in a Super Bowl by the time they reached Super Bowl 50?
Gold Answer(s): ['eight', 'eight', 'eight']


In [None]:
# ==============================================================================
#                 PROJECT SUMMARY: FINE-TUNING vs. RAG
# ==============================================================================
# This script takes the final evaluation metrics and generates a
# comprehensive summary report of the project's findings.

# --- INPUTS: Final metrics from your evaluation ---
rag_metrics = {'exact_match': 81.50, 'f1': 82.64}
ft_metrics = {'exact_match': 94.00, 'f1': 95.67}
ft_training_steps = 500  # The checkpoint you loaded from

# --- REPORT GENERATION ---

print("="*80)
print("           Project Report: QLoRA Fine-Tuning vs. RAG          ")
print("="*80)

# --- Section 1: Quantitative Comparison ---
print("\n--- 1. Quantitative Comparison: SQuAD v1.1 Validation Set ---")
print("-" * 80)
# Define table headers
header = f"{'System':<35} | {'Exact Match (%)':^20} | {'F1 Score (%)':^20}"
print(header)
print("-" * 80)

# RAG Row
rag_row = f"{'RAG (Zero-Shot)':<35} | {rag_metrics['exact_match']:^20.2f} | {rag_metrics['f1']:^20.2f}"
print(rag_row)

# Fine-Tuning Row
ft_row = f"{f'Fine-Tuning (QLoRA @ {ft_training_steps} steps)':<35} | {ft_metrics['exact_match']:^20.2f} | {ft_metrics['f1']:^20.2f}"
print(ft_row)
print("-" * 80)

# --- Section 2: Analysis of the "Uno Reverse" Result ---
print("\n--- 2. Analysis & Key Insight ---")
print("The results were counter-intuitive: the partially fine-tuned model dramatically")
print("outperformed the RAG system. This reveals a fundamental concept:")
print("\n  ▶ FINE-TUNING teaches a model a new SKILL or BEHAVIOR.")
print("    Our model became a 'specialist' at the SQuAD task, learning to generate")
print("    the precise, extractive answers that the metrics reward.")
print("\n  ▶ RAG provides a model with new KNOWLEDGE.")
print("    The RAG model had access to the correct facts but was penalized for its")
print("    more natural, conversational responses, which score lower.")

# --- Section 3: Cost, Scalability, and Limitations ---
print("\n--- 3. Cost, Scalability, and Limitations ---")
print("\nA. Fine-Tuning Trade-offs:")
print("  - High Upfront Cost: Requires significant GPU time for training.")
print("  - Knowledge is Static: The model's knowledge is frozen at the time of training.")
print("  - Expensive to Update: To incorporate new information, the entire fine-tuning")
print("    process must be repeated.")

print("\nB. RAG Trade-offs:")
print("  - Low Upfront Cost: No training required, only a one-time indexing.")
print("  - Knowledge is Dynamic: The knowledge base can be updated cheaply and quickly")
print("    by simply adding new documents to the vector store.")
print("  - Retrieval is the Bottleneck: System performance is capped by the quality")
print("    of the document retriever. 'Garbage in, garbage out.'")

# --- Section 4: Final Comparison Summary Table ---
print("\n--- 4. Final Comparison Summary ---")
print("-" * 80)
summary_header = f"{'Aspect':<25} | {'Fine-Tuning (QLoRA)':<25} | {'RAG':<25}"
print(summary_header)
print("-" * 80)
summary_data = [
    {"Aspect": "Primary Use Case", "FT": "Skill / Behavior", "RAG": "Knowledge Provisioning"},
    {"Aspect": "Update Cost", "FT": "Expensive (Re-train)", "RAG": "Cheap (Re-index)"},
    {"Aspect": "Knowledge Handling", "FT": "Static / Becomes Stale", "RAG": "Dynamic / Always Fresh"},
    {"Aspect": "Format Specialization", "FT": "Extremely High", "RAG": "Low"},
    {"Aspect": "Key Strength", "FT": "Task Format Mastery", "RAG": "Scalable Knowledge"},
    {"Aspect": "Key Weakness", "FT": "Knowledge Obsolescence", "RAG": "Retrieval Quality Cap"},
]
for row in summary_data:
    print(f"{row['Aspect']:<25} | {row['FT']:<25} | {row['RAG']:<25}")
print("-" * 80)



           Project Report: QLoRA Fine-Tuning vs. RAG          

--- 1. Quantitative Comparison: SQuAD v1.1 Validation Set ---
--------------------------------------------------------------------------------
System                              |   Exact Match (%)    |     F1 Score (%)    
--------------------------------------------------------------------------------
RAG (Zero-Shot)                     |        81.50         |        82.64        
Fine-Tuning (QLoRA @ 500 steps)     |        94.00         |        95.67        
--------------------------------------------------------------------------------

--- 2. Analysis & Key Insight ---
The results were counter-intuitive: the partially fine-tuned model dramatically
outperformed the RAG system. This reveals a fundamental concept:

  ▶ FINE-TUNING teaches a model a new SKILL or BEHAVIOR.
    Our model became a 'specialist' at the SQuAD task, learning to generate
    the precise, extractive answers that the metrics reward.

  ▶ RAG p