<a href="https://colab.research.google.com/github/ACicmansky/RAG-Workshop/blob/main/lab2-retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔍 FAISS Similarity Search with Sentence Transformers + RAG

In [3]:
import requests
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import getpass
import os
import openai


In [1]:
!pip install sentence-transformers pymupdf faiss-cpu openai


Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==

In [4]:
# --- Step 1: Download PDFs from arXiv ---
pdf_urls = [
    "https://arxiv.org/pdf/2401.15884",
    "https://arxiv.org/pdf/2005.11401"
]

local_paths = []
for i, url in enumerate(pdf_urls):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    # Stop on HTTP errors so an error page is never written to disk as a "PDF".
    response.raise_for_status()
    filename = f"paper_{i}.pdf"
    with open(filename, "wb") as f:
        f.write(response.content)
    local_paths.append(filename)

print(f"Downloaded {len(local_paths)} PDFs.")


Downloaded 2 PDFs.


In [5]:
# --- Step 2: Extract text using PyMuPDF ---
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path``, concatenated in page order.

    Args:
        path: Filesystem path of the PDF to open with PyMuPDF.

    Returns:
        A single string containing ``page.get_text()`` for each page, with no
        separator inserted between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(path) as doc:
        return "".join(page.get_text() for page in doc)

# Extract each downloaded paper, then merge them into one newline-separated corpus.
texts = list(map(extract_text_from_pdf, local_paths))
full_text = "\n".join(texts)
print(f"Total extracted characters: {len(full_text)}")


Total extracted characters: 130355


In [6]:
# --- Step 3: Split text into overlapping chunks ---
def split_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into fixed-size character chunks that overlap their neighbours.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk begins with the last ``overlap`` characters of the previous one. The
    final chunk may be shorter than ``chunk_size``. Chunking stops as soon as a
    chunk reaches the end of ``text``, so no chunk is ever a pure suffix of the
    one before it.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        List of chunk strings in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        # Once a chunk touches the end of the text, any further window would
        # only repeat characters already covered by this chunk.
        if end == len(text):
            break
        start += step
    return chunks

# Chunk the combined corpus with the default chunk size and overlap.
chunks = split_text(text=full_text)
print(f"Generated {len(chunks)} chunks.")


Generated 326 chunks.


In [7]:
# --- Step 4: Generate embeddings ---
# Load the sentence-embedding model once; later cells reuse `model` to embed queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every chunk, asking for a NumPy array and a progress bar.
embeddings = model.encode(
    chunks,
    show_progress_bar=True,
    convert_to_numpy=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [8]:
# --- Step 5: Build FAISS index ---
# The index is created with the embedding width (second axis of `embeddings`).
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add every chunk embedding to the index; `index.ntotal` then reports how many it holds.
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors.")


FAISS index built with 326 vectors.


In [9]:
# --- Step 6: Similarity search ---
def search(query, k=3):
    """Print the chunks closest to ``query`` according to the FAISS index.

    The query is embedded with the notebook-level ``model`` and looked up in
    the notebook-level ``index``. For each hit, the value returned by the index
    and the first 500 characters of the matching entry of ``chunks`` are
    printed. The value labelled "score" is the distance reported by the index,
    so smaller values mean a closer match.

    Args:
        query: Query string to embed and search for.
        k: Number of nearest neighbours to request from the index.

    Returns:
        None. Results are written to standard output only.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    print(f"\nTop {k} results for: '{query}'\n")
    for i, idx in enumerate(indices[0]):
        # FAISS pads the result with -1 when it holds fewer than k vectors;
        # chunks[-1] would otherwise show the last chunk as if it were a hit.
        if idx < 0:
            continue
        print(f"Result {i+1} (score={distances[0][i]:.2f}):\n{chunks[idx][:500]}\n{'-'*80}")


In [10]:
# --- Step 7: Try some sample questions ---
# Questions about the downloaded RAG papers, used to eyeball retrieval quality.
sample_questions = [
    "What is RAG and how does it work?",
    "What is the difference between RAG-Sequence and RAG-Token?",
    "How does RAG use non-parametric memory?",
    "What tasks were used to evaluate RAG?",
    "How is Dense Passage Retrieval (DPR) used in RAG?",
    "What is the advantage of hybrid models over purely parametric models?",
    "What decoding strategies are used in RAG?",
    "How does RAG compare to T5 and BART?",
    "What datasets were used to benchmark RAG models?",
    "Can RAG models be updated without retraining?",
]

# Show the two nearest chunks for each question.
for q in sample_questions:
    search(query=q, k=2)



Top 2 results for: 'What is RAG and how does it work?'

Result 1 (score=0.80):
etability. RAG could be
employed in a wide variety of scenarios with direct beneﬁt to society, for example by endowing it
with a medical index and asking it open-domain questions on that topic, or by helping people be more
effective at their jobs.
With these advantages also come potential downsides: Wikipedia, or any potential external knowledge
source, will probably never be entirely factual and completely devoid of bias. Since RAG can be
employed as a language model, similar concerns as for GP
--------------------------------------------------------------------------------
Result 2 (score=0.90):
raphy (Min et al., 2023), Pub Health (Zhang et al.,
2023a), and Arc-Challenge (Bhakthavatsalam et al.,
2021) show that CRAG can significantly improve
the performance of standard RAG and state-of-the-
art Self-RAG, demonstrating its generalizability
across both short- and long-form generation tasks.
To facilitate o

In [None]:

# Securely prompt for your OpenAI API key
# An empty value is treated like a missing one, so a blank variable still
# triggers the prompt instead of configuring the client with an empty key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

openai.api_key = os.getenv("OPENAI_API_KEY")

def rag_answer(question, k=2, model_name="gpt-4o"):
    """Answer ``question`` with an OpenAI chat model, using retrieved chunks as context.

    The question is embedded with the notebook-level ``model``, the nearest
    entries are looked up in the notebook-level ``index``, and the matching
    entries of ``chunks`` are joined into a context block. That context and the
    question are sent to the chat completions API, and the question and the
    reply are printed.

    Args:
        question: Question to answer.
        k: Number of chunks to request from the index as context.
        model_name: Name of the OpenAI chat model to call.

    Returns:
        None. The question and answer are written to standard output only.
    """
    query_embedding = model.encode([question], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    # FAISS pads the result with -1 when it holds fewer than k vectors; skip
    # those so chunks[-1] is not silently injected as context.
    retrieved_context = "\n\n".join(chunks[i] for i in indices[0] if i >= 0)

    prompt = f"Context:\n{retrieved_context}\n\nAnswer this Question based only on the provided context: {question}\nAnswer:"

    response = openai.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based only on the provided context."},
            {"role": "user", "content": prompt}
        ],
        # temperature=0 asks for the least random sampling.
        temperature=0
    )

    answer = response.choices[0].message.content
    print(f"\nQuestion: {question}\nAnswer: {answer}")

# 🔍 Run the retrieve-then-generate pipeline on one example question
rag_answer(question="What is retrieval augmented generation?")
