In [1]:
!pip -q install faiss-cpu sentence-transformers transformers accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import re
import numpy as np
import faiss

from sentence_transformers import SentenceTransformer
from transformers import pipeline

In [3]:
documents = [
    {
        "source": "doc_1",
        "text": """Retrieval-Augmented Generation (RAG) combines information retrieval with text generation.
        A retriever finds relevant documents, and a generator uses them to produce an answer."""
    },
    {
        "source": "doc_2",
        "text": """Vector search uses embeddings to represent text in a dense space.
        FAISS is a library for efficient similarity search and clustering of dense vectors."""
    },
    {
        "source": "doc_3",
        "text": """Chunking splits long documents into smaller passages to improve retrieval.
        Common chunk sizes range from 200 to 500 tokens, with overlap to preserve context."""
    },
]

In [4]:
def chunk_text(text, chunk_size=350, overlap=80):
    text = re.sub(r"\s+", " ", text).strip()
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start += (chunk_size - overlap)
    return chunks

chunks = []
for doc in documents:
    for c in chunk_text(doc["text"]):
        chunks.append({"source": doc["source"], "text": c})

len(chunks), chunks[0]

(3,
 {'source': 'doc_1',
  'text': 'Retrieval-Augmented Generation (RAG) combines information retrieval with text generation. A retriever finds relevant documents, and a generator uses them to produce an answer.'})

In [5]:
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

texts = [c["text"] for c in chunks]
emb = embed_model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
emb = np.array(emb).astype("float32")

dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)  # cosine similarity (با normalize) => inner product
index.add(emb)

print("Index size:", index.ntotal)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]



config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Index size: 3


In [6]:
def retrieve(query, top_k=3):
    q_emb = embed_model.encode([query], normalize_embeddings=True)
    q_emb = np.array(q_emb).astype("float32")
    scores, ids = index.search(q_emb, top_k)
    results = []
    for score, idx in zip(scores[0], ids[0]):
        results.append({
            "score": float(score),
            "source": chunks[idx]["source"],
            "text": chunks[idx]["text"]
        })
    return results

In [8]:
gen = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=150)

def answer_question(query, top_k=3):
    ctxs = retrieve(query, top_k=top_k)
    context = "\n\n".join([f"[{i+1}] ({c['source']}) {c['text']}" for i, c in enumerate(ctxs)])

    prompt = f"""Answer the question using ONLY the context.
If the answer is not in the context, say: "I don't know based on the provided documents."

Question: {query}

Context:
{context}

Answer:"""
    out = gen(prompt)[0]["generated_text"]
    return out, ctxs

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Passing `generation_config` together with generation-related arguments=({'max_new_tokens'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'AfmoeForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'BltForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'CwmForCausalLM', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaF

In [9]:
q = "What is RAG and how does it work?"
ans, ctx = answer_question(q, top_k=3)
print("Answer:\n", ans)
print("\nCitations:")
for i, c in enumerate(ctx, 1):
    print(i, c["source"], "score=", round(c["score"], 3))

Both `max_new_tokens` (=150) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer:
 Answer the question using ONLY the context.
If the answer is not in the context, say: "I don't know based on the provided documents."

Question: What is RAG and how does it work?

Context:
[1] (doc_1) Retrieval-Augmented Generation (RAG) combines information retrieval with text generation. A retriever finds relevant documents, and a generator uses them to produce an answer.

[2] (doc_3) Chunking splits long documents into smaller passages to improve retrieval. Common chunk sizes range from 200 to 500 tokens, with overlap to preserve context.

[3] (doc_2) Vector search uses embeddings to represent text in a dense space. FAISS is a library for efficient similarity search and clustering of dense vectors.

Answer:sing data extraction schemes created throughout Google Books (for both copyright purposes Issador copy monopolizing what should actually void at every

Citations:
1 doc_1 score= 0.466
2 doc_3 score= 0.104
3 doc_2 score= 0.039
