# Let's go PRO!

Advanced RAG Techniques!

Let's start by digging into ingest:

1. No LangChain! Just native for maximum flexibility
2. Let's use an LLM to divide up chunks in a sensible way
3. Let's use the best chunk size and encoder from yesterday
4. Let's also have the LLM rewrite chunks in a way that's most useful ("document pre-processing")

In [15]:
!pip install litellm chromadb tqdm sentence-transformers

from pathlib import Path
import os
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from chromadb import PersistentClient
from tqdm import tqdm
from litellm import completion, acompletion
import asyncio

# Load env
load_dotenv(override=True)

# -------- MODEL ----------
# NVIDIA NIM (correct)
MODEL = "nvidia_nim/meta/llama3-8b-instruct"

# -------- PATHS ----------
DB_NAME = "preprocessed_db"
collection_name = "docs"
KNOWLEDGE_BASE_PATH = Path("knowledge-base")

AVERAGE_CHUNK_SIZE = 400

# Test that NVIDIA key is present
print("NVIDIA_API_KEY found:", bool(os.getenv("NVIDIA_API_KEY")))


NVIDIA_API_KEY found: True


In [16]:
# A class similar to LangChain's Result
class Result(BaseModel):
    page_content: str
    metadata: dict


# Chunk model (fixed: added metadata)
class Chunk(BaseModel):
    headline: str
    summary: str
    original_text: str
    metadata: dict = {}  # <-- Added


class Chunks(BaseModel):
    chunks: list[Chunk]


In [17]:
def fetch_documents():
    documents = []

    for folder in KNOWLEDGE_BASE_PATH.iterdir():
        doc_type = folder.name
        for file in folder.rglob("*.md"):
            with open(file, "r", encoding="utf-8") as f:
                documents.append({
                    "type": doc_type,
                    "source": file.as_posix(),
                    "text": f.read()
                })

    print(f"Loaded {len(documents)} documents")
    return documents


documents = fetch_documents()


Loaded 30 documents


In [18]:
def make_prompt(document):
    how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1

    return f"""
Split the following document into clean, overlapping chunks.

Document Type: {document["type"]}
Source: {document["source"]}

Rules:
- Do NOT leave out any content.
- Use ~{how_many} chunks (flexible).
- Add ~25% overlap (50 words).
- Keep chunk text unchanged.
- Output only JSON.

Document:
{document["text"]}

Output only valid JSON.
"""


## Three steps:

1. Fetch documents from the knowledge base, like LangChain did
2. Call an LLM to turn documents into Chunks
3. Store the Chunks in Chroma

That's it!

### Let's start with Step 1

In [19]:
def make_messages(document):
    return [
        {
            "role": "system",
            "content": """
You MUST respond ONLY with this JSON format:

{
  "chunks": [
    {
      "headline": "...",
      "summary": "...",
      "original_text": "..."
    }
  ]
}

NO extra text. NO explanations.
"""
        },
        {"role": "user", "content": make_prompt(document)}
    ]


In [20]:
async def process_document_async(document):
    MAX_RETRIES = 5

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = await acompletion(
                model=MODEL,
                messages=make_messages(document)
            )

            reply = response.choices[0].message.content

            # Parse JSON
            parsed = Chunks.model_validate_json(reply).chunks

            # Attach metadata
            for c in parsed:
                c.metadata = {
                    "source": document["source"],
                    "type": document["type"]
                }

            return parsed

        except Exception as e:
            print(f"[Retry {attempt}/{MAX_RETRIES}] Error â†’ {e}")
            await asyncio.sleep(attempt * 2)

    raise RuntimeError(f"Failed after {MAX_RETRIES} retries â†’ {document['source']}")


### Donezo! On to Step 2 - make the chunks

In [21]:
async def create_chunks_parallel(documents):
    all_chunks = []

    for doc in tqdm(documents, desc="Processing documents", unit="doc"):
        chunks = await process_document_async(doc)
        all_chunks.extend(chunks)

    return all_chunks


# Run processing
chunks = await create_chunks_parallel(documents)

print("Total chunks created:", len(chunks))
chunks[:2]


Processing documents: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30/30 [01:24<00:00,  2.83s/doc]

Total chunks created: 127





[Chunk(headline='Professional Bio', summary='Dedicated Computer Science Engineering student with strong analytical mindset and passion for technology.', original_text='Ayush Tyagi is a dedicated Computer Science Engineering student at JIMS, Greater Noida (IP University), on track to graduate in 2025. With a strong foundation built during his PCM schooling at Vivekanand School, Anand Vihar, he has channeled his analytical mindset into a passion for technology. Ayush thrives on the creative process of front-end development, specializing in building visually stunning, user-friendly, and innovative web applications.', metadata={'source': 'knowledge-base/about/bio.md', 'type': 'about'}),
 Chunk(headline='Unique Combination', summary='Creative eye and logical, problem-solving mindset with a natural strength for designing intuitive and engaging interfaces.', original_text="My uniqueness lies in the powerful combination of a creative eye and a logical, problem-solving mindset. I don't just wri

In [22]:
from sentence_transformers import SentenceTransformer

# local embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")


def create_embeddings(chunks):
    chroma = PersistentClient(path=DB_NAME)

    # Delete old collection
    if collection_name in [c.name for c in chroma.list_collections()]:
        chroma.delete_collection(collection_name)

    # Prepare data
    texts = [
        chunk.headline + "\n\n" + chunk.summary + "\n\n" + chunk.original_text
        for chunk in chunks
    ]
    metas = [chunk.metadata for chunk in chunks]
    ids = [str(i) for i in range(len(chunks))]

    print("Generating embeddings locally...")

    vectors = embedder.encode(texts, convert_to_numpy=True)

    # Add to Chroma
    collection = chroma.get_or_create_collection(collection_name)
    collection.add(
        ids=ids,
        embeddings=vectors.tolist(),
        documents=texts,
        metadatas=metas
    )

    print(f"Vectorstore created with {collection.count()} documents")


In [23]:
create_embeddings(chunks)


Generating embeddings locally...
Vectorstore created with 127 documents


In [24]:
import numpy as np

# Load Chroma and retrieve stored vectors
chroma = PersistentClient(path=DB_NAME)
collection = chroma.get_or_create_collection(collection_name)

result = collection.get(include=["embeddings", "documents", "metadatas"])

# Extract values
vectors = np.array(result["embeddings"])
documents = result["documents"]
metadatas = result["metadatas"]

# Extract document types from metadata
doc_types = [meta.get("type", "unknown") for meta in metadatas]

# Assign colors for visualization
unique_types = list(sorted(set(doc_types)))

# generate a color map automatically for ANY number of folders
color_palette = [
    "blue", "green", "red", "orange", 
    "purple", "cyan", "pink", "yellow",
    "brown", "gray"
]

# map each folder type to a unique color
type_to_color = {
    doc_type: color_palette[i % len(color_palette)]
    for i, doc_type in enumerate(unique_types)
}

# final colors list per document
colors = [type_to_color[t] for t in doc_types]

print("Found document types:", unique_types)
print("Color map:", type_to_color)


Found document types: ['about', 'education', 'experience', 'extras', 'projects', 'skills']
Color map: {'about': 'blue', 'education': 'green', 'experience': 'red', 'extras': 'orange', 'projects': 'purple', 'skills': 'cyan'}


In [25]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import numpy as np

# --- SAFETY: If vectors are large, reduce perplexity ---
perplexity = min(30, len(vectors) - 1)

tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=perplexity,
    learning_rate="auto",
    init="pca"
)

reduced_vectors = tsne.fit_transform(vectors)

# --- HOVER TEXT (shortened for speed) ---
hover_texts = [
    f"<b>Type:</b> {t}<br><b>Source:</b> {m.get('source','')}<br><br>{d[:150]}..."
    for t, m, d in zip(doc_types, metadatas, documents)
]

# --- PLOT ---
fig = go.Figure(data=[
    go.Scatter(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        mode='markers',
        marker=dict(
            size=6,
            color=colors,
            opacity=0.8,
            line=dict(width=0.5, color="black")
        ),
        text=hover_texts,
        hoverinfo='text'
    )
])

fig.update_layout(
    title="ðŸ“Œ 2D Vector Store t-SNE Visualization",
    xaxis_title="t-SNE dimension 1",
    yaxis_title="t-SNE dimension 2",
    width=900,
    height=700,
    margin=dict(l=20, r=20, t=60, b=20),
    template="plotly_dark"
)

fig.show()


In [27]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import numpy as np

# --- SAFETY FIX: perplexity must be < num_samples ---
perplexity = min(30, len(vectors) - 1)

# --- 3D TSNE ---
tsne = TSNE(
    n_components=3,
    random_state=42,
    perplexity=perplexity,
    init="pca",
    learning_rate="auto"
)

reduced_vectors = tsne.fit_transform(vectors)

# --- Hover text with type + preview ---
hover_texts = [
    f"<b>Type:</b> {t}<br><b>Source:</b> {m.get('source','')}<br><br>{d[:150]}..."
    for t, m, d in zip(doc_types, metadatas, documents)
]

# --- 3D Scatter Plot ---
fig = go.Figure(data=[
    go.Scatter3d(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        z=reduced_vectors[:, 2],
        mode='markers',
        marker=dict(
            size=6,
            color=colors,
            opacity=0.85,
            line=dict(width=0.5, color="black")
        ),
        text=hover_texts,
        hoverinfo='text'
    )
])

fig.update_layout(
    title="ðŸŒŒ 3D t-SNE Visualization of Ayushâ€™s Vector Database",
    scene=dict(
        xaxis_title="t-SNE Dim 1",
        yaxis_title="t-SNE Dim 2",
        zaxis_title="t-SNE Dim 3",
    ),
    width=950,
    height=750,
    template="plotly_dark",
    margin=dict(l=10, r=10, t=60, b=10)
)

fig.show()


### Well that was easy! If a bit slow.

In the python module version, I sneakily use the multi-processing Pool to run this in parallel,
but if you get a Rate Limit Error you can turn this off in the code.

### Finally, Step 3 - save the embeddings

# Nothing more to do here... right?

Wait! Didja think I'd forget??

## And now - let's build an Advanced RAG!

We will use these techniques:

1. Reranking - reorder the rank results
2. Query re-writing

In [None]:
class RankOrder(BaseModel):
    order: list[int] = Field(
        description="The order of relevance of chunks, from most relevant to least relevant, by chunk id number"
    )

In [None]:
def rerank(question, chunks):
    system_prompt = """
You are a document re-ranker.
You are provided with a question and a list of relevant chunks of text from a query of a knowledge base.
The chunks are provided in the order they were retrieved; this should be approximately ordered by relevance, but you may be able to improve on that.
You must rank order the provided chunks by relevance to the question, with the most relevant chunk first.
Reply only with the list of ranked chunk ids, nothing else. Include all the chunk ids you are provided with, reranked.
"""
    user_prompt = f"The user has asked the following question:\n\n{question}\n\nOrder all the chunks of text by relevance to the question, from most relevant to least relevant. Include all the chunk ids you are provided with, reranked.\n\n"
    user_prompt += "Here are the chunks:\n\n"
    for index, chunk in enumerate(chunks):
        user_prompt += f"# CHUNK ID: {index + 1}:\n\n{chunk.page_content}\n\n"
    user_prompt += "Reply only with the list of ranked chunk ids, nothing else."
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = completion(model=MODEL, messages=messages, response_format=RankOrder)
    reply = response.choices[0].message.content
    order = RankOrder.model_validate_json(reply).order
    print(order)
    return [chunks[i - 1] for i in order]

In [None]:
RETRIEVAL_K = 10

def fetch_context_unranked(question):
    query = openai.embeddings.create(model=embedding_model, input=[question]).data[0].embedding
    results = collection.query(query_embeddings=[query], n_results=RETRIEVAL_K)
    chunks = []
    for result in zip(results["documents"][0], results["metadatas"][0]):
        chunks.append(Result(page_content=result[0], metadata=result[1]))
    return chunks

In [None]:
question = "Who won the IIOTY award?"
chunks = fetch_context_unranked(question)

In [None]:
for chunk in chunks:
    print(chunk.page_content[:15]+"...")

In [None]:
reranked = rerank(question, chunks)

In [None]:
for chunk in reranked:
    print(chunk.page_content[:15]+"...")

In [None]:
question = "Who went to Manchester University?"
RETRIEVAL_K = 20
chunks = fetch_context_unranked(question)
for index, c in enumerate(chunks):
    if "manchester" in c.page_content.lower():
        print(index)

In [None]:
reranked = rerank(question, chunks)

In [None]:
for index, c in enumerate(reranked):
    if "manchester" in c.page_content.lower():
        print(index)

In [None]:
reranked[0].page_content

In [None]:
def fetch_context(question):
    chunks = fetch_context_unranked(question)
    return rerank(question, chunks)

In [None]:
SYSTEM_PROMPT = """
You are a knowledgeable, friendly assistant representing the company Insurellm.
You are chatting with a user about Insurellm.
Your answer will be evaluated for accuracy, relevance and completeness, so make sure it only answers the question and fully answers it.
If you don't know the answer, say so.
For context, here are specific extracts from the Knowledge Base that might be directly relevant to the user's question:
{context}

With this context, please answer the user's question. Be accurate, relevant and complete.
"""

In [None]:
# In the context, include the source of the chunk

def make_rag_messages(question, history, chunks):
    context = "\n\n".join(f"Extract from {chunk.metadata['source']}:\n{chunk.page_content}" for chunk in chunks)
    system_prompt = SYSTEM_PROMPT.format(context=context)
    return [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": question}]

In [None]:
def rewrite_query(question, history=[]):
    """Rewrite the user's question to be a more specific question that is more likely to surface relevant content in the Knowledge Base."""
    message = f"""
You are in a conversation with a user, answering questions about the company Insurellm.
You are about to look up information in a Knowledge Base to answer the user's question.

This is the history of your conversation so far with the user:
{history}

And this is the user's current question:
{question}

Respond only with a single, refined question that you will use to search the Knowledge Base.
It should be a VERY short specific question most likely to surface content. Focus on the question details.
Don't mention the company name unless it's a general question about the company.
IMPORTANT: Respond ONLY with the knowledgebase query, nothing else.
"""
    response = completion(model=MODEL, messages=[{"role": "system", "content": message}])
    return response.choices[0].message.content

In [None]:
rewrite_query("Who won the IIOTY award?", [])

In [None]:
def answer_question(question: str, history: list[dict] = []) -> tuple[str, list]:
    """
    Answer a question using RAG and return the answer and the retrieved context
    """
    query = rewrite_query(question, history)
    print(query)
    chunks = fetch_context(query)
    messages = make_rag_messages(question, history, chunks)
    response = completion(model=MODEL, messages=messages)
    return response.choices[0].message.content, chunks

In [None]:
answer_question("Who won the IIOTY award?", [])

In [None]:
answer_question("Who went to Manchester University?", [])