In [1]:
import requests
import frontmatter
import io
import zipfile

In [23]:
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'
resp = requests.get(url)

In [26]:
repository_data = []

# Create a ZipFile object from the downloaded content
zf = zipfile.ZipFile(io.BytesIO(resp.content))

for file_info in zf.infolist():
    filename = file_info.filename.lower()
    # Only process markdown files
    if not filename.endswith('.md'):
        continue



    # Read and parse each file
    with zf.open(file_info) as f_in:
        content = f_in.read().decode('utf-8' , errors="ignore")
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

zf.close()


In [27]:
repository_data[1]

{'id': '9e508f2212',
 'question': 'Course: When does the course start?',
 'sort_order': 1,
 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.",
 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md'}

In [2]:
def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.
    
    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
    
    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com' 
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main'
    resp = requests.get(url)
    
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))
    
    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md') 
            or filename_lower.endswith('.mdx')):
            continue
    
        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue
    
    zf.close()
    return repository_data


In [3]:
rag_cookbooks = read_repo_data('athina-ai' , 'rag-cookbooks')

In [28]:
dtc_faq = read_repo_data('DataTalksClub', 'faq')
evidently_docs = read_repo_data('evidentlyai', 'docs')

print(f"FAQ documents: {len(dtc_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")


FAQ documents: 1219
Evidently documents: 95


## Day 2 Chunking  and Intelligent Processing for Data


## 1 Simple Chunking

In [4]:
def sliding_window(seq , size , step):
    if size <=0 or step <=0:
        raise ValueError("size and step must be positive")
    n = len(seq)
    result = []
    for i in range(0 , n , step):
        chunk = seq[i:i+size]
        result.append({'start' : i , 'chunk' : chunk})
        if i + size >=n:
            break 
            
    return result 

In [5]:
chunk = sliding_window(rag_cookbooks[0]['content'] , 400 , 100)

In [6]:
len(chunk)

105

In [7]:
RAG_chunks = []

for doc in rag_cookbooks:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content ,500 , 200)
    for chunk in chunks:
        chunk.update(doc_copy)
    RAG_chunks.extend(chunks)

In [8]:
len(RAG_chunks)

53

## 2 . Splitting by Paragraphs and Sections

In [51]:
import re

def split_markdown_by_level(text, level=2):
    """
    Split markdown text by a specific header level.
    
    :param text: Markdown text as a string
    :param level: Header level to split on
    :return: List of sections as strings
    """
    # This regex matches markdown headers
    # For level 2, it matches lines starting with "## "
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # Split and keep the headers
    parts = pattern.split(text)
    
    sections = []
    for i in range(1, len(parts), 3):
        # We step by 3 because regex.split() with
        # capturing groups returns:
        # [before_match, group1, group2, after_match, ...]
        # here group1 is "## ", group2 is the header text
        header = parts[i] + parts[i+1]  # "## " + "Title"
        header = header.strip()

        # Get the content after this header
        content = ""
        if i+2 < len(parts):
            content = parts[i+2].strip()

        if content:
            section = f'{header}\n\n{content}'
        else:
            section = header
        sections.append(section)
    
    return sections


In [52]:
rag_chunks = []

for doc in rag_cookbooks:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        rag_chunks.append(section_doc)


In [26]:
len(rag_chunks)

10

In [4]:
from openai import OpenAI

In [5]:
import os 
from dotenv import load_dotenv
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")


In [6]:
openai_client = OpenAI(api_key= api_key)


def llm(prompt, model='gpt-4o-mini'):
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = openai_client.responses.create(
        model='gpt-4o-mini',
        input=messages
    )

    return response.output_text


In [7]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()


def intelligent_chunking(text):
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections

from tqdm.auto import tqdm

rag_chunks = []

for doc in tqdm(rag_cookbooks):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        rag_chunks.append(section_doc)



  0%|          | 0/1 [00:00<?, ?it/s]

In [41]:
rag_chunks[0]

{'filename': 'rag-cookbooks-main/README.md',
 'section': '## Advanced + Agentic RAG Cookbooks\n\nWelcome to the comprehensive collection of advanced + agentic Retrieval-Augmented Generation (RAG) techniques.'}

In [9]:
len(rag_chunks)

19

In [11]:
rag_chunks[3]

{'filename': 'rag-cookbooks-main/README.md',
 'section': '## RAG Evaluation📊\n\nEvaluating RAG applications is essential for understanding their effectiveness. This evaluation checks the accuracy and relevance of RAG systems, helping to optimize performance and build confidence for real-world applications.\n\n![evals diagram](https://github.com/user-attachments/assets/65c2b5af-a931-40c5-b006-87567aef019f)'}

### Text Search

In [9]:
from minsearch import Index

In [10]:
index = Index(
    text_fields=  ["chunk", "title", "description", "filename"],
    keyword_fields= []
    
)

index.fit(RAG_chunks)

<minsearch.minsearch.Index at 0x2a73c208650>

In [11]:
query = "RAG evaluation"
results = index.search(query)
results

[{'start': 3800,
  'chunk': 'al with generative models by checking their accuracy and relevance. This evaluation helps improve RAG applications in tasks like text summarization, chatbots, and question-answering. It also identifies areas for improvement, ensuring that these systems provide trustworthy responses as information changes. Overall, effective evaluation helps optimize performance and builds confidence in RAG applications for real-world use. These notebooks contain an end-to-end RAG implementation + RAG evaluation ',
  'filename': 'rag-cookbooks-main/README.md'},
 {'start': 4000,
  'chunk': 's areas for improvement, ensuring that these systems provide trustworthy responses as information changes. Overall, effective evaluation helps optimize performance and builds confidence in RAG applications for real-world use. These notebooks contain an end-to-end RAG implementation + RAG evaluation part in Athina AI.\n\n![evals diagram](https://github.com/user-attachments/assets/65c2b5af-a

### Vector Search

In [12]:
from sentence_transformers import SentenceTransformer

In [13]:
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [14]:
record = RAG_chunks[20]

text = record['chunk']

v_doc = embedding_model.encode(text)


In [15]:
query = "Evaluating rag application"

v_query = embedding_model.encode(query)

similarity = v_query.dot(v_doc)


In [16]:
from tqdm.auto import tqdm
import numpy as np 

faq_embeddings = []

for d in tqdm(RAG_chunks):
    text =d['chunk']
    v = embedding_model.encode(text)
    faq_embeddings.append(v)
faq_embeddings = np.array(faq_embeddings)


  0%|          | 0/53 [00:00<?, ?it/s]

In [17]:
faq_embeddings

array([[ 0.15881318, -0.041525  ,  0.06855938, ...,  0.05195213,
        -0.02086912,  0.00230424],
       [ 0.15370825, -0.04568202,  0.05077348, ...,  0.02697159,
        -0.0393999 ,  0.01631664],
       [ 0.0958629 , -0.0182435 ,  0.02079821, ...,  0.05082281,
        -0.00261695,  0.01596041],
       ...,
       [ 0.05444733,  0.0338323 ,  0.00745371, ...,  0.06647051,
         0.03916425,  0.03215628],
       [ 0.06413162,  0.00811296, -0.00165808, ...,  0.03467955,
         0.04873566,  0.01943674],
       [ 0.07936952,  0.04640346, -0.02640165, ...,  0.00537769,
         0.02057582, -0.00446462]], shape=(53, 768), dtype=float32)

In [20]:
from minsearch import VectorSearch
faq_vindex = VectorSearch()

faq_vindex.fit(faq_embeddings , RAG_chunks)

<minsearch.vector.VectorSearch at 0x2a748cc0bf0>

In [21]:
query = "What are the main four components in RAG"
q = embedding_model.encode(query)
result = faq_vindex.search(q)

In [22]:
result

[{'start': 2400,
  'chunk': "ternal documents to improve the LLM's responses through in-context learning. RAG ensures that the information provided by the LLM is not only contextually relevant but also accurate and up-to-date.\n\n![final diagram](https://github.com/user-attachments/assets/508b3a87-ac46-4bf7-b849-145c5465a6c0)\n\nThere are four main components in RAG:\n\n**Indexing:** First, documents (in any format) are split into chunks, and embeddings for these chunks are created. These embeddings are then added to a vector sto",
  'filename': 'rag-cookbooks-main/README.md'},
 {'start': 2600,
  'chunk': "[final diagram](https://github.com/user-attachments/assets/508b3a87-ac46-4bf7-b849-145c5465a6c0)\n\nThere are four main components in RAG:\n\n**Indexing:** First, documents (in any format) are split into chunks, and embeddings for these chunks are created. These embeddings are then added to a vector store.\n\n**Retriever:** Then, the retriever finds the most relevant documents based 

### Hybrid Search

In [23]:

query = "What are the main four components in RAG"


text_results = index.search(query, num_results=3)

q = embedding_model.encode(query)
vector_results = faq_vindex.search(q, num_results=3)

final_results = text_results + vector_results


In [24]:
final_results[0]

{'start': 2400,
 'chunk': "ternal documents to improve the LLM's responses through in-context learning. RAG ensures that the information provided by the LLM is not only contextually relevant but also accurate and up-to-date.\n\n![final diagram](https://github.com/user-attachments/assets/508b3a87-ac46-4bf7-b849-145c5465a6c0)\n\nThere are four main components in RAG:\n\n**Indexing:** First, documents (in any format) are split into chunks, and embeddings for these chunks are created. These embeddings are then added to a vector sto",
 'filename': 'rag-cookbooks-main/README.md'}

In [25]:
def text_search(query):
    return index.search(query, num_results=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return faq_vindex.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)
    
    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)
    
    return combined_results


In [26]:
hybrid_search("What are the main components of RAG")

[{'start': 2600,
  'chunk': "[final diagram](https://github.com/user-attachments/assets/508b3a87-ac46-4bf7-b849-145c5465a6c0)\n\nThere are four main components in RAG:\n\n**Indexing:** First, documents (in any format) are split into chunks, and embeddings for these chunks are created. These embeddings are then added to a vector store.\n\n**Retriever:** Then, the retriever finds the most relevant documents based on the user's query, using techniques like vector similarity from the vector store.\n\n**Augment:** After that, the Augmen",
  'filename': 'rag-cookbooks-main/README.md'}]

### Agents and Tools

In [42]:
import openai 
import os 
from dotenv import load_dotenv
from groq import Groq
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

In [44]:
openai_client = openai.OpenAI()

groq_client = Groq()

user_prompt = "What are the main components of RAG"

chat_messages = [
    {"role": "user", "content": user_prompt}
]

response = groq_client.chat.completions.create(
    messages = chat_messages,
    model = "openai/gpt-oss-20b"
)


In [48]:
response.choices[0].message.content

'### Retrieval‑Augmented Generation (RAG) – The Core Architecture\n\nRAG is a family of NLP systems that **blend a retrieval component with a generative language model** to answer queries, complete texts, or perform other generation tasks while grounding the output in external documents.  The design is deliberately modular so that you can swap out each part for a different implementation or tune it separately.\n\nBelow is a concise breakdown of the **three main building blocks** that are universally present in a RAG pipeline:\n\n| # | Component | Key Responsibilities | Typical Tech Choices | Why It Matters |\n|---|-----------|----------------------|----------------------|----------------|\n| **1** | **Retriever** | • Finds the most relevant passages / documents for a given query. <br>• Creates a set of “context snippets” that the generator will see. | • **Sparse retrievers** – BM25, TF‑IDF, ElasticSearch <br>• **Dense retrievers** – DPR, Sentence‑BERT, CLIP‑style embeddings <br>• **Hyb

## Function Calling with OpenAI

In [58]:
text_search_tool = {
    "type": "function",
    "function" : {
    "name": "text_search",
    "description": "Search the rag cookbooks database ",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the rag database"
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }

    }
}


In [59]:
system_prompt = """
You are a helpful assistant for answering question. 
"""

user_prompt = "What are the main components of RAG"

chat_messages = [
    {"role": "user", "content": system_prompt},
    {'role' : 'user' , "content" : user_prompt}
]

response = groq_client.chat.completions.create(
    model = "openai/gpt-oss-20b" ,
    messages= chat_messages , 
    tools= [text_search_tool]
)



In [61]:
response_message

ChatCompletionMessage(content=None, role='assistant', executed_tools=None, function_call=None, reasoning='User asks: "What are the main components of RAG". We need to answer: main components of Retrieval-Augmented Generation: retrieval system, generator (LLM), re-ranking, index, prompt engineering, etc. Also mention RAG pipeline: query encoder, document retrieval, document encoder, reranker, generator. Provide explanation.', tool_calls=[ChatCompletionMessageToolCall(id='fc_aef8e937-7144-46d9-b433-899c36750f12', function=Function(arguments='{"query":"main components of Retrieval-Augmented Generation RAG components"}', name='text_search'), type='function')])

In [60]:
response_message = response.choices[0].message
tool_calls = response_message.tool_calls

tool_calls

[ChatCompletionMessageToolCall(id='fc_aef8e937-7144-46d9-b433-899c36750f12', function=Function(arguments='{"query":"main components of Retrieval-Augmented Generation RAG components"}', name='text_search'), type='function')]

In [64]:
import json

tool_call = tool_calls[0]
arguments = json.loads(tool_call.function.arguments)
print(arguments)

{'query': 'main components of Retrieval-Augmented Generation RAG components'}


In [65]:
result = text_search(**arguments)
result 

[{'start': 1000,
  'chunk': 'nsive collection of advanced + agentic Retrieval-Augmented Generation (RAG) techniques.\n\n## Introduction🚀\nRAG is a popular method that improves accuracy and relevance by finding the right information from reliable sources and transforming it into useful answers. This repository covers the most effective advanced + agentic RAG techniques with clear implementations and explanations.\n\nThe main goal of this repository is to provide a helpful resource for researchers and developers looking to use adv',
  'filename': 'rag-cookbooks-main/README.md'},
 {'start': 800,
  'chunk': 't%20on%20GitHub:%20https://github.com/athina-ai/rag-cookbooks)\n\n>If you find this repository helpful, please consider giving it a star⭐️\n\n# Advanced + Agentic RAG Cookbooks👨🏻\u200d💻\nWelcome to the comprehensive collection of advanced + agentic Retrieval-Augmented Generation (RAG) techniques.\n\n## Introduction🚀\nRAG is a popular method that improves accuracy and relevance by findi

In [67]:
tool_call.id

'fc_aef8e937-7144-46d9-b433-899c36750f12'

In [68]:
call_output = {
    "type": "function_call_output",
    "tool_call_id" : tool_call.id,
    "name": tool_call.function.name,
    "output": json.dumps(result),
}

call_output


{'type': 'function_call_output',
 'tool_call_id': 'fc_aef8e937-7144-46d9-b433-899c36750f12',
 'name': 'text_search',
 'output': '[{"start": 1000, "chunk": "nsive collection of advanced + agentic Retrieval-Augmented Generation (RAG) techniques.\\n\\n## Introduction\\ud83d\\ude80\\nRAG is a popular method that improves accuracy and relevance by finding the right information from reliable sources and transforming it into useful answers. This repository covers the most effective advanced + agentic RAG techniques with clear implementations and explanations.\\n\\nThe main goal of this repository is to provide a helpful resource for researchers and developers looking to use adv", "filename": "rag-cookbooks-main/README.md"}, {"start": 800, "chunk": "t%20on%20GitHub:%20https://github.com/athina-ai/rag-cookbooks)\\n\\n>If you find this repository helpful, please consider giving it a star\\u2b50\\ufe0f\\n\\n# Advanced + Agentic RAG Cookbooks\\ud83d\\udc68\\ud83c\\udffb\\u200d\\ud83d\\udcbb\\nWelc

In [74]:
response.choices[0].message.content

In [75]:
system_prompt = """
You are a helpful assistant for a course. 

Always search for relevant information before answering. 
If the first search doesn't give you enough information, try different search terms.

Make multiple searches if needed to provide comprehensive answers.
"""


In [76]:
from typing import List, Any

def text_search(query: str) -> List[Any]:
    """
    Perform a text-based search on the RAG cookbook data.

    Args:
        query (str): The search query string.

    Returns:
        List[Any]: A list of up to 5 search results returned by the RAG cookbook.
    """
    return index.search(query, num_results=5)


In [77]:
from pydantic import BaseModel
from pydantic_ai import Agent

In [78]:
agent = Agent(
    name="rag_agent",
    instructions=system_prompt,
    tools=[text_search],
    model='gpt-4o-mini'
)


In [79]:
question = "What are the main components of RAG"

result = await agent.run(user_prompt=question)

In [83]:
result

AgentRunResult(output="Retrieval-Augmented Generation (RAG) consists of four main components:\n\n1. **Indexing**: Initially, documents are divided into smaller chunks, and embeddings for these chunks are generated. These embeddings are stored in a vector store, which allows for efficient retrieval based on a query.\n\n2. **Retrieval**: When a query is made, the system retrieves relevant document chunks from the vector store. This step is crucial for ensuring that the response generated is based on the most pertinent and up-to-date information.\n\n3. **Generation**: The retrieved document chunks are then used as context for a language model (like GPT) to generate responses. The model combines the information from these document chunks with its own learned knowledge to provide a coherent and relevant answer.\n\n4. **In-context Learning**: This component enhances the model's ability to use external documents effectively in generating responses, ensuring that the outputs are contextually r