Follow along with this tutorial: https://github.com/alexeygrigorev/rag-agents-workshop

In [1]:
%pip install minsearch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import requests 

# Download the FAQ data: a JSON list of courses, each carrying its own "documents" list.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list, tagging every document
# with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
from minsearch import AppendableIndex

# Build the FAQ index over the flattened documents.
# "course" is the keyword field that `search` filters on; the other three
# fields are the text fields the query is matched against.
index = AppendableIndex(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x2878bbc5e50>

In [36]:
def search(query):
    """Look up `query` in the FAQ index, restricted to the data-engineering-zoomcamp course.

    The index is asked for 5 results, with the `question` field boosted (3.0)
    and the `section` field down-weighted (0.5). `output_ids=True` is
    presumably what adds the `_id` field that `dedup` keys on.

    Returns:
        Whatever `index.search` returns (a list of matching documents).
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
        output_ids=True,
    )

In [5]:
question = 'Can I still join the course?'

In [6]:
# Plain-RAG prompt: the model must answer strictly from the FAQ context.
# Filled in with str.format(question=..., context=...).
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()


def build_prompt(query, search_results, template=prompt_template):
    """Render the RAG prompt for `query` from the retrieved FAQ entries.

    Args:
        query: The student's question, substituted for `{question}`.
        search_results: Iterable of dicts with `section`, `question` and
            `text` keys; each becomes one entry of the `{context}` block.
        template: Format string with `{question}` and `{context}` fields.
            It defaults to the template defined in this cell and is bound at
            definition time, so rebinding the global `prompt_template` later
            in the notebook does not change what this function (and `rag`)
            produces.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )
    return template.format(question=query, context=context).strip()

In [7]:
search_results = search(question)

In [8]:
prompt = build_prompt(question, search_results)

In [4]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Pull settings from a .env file into the process environment.
load_dotenv()

# The OpenAI client is pointed at Groq's OpenAI-compatible endpoint.
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    # Index instead of .get(): a missing key fails right here with a clear
    # KeyError. With api_key=None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which would then be sent to Groq.
    api_key=os.environ["GROQ_API_KEY"]
)

def llm(prompt, model='llama-3.3-70b-versatile'):
    """Send `prompt` as a single user message and return the model's text reply.

    Args:
        prompt: Text used as the content of the one user message.
        model: Model identifier passed to the chat-completions endpoint.
            Defaults to the model this notebook originally hard-coded.

    Returns:
        The `content` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

In [10]:
answer = llm(prompt)

In [11]:
print(answer)

Yes, you can still join the course. According to the course information, you can submit homework even if you don't register, and you can follow the course at your own pace. However, be aware of the deadlines for turning in final projects.


In [12]:
def rag(query):
    """Answer `query` in one pass: search the FAQ, build the prompt, ask the LLM."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [13]:
rag("How do I patch KDE under FreeBSD?")

'There is no information available in the CONTEXT to answer the question about patching KDE under FreeBSD.'

## "Agentic" RAG

In [49]:
# Prompt for the "agentic" RAG step: the model must reply with a JSON object
# whose "action" is either SEARCH or ANSWER.
# The template is filled with str.format(question=..., context=...), so the
# doubled braces {{ }} are escapes that render as literal { } in the JSON examples.
# NOTE: this rebinds `prompt_template`, replacing the plain-RAG template defined earlier.
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}

Make sure that each output you return is only a well formatted JSON.
""".strip()

In [50]:
question = 'Can I still join the course?'
context = 'EMPTY'

In [51]:
prompt = prompt_template.format(question=question, context=context)

In [52]:
answer_json = llm(prompt)

In [40]:
import json

In [54]:
answer = json.loads(answer_json)

In [20]:
answer

{'action': 'SEARCH',
 'reasoning': 'The context is empty, so I need to search the FAQ database to find out if the student can still join the course. The FAQ database may have information on course enrollment deadlines, eligibility criteria, and registration procedures.'}

In [21]:
def build_context(search_results):
    """Format FAQ entries as the text placed inside the prompt's CONTEXT block.

    Each entry contributes its `section`, `question` and `text` (labelled
    "answer") followed by a blank line; leading and trailing whitespace of
    the combined text is stripped. An empty input yields an empty string.
    """
    entries = (
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )
    return "".join(entries).strip()

In [22]:
search_results = search(question)
context = build_context(search_results)
prompt = prompt_template.format(question=question, context=context)

In [23]:
answer_json = llm(prompt)

In [24]:
print(answer_json)

{
"action": "ANSWER",
"answer": "Yes, you can still join the course. According to our course policies, even if you don't register, you're still eligible to submit the homeworks. However, be aware that there will be deadlines for turning in the final projects, so don't leave everything for the last minute.",
"source": "CONTEXT"
}


In [25]:
def agentic_rag(question):
    """Let the model decide whether to search the FAQ before answering `question`.

    The model is first asked with the context set to "EMPTY". If its JSON
    reply has the action "SEARCH", the FAQ is searched with the original
    question and the model is asked a second time with the retrieved context.
    Every parsed reply is printed.

    Returns:
        The last parsed JSON reply (a dict).
    """
    def ask(context):
        # One round-trip: fill the template, call the LLM, parse and echo the JSON reply.
        filled = prompt_template.format(question=question, context=context)
        decision = json.loads(llm(filled))
        print(decision)
        return decision

    answer = ask("EMPTY")

    if answer['action'] == 'SEARCH':
        print('need to perform search...')
        answer = ask(build_context(search(question)))

    return answer

In [26]:
agentic_rag('How do I setup Docker?')

{'action': 'ANSWER', 'answer': "To set up Docker, you'll need to download and install the Docker Desktop application from the official Docker website. Once installed, follow these steps: 1) Launch Docker Desktop, 2) Create a Docker account or log in if you already have one, 3) Verify that Docker is running by opening a terminal or command prompt and typing 'docker --version' or 'docker run hello-world', 4) Pull a Docker image from Docker Hub using 'docker pull <image_name>', and 5) Run a Docker container using 'docker run <image_name>'. You can also use the Docker CLI to manage images, containers, and volumes.", 'source': 'OWN_KNOWLEDGE'}


{'action': 'ANSWER',
 'answer': "To set up Docker, you'll need to download and install the Docker Desktop application from the official Docker website. Once installed, follow these steps: 1) Launch Docker Desktop, 2) Create a Docker account or log in if you already have one, 3) Verify that Docker is running by opening a terminal or command prompt and typing 'docker --version' or 'docker run hello-world', 4) Pull a Docker image from Docker Hub using 'docker pull <image_name>', and 5) Run a Docker container using 'docker run <image_name>'. You can also use the Docker CLI to manage images, containers, and volumes.",
 'source': 'OWN_KNOWLEDGE'}

In [27]:
agentic_rag('How can I get the course certificate?')

{'action': 'SEARCH', 'reasoning': 'The context is empty, so I need to search the FAQ database to find the requirements for obtaining a course certificate.'}
need to perform search...
{'action': 'ANSWER', 'answer': "To get a course certificate, you need to finish the course with a 'live' cohort. This is because you need to peer-review capstone(s) after submitting a project, which can only be done when the course is running. You cannot get a certificate if you take the course in self-paced mode.", 'source': 'CONTEXT'}


{'action': 'ANSWER',
 'answer': "To get a course certificate, you need to finish the course with a 'live' cohort. This is because you need to peer-review capstone(s) after submitting a project, which can only be done when the course is running. You cannot get a certificate if you take the course in self-paced mode.",
 'source': 'CONTEXT'}

## Agentic Search

Remove duplicated query results to save tokens.

In [28]:
def dedup(seq):
    """Drop elements whose `_id` was already seen, keeping first occurrences in order.

    Args:
        seq: Iterable of dict-like items that each have an `_id` key.

    Returns:
        A new list with one element per distinct `_id`.
    """
    first_by_id = {}
    for item in seq:
        # setdefault only stores the item when its id is not present yet.
        first_by_id.setdefault(item['_id'], item)
    return list(first_by_id.values())

In [34]:
# Prompt for the multi-step "agentic search" loop. On each iteration the model
# replies with one JSON action: SEARCH (with a "keywords" list of queries),
# ANSWER_CONTEXT, or ANSWER.
# Fields filled via str.format: max_iterations, iteration_number, question,
# search_queries, context, previous_actions. Doubled braces {{ }} are escapes
# that render as literal { } in the JSON examples.
# NOTE: this rebinds `prompt_template` again, replacing the template used by `agentic_rag`.
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.

Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}

Make sure that each output you return per iteration is only a well formatted JSON.

<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [35]:
question = 'how do I do well on module 1'
max_iterations = 3
iteration_number = 0
search_queries = []
search_results  = []
previous_actions = []

In [36]:
# First pass of the manual walkthrough: nothing has been searched yet, so the
# context, the query list and the action history all render as empty text.
context = build_context(search_results)

prompt = prompt_template.format(
    max_iterations=max_iterations,
    iteration_number=iteration_number,
    question=question,
    search_queries="\n".join(search_queries),
    context=context,
    # One JSON document per line for every action taken so far.
    previous_actions='\n'.join(json.dumps(action) for action in previous_actions),
)

In [37]:
answer_json = llm(prompt)
print(answer_json)

{
"action": "SEARCH",
"reasoning": "The question is about performing well on module 1, so we need to gather information about what module 1 entails and the requirements for success. Searching the FAQ database with relevant keywords will provide us with the necessary context to answer the student's question.",
"keywords": ["module 1 requirements", "module 1 study tips", "module 1 assessment criteria"]
}


In [38]:
answer = json.loads(answer_json)

In [39]:
previous_actions.append(answer)

In [40]:
keywords = answer['keywords']

In [41]:
for kw in keywords:
    search_queries.append(kw)
    sr = search(kw)
    search_results.extend(sr)

In [42]:
search_results = dedup(search_results)

In [43]:
# Next pass of the manual walkthrough: rebuild the prompt with the search
# results, queries and actions accumulated by the previous cells.
iteration_number = 2

context = build_context(search_results)

prompt = prompt_template.format(
    max_iterations=max_iterations,
    iteration_number=iteration_number,
    question=question,
    search_queries="\n".join(search_queries),
    context=context,
    # One JSON document per line for every action taken so far.
    previous_actions='\n'.join(json.dumps(action) for action in previous_actions),
)

In [44]:
answer_json = llm(prompt)

In [45]:
answer = json.loads(answer_json)
print(answer['answer'])

To do well on Module 1, it seems that Docker and Terraform are key topics. Based on the CONTEXT provided, it appears that students have encountered issues with psycopg2 and SQLAlchemy, and resolving these issues is crucial. Make sure to install the necessary packages, such as psycopg2-binary, and update your conda or pip as needed. Additionally, pay attention to the connection string when using create_engine. Unfortunately, without more specific information about the module's content and the student's current progress, it's challenging to provide a more tailored answer. However, it's essential to carefully follow the course instructions, and if issues arise, refer to the solutions provided in the CONTEXT or seek additional help.


In [46]:
question = "what do I need to do to be successful at the course?"

# Iteration budget announced to the model in the prompt. The loop below lets
# iteration numbers run from 0 up to and including this value.
max_iterations = 3

search_queries = []
search_results = []
previous_actions = []

iteration = 0

while True:
    print(f'ITERATION #{iteration}...')

    # Rebuild the prompt from everything gathered so far.
    context = build_context(search_results)
    prompt = prompt_template.format(
        question=question,
        context=context,
        search_queries="\n".join(search_queries),
        previous_actions='\n'.join(json.dumps(a) for a in previous_actions),
        max_iterations=max_iterations,
        iteration_number=iteration
    )

    print(prompt)

    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(json.dumps(answer, indent=2))

    previous_actions.append(answer)

    # Any action other than SEARCH is treated as the final answer.
    action = answer['action']
    if action != 'SEARCH':
        break

    # A SEARCH reply without "keywords" simply performs no searches this round.
    keywords = answer.get('keywords') or []

    for k in keywords:
        # Keep the queries in the order they were issued (a set union would
        # shuffle them from run to run) and skip repeats: the index has not
        # changed, so their results would be removed by dedup anyway.
        if k in search_queries:
            continue
        search_queries.append(k)
        res = search(k)
        search_results.extend(res)

    search_results = dedup(search_results)

    iteration = iteration + 1
    if iteration > max_iterations:
        break

    print()

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current 

In [47]:
answer

{'action': 'ANSWER',
 'answer': "To be successful at the course, it's essential to manage your time effectively, stay organized, and keep up with the coursework. Based on the provided context, it's recommended to register for the course before it starts, join the course Telegram channel, and participate in the 'Office Hours' live sessions. Additionally, you can use Git and GitHub to access the instructors' code and make pull requests. The course also provides a leaderboard to track your progress, and you can earn points by submitting homework, engaging in discussions, and completing learning activities. While the context doesn't provide a comprehensive guide to success, it's clear that being proactive, managing your time, and staying engaged with the course materials are crucial to achieving success.",
 'source': 'OWN_KNOWLEDGE'}

In [48]:
iteration

2

## Function calling ("tool use")

Groq tools: https://console.groq.com/docs/tool-use

In [41]:
def search(query):
    """Search the FAQ index and return the hits serialised as a JSON string.

    This is the variant exposed to the model as a tool, so the result is a
    string rather than a list.

    NOTE: this redefinition shadows the earlier `search`, which returned the
    raw result list. Cells that extend a list with `search(...)` results must
    run before this one.
    """
    hits = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
        output_ids=True,
    )
    return json.dumps(hits)

In [29]:
# Function-calling schema that describes `search` to the model:
# a single required string argument named "query".
search_tool = dict(
    type="function",
    function=dict(
        name="search",
        description="Search the FAQ database",
        parameters=dict(
            type="object",
            properties=dict(
                query=dict(
                    type="string",
                    description="Search query text to look up in the course FAQ.",
                ),
            ),
            required=["query"],
        ),
    ),
)

In [7]:
def do_call(tool_call):
    """Execute one tool call requested by the model and wrap the result as a chat message.

    Args:
        tool_call: A tool-call object from a chat-completion message; its
            `id`, `function.name` and `function.arguments` (a JSON string)
            are read.

    Returns:
        A dict with role "tool" that can be appended to the chat history.
        Its `content` is the tool's return value, or a JSON error object
        when the requested tool is not registered here.
    """
    function_name = tool_call.function.name
    # Guard against an empty or missing arguments payload by treating it as "{}".
    arguments = json.loads(tool_call.function.arguments or "{}")

    available_tools = {
        "search": search
    }

    function = available_tools.get(function_name)
    if function is None:
        # Report the problem back to the model instead of raising KeyError
        # and aborting the whole conversation loop.
        function_response = json.dumps({"error": f"Unknown tool: {function_name}"})
    else:
        function_response = function(**arguments)

    return {
        "tool_call_id": tool_call.id,
        "role": "tool",
        "name": function_name,
        "content": function_response,
    }

In [8]:
question = "How do I do well in module 1?"

# System prompt: asks the model to turn the student question into several
# FAQ queries whenever it decides to use the search function.
system_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries
using the search function.
""".strip()

# Tool schemas offered to the model on this request.
tools = [search_tool]

# Conversation so far: the system instructions followed by the student's question.
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

# First request. The reply may carry `tool_calls` instead of text content;
# the following cells read `response_message.tool_calls` and execute them.
response = client.chat.completions.create(
    model='llama-3.3-70b-versatile',
    messages=chat_messages,
    tools=tools,
    tool_choice="auto"
)
response_message = response.choices[0].message
response_message

ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='bhqx0qjga', function=Function(arguments='{"query":"module 1 tips"}', name='search'), type='function'), ChatCompletionMessageToolCall(id='ydqvn73mq', function=Function(arguments='{"query":"module 1 requirements"}', name='search'), type='function'), ChatCompletionMessageToolCall(id='ad5c4bfva', function=Function(arguments='{"query":"module 1 study guide"}', name='search'), type='function')])

In [193]:
tool_calls = response_message.tool_calls
tool_calls

[ChatCompletionMessageToolCall(id='391ts7kz2', function=Function(arguments='{"query":"tips for module 1 success"}', name='search'), type='function'),
 ChatCompletionMessageToolCall(id='3p3vyzb6v', function=Function(arguments='{"query":"module 1 learning strategies"}', name='search'), type='function'),
 ChatCompletionMessageToolCall(id='mey5mdz1j', function=Function(arguments='{"query":"common mistakes in module 1"}', name='search'), type='function'),
 ChatCompletionMessageToolCall(id='v331hwe76', function=Function(arguments='{"query":"module 1 study advice"}', name='search'), type='function')]

In [194]:
chat_messages.append(response_message)
for tool_call in tool_calls:
    function_response = do_call(tool_call)
    chat_messages.append(function_response)

In [195]:
chat_messages[3]

{'tool_call_id': '391ts7kz2',
 'role': 'tool',
 'name': 'search',
 'content': '[{"text": "Even after installing pyspark correctly on linux machine (VM ) as per course instructions, faced a module not found error in jupyter notebook .\\nThe solution which worked for me(use following in jupyter notebook) :\\n!pip install findspark\\nimport findspark\\nfindspark.init()\\nThereafter , import pyspark and create spark contex<<t as usual\\nNone of the solutions above worked for me till I ran !pip3 install pyspark instead !pip install pyspark.\\nFilter based on conditions based on multiple columns\\nfrom pyspark.sql.functions import col\\nnew_final.filter((new_final.a_zone==\\"Murray Hill\\") & (new_final.b_zone==\\"Midwood\\")).show()\\nKrishna Anand", "section": "Module 5: pyspark", "question": "Module Not Found Error in Jupyter Notebook .", "course": "data-engineering-zoomcamp", "_id": 322}, {"text": "You need to look for the Py4J file and note the version of the filename. Once you know the

In [196]:
second_response = client.chat.completions.create(
    model='llama-3.3-70b-versatile',
    messages=chat_messages
)
print(second_response.choices[0].message.content)

To do well in Module 1, it is essential to understand the basics of Docker and Terraform. Here are some tips to help you succeed:
1. **Familiarize yourself with Docker**: Make sure you have a good grasp of Docker basics, such as creating containers, running images, and managing volumes.
2. **Understand Terraform**: Study the fundamentals of Terraform, including how to create infrastructure as code, manage state, and use modules.
3. **Practice, practice, practice**: Practice deploying containers and managing infrastructure using Terraform.
4. **Join online communities**: Participate in online forums, such as the Data Engineering Zoomcamp Slack channel, to ask questions and learn from others.
5. **Review the course materials**: Go through the course notes, slides, and videos to ensure you understand the concepts covered in Module 1.
6. **Work on the assignments**: Complete the assignments and exercises provided in the course to gain hands-on experience with Docker and Terraform.
7. **See

In [218]:
system_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests (search function call).

When a final response is given, ask the user a follow up question based on your answer.
""".strip()

In [220]:
# Start a fresh conversation that holds only the system prompt.
chat_messages = [
    {"role": "system", "content": system_prompt},
]

# Outer loop: one pass per user question; typing "stop" ends the session.
while True: # main Q&A loop
    question = input() # How do I do my best for module 1?
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    # Inner loop: keep calling the API until the model replies without tool calls.
    while True: # request-response loop - query API till get a message
        response = client.chat.completions.create(
            model='llama-3.1-8b-instant',
            messages=chat_messages,
            tools=tools,
            tool_choice="auto"
        )

        # The assistant message is stored in the history whether it is a
        # final answer or a request for tool calls.
        message_response = response.choices[0].message
        chat_messages.append(message_response)
        
        tool_calls = message_response.tool_calls
        if not tool_calls:
            # No tool requested: show the answer and wait for the next question.
            print(message_response.content)
            break
        
        # Run every requested tool and append its result to the history;
        # the next pass of the inner loop sends those results to the model.
        for tool_call in tool_calls:
            print('function_call:', tool_call)
            print()
            function_response = do_call(tool_call)
            chat_messages.append(function_response)

function_call: ChatCompletionMessageToolCall(id='nqbr2912k', function=Function(arguments='{"query":"module 1 performance tips"}', name='search'), type='function')

Based on the results, it seems that the main advice for doing well in module 1 is to make sure you have the right environment set up, particularly with Docker and Terraform. Here are some specific tips:

1. Make sure you have the latest version of Docker installed and that you're using the right image for your project.
2. Use the `--platform` flag when building your Docker image to ensure that it's compatible with your system.
3. Use `pip install findspark` and `findspark.init()` to set up PySpark in your Jupyter notebook.
4. Use `!pip3 install pyspark` instead of `!pip install pyspark` if you're using a non-Windows OS.
5. Make sure you have the right version of `pytz` installed by adding `RUN python -m pip install --no-cache pytz` to your Dockerfile.
6. Update your `PYTHONPATH` environment variable to include the correct ve

## Multiple tools

In [225]:
!curl -O https://raw.githubusercontent.com/alexeygrigorev/rag-agents-workshop/refs/heads/main/chat_assistant.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3485  100  3485    0     0  13212      0 --:--:-- --:--:-- --:--:-- 13301


In [67]:
def add_entry(question, answer):
    """Append a user-supplied Q&A pair to the FAQ index.

    The entry is stored under the section 'user added' of the
    data-engineering-zoomcamp course.

    Returns:
        The new entry serialised as a JSON string.
    """
    entry = dict(
        question=question,
        text=answer,
        section='user added',
        course='data-engineering-zoomcamp',
    )
    index.append(entry)

    return json.dumps(entry)

In [61]:
# Function-calling schema that describes `add_entry` to the model:
# two required string arguments, "question" and "answer".
add_entry_description = dict(
    type="function",
    function=dict(
        name="add_entry",
        description="Add an entry to the FAQ database",
        parameters=dict(
            type="object",
            properties=dict(
                question=dict(
                    type="string",
                    description="The question to be added to the FAQ database",
                ),
                answer=dict(
                    type="string",
                    description="The answer to the question",
                ),
            ),
            required=["question", "answer"],
        ),
    ),
)

In [77]:
import importlib
import chat_assistant
importlib.reload(chat_assistant)

tools = chat_assistant.Tools()
tools.add_tool(search, search_tool)

In [78]:
tools.add_tool(add_entry, add_entry_description)

In [79]:
tools.get_tools()

[{'type': 'function',
  'function': {'name': 'search',
   'description': 'Search the FAQ database',
   'parameters': {'type': 'object',
    'properties': {'query': {'type': 'string',
      'description': 'Search query text to look up in the course FAQ.'}},
    'required': ['query']}}},
 {'type': 'function',
  'function': {'name': 'add_entry',
   'description': 'Add an entry to the FAQ database',
   'parameters': {'type': 'object',
    'properties': {'question': {'type': 'string',
      'description': 'The question to be added to the FAQ database'},
     'answer': {'type': 'string',
      'description': 'The answer to the question'}},
    'required': ['question', 'answer']}}}]

In [89]:
system_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ via search function if your own knowledge is not sufficient to answer the question.

When giving a final response, ask the user a follow up question based on your answer.
""".strip()

chat_interface = chat_assistant.ChatInterface()

chat = chat_assistant.ChatAssistant(
    tools=tools,
    developer_prompt=system_prompt,
    chat_interface=chat_interface,
    client=client
)

In [90]:
chat.run()

Chat ended.


In [91]:
index.docs[-1]

{'question': 'How do I install missing Python modules?',
 'text': 'To install missing Python modules, run pip install module_name. For example, to install psycopg2, run pip install psycopg2.',
 'section': 'user added',
 'course': 'data-engineering-zoomcamp'}