In [1]:
import requests

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Bounded wait so a stalled connection cannot hang the kernel indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Surface HTTP errors here instead of as an opaque JSON decode error below.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of FAQ records, tagging each
# record with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
# Spot-check one flattened record; it should carry the added 'course' key.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
import sys
from pathlib import Path

# Put the sibling "01-intro" directory (one level above the working directory)
# on the import path before importing minsearch.
# NOTE(review): presumably minsearch.py lives in 01-intro — confirm.
project_root = Path.cwd().parent
sys.path.append(str(project_root / "01-intro"))
import minsearch 


# "question", "text" and "section" are registered as text fields; "course" is
# registered as a keyword field and is what search() passes in filter_dict.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Build the index over the flattened FAQ records.
index.fit(documents)

<minsearch.Index at 0x718e284978d0>

In [16]:
import google.generativeai as genai
from pathlib import Path
from dotenv import load_dotenv
import os

# The .env file is read from the sibling "01-intro" directory, one level
# above the current working directory.
current_dir = Path.cwd() 
env_path = current_dir.parent / "01-intro" / ".env"

load_dotenv(env_path)
# The API key comes from the GEMINI_API_KEY environment variable; os.getenv
# returns None when it is unset, so a missing key is not detected here.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Shared Gemini model handle used by llm().
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash"
)

In [17]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a keyword search over the FAQ index.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must match. Defaults to
            the data-engineering course, as before.
        num_results: Maximum number of records to return. Defaults to 5.

    Returns:
        Whatever ``index.search`` returns for the query (the matching FAQ
        records).
    """
    # Weight matches in the question field above matches in the section field.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [18]:
def build_prompt(query, search_results):
    """Render the LLM prompt for a question and its retrieved FAQ records.

    Args:
        query: The user's question, substituted into the QUESTION slot.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each becomes one entry in the CONTEXT slot.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    template = (
        "You're a course teaching assistant. "
        "Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One blank-line-terminated entry per retrieved record.
    entries = (
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )
    rendered_context = "".join(entries)

    return template.format(question=query, context=rendered_context).strip()

In [19]:
def llm(prompt):
    """Send a prompt to the Gemini model and return its answer as text.

    Args:
        prompt: The full prompt string to send.

    Returns:
        The generated text, or a human-readable message when the prompt was
        blocked, nothing was generated, or the API call failed. This function
        never raises; failures are reported in the returned string.
    """
    try:
        response = model.generate_content(prompt)

        # Check the block reason BEFORE touching `response.text`: the `.text`
        # accessor raises when the response carries no usable parts (e.g. a
        # blocked prompt), which would otherwise skip this branch and fall
        # through to the generic error message below.
        block_reason = response.prompt_feedback.block_reason
        if block_reason:
            return f"Gemini blocked response. Reason: {block_reason.name}"

        if response.text:
            return response.text
        return "No response generated"

    except Exception as e:
        # Deliberately broad: callers expect a string, not an exception.
        return f"Error calling Gemini API: {str(e)}"

In [20]:
def rag(query):
    """Answer a question using keyword search for retrieval.

    Retrieves records with search(), renders them into a prompt with
    build_prompt(), and returns the string produced by llm().
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [21]:
# Run the keyword-search RAG pipeline end to end on a sample question.
rag('how do I run kafka?')

'To run Kafka, there are different methods depending on the language you are using.\n\nFor Java Kafka, in the project directory, run: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`\n\nFor Python Kafka, create a virtual environment and run requirements.txt in that environment. The steps are:\n1.  `python -m venv env`\n2.  `source env/bin/activate`\n3.  `pip install -r ../requirements.txt`\n\nTo activate the virtual environment, run: `source env/bin/activate`. To deactivate it, run: `deactivate`. Note that for Windows, the path is `env/Scripts/activate`. The virtual environment should be created only to run the python file, and Docker images should first all be up and running.\n\nIf you encounter a "ModuleNotFoundError: No module named \'kafka.vendor.six.moves\'" error, use `pip install kafka-python-ng` instead.\n'

In [22]:
# Second sample question for the keyword-search RAG pipeline.
rag('the course has already started, can I still enroll?')

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects, so don't leave everything for the last minute.\n"

## **RAG with Vector Search**

In [28]:
from qdrant_client import QdrantClient, models

# Connects to a Qdrant server expected at localhost:6333.
qd_client = QdrantClient("http://localhost:6333")
# NOTE(review): presumably the output vector size of `model_handle` — confirm
# against the embedding model's documentation.
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"
collection_name = "zoomcamp-faq"

# Remove the collection first so the create_collection call that follows
# starts from a clean state.
qd_client.delete_collection(collection_name=collection_name)

True

In [29]:
# Create the collection with vectors of EMBEDDING_DIMENSIONALITY dimensions,
# compared by cosine distance.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [30]:
# Index the "course" payload field as a keyword; vector_search() filters on it.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [31]:
# One point per FAQ record: the id is the record's position in `documents`,
# the vector is a Document built from the question followed by the answer
# text, and the full record is stored as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=record['question'] + ' ' + record['text'],
            model=model_handle,
        ),
        payload=record,
    )
    for position, record in enumerate(documents)
]

In [32]:
# Write all points to the collection in a single upsert call.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [33]:
# Sample question text.
# NOTE(review): not referenced by any visible cell — confirm it is still used.
question = 'I just discovered the course. Can I still join it?'

In [34]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Retrieve FAQ records from Qdrant by vector similarity.

    Args:
        question: Query text, passed to Qdrant as a Document for
            ``model_handle``.
        course: Value the ``course`` payload field must match. Defaults to
            the data-engineering course, as before.
        limit: Maximum number of points to return. Defaults to 5.

    Returns:
        A list with the payload of each returned point, in the order Qdrant
        returned them.
    """
    # Kept so the notebook output shows which retrieval path rag() is using.
    print('vector_search is used')

    response = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle,
        ),
        # Restrict the search to points belonging to the requested course.
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course),
                )
            ]
        ),
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in response.points]

In [35]:
def rag(query):
    """Answer a question using vector search for retrieval.

    Redefines the earlier keyword-search rag() in this notebook: retrieval
    now goes through vector_search(), while build_prompt() and llm() are
    reused unchanged.
    """
    hits = vector_search(query)
    return llm(build_prompt(query, hits))

In [36]:
# Run the vector-search RAG pipeline end to end on a sample question.
rag('how do I run kafka?')

vector_search is used


'To run kafka, you can use the following methods:\n\n*   **Java:** In the project directory, run: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`\n*   **Docker:** If you encounter a `kafka.errors.NoBrokersAvailable` error, your Kafka broker Docker container might not be working. Use `docker ps` to confirm and then run `docker compose up -d` in the Docker Compose YAML file folder to start all instances.\n*   **Python:** If you get a "Module \'kafka\' not found" error, create a virtual environment, activate it, and install the requirements.\n    ```\n    python -m venv env\n    source env/bin/activate\n    pip install -r ../requirements.txt\n    ```\n    To activate it (you\'ll need to run it every time you need the virtual env):\n    ```\n    source env/bin/activate\n    ```\n    To deactivate it:\n    ```\n    deactivate\n    ```\n    Note: For Windows, the path is slightly different (it\'s `env/Scripts/activate`). Also the virtual envi