## Load Data

In [1]:
import json

with open('documents-with-ids.json', 'r') as f_in:
    documents = json.load(f_in)

In [2]:
documents[10]

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.',
 'section': 'General course-related questions',
 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?',
 'course': 'data-engineering-zoomcamp',
 'id': 'ea739c65'}

In [3]:
import pandas as pd

gt_csv_path = 'ground-truth-data.csv'

df_gt = pd.read_csv(gt_csv_path)
df_gt = df_gt[df_gt.course == 'machine-learning-zoomcamp']
ground_truth = df_gt.to_dict(orient='records')

In [4]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [36]:
doc_idx = {d['id']: d for d in documents}
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

## Index data

In [6]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from tqdm.auto import tqdm

vectors = []

for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    vector = model.encode(question + ' ' + text)
    vectors.append(vector)

100%|███████████████████████████████████████████████████| 948/948 [00:53<00:00, 17.87it/s]


In [8]:
import numpy as np

vectors = np.array(vectors)

In [13]:
from elasticsearch import Elasticsearch

es_url = 'http://127.0.0.1:9200/'

es_client = Elasticsearch(es_url)

In [15]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "id": {"type": "keyword"} ,
            "question_text_vector":{
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

index_name = 'course-questions-eval'

# es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

  es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-eval'})

In [18]:
import numpy as np
np.float_ = np.float64

In [19]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    doc['question_text_vector'] = model.encode(question + ' ' + text)

    es_client.index(index=index_name, document=doc)

100%|█| 948/948 [00:5


In [21]:
def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k":5,
        "num_candidates": 10000,
        "filter":{
            "term":{
                "course": course
            }
        }
    }

    search_query = {
        "knn":knn,
        "_source":["text", "section", "question", "course", "id"]
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

def question_text_vector_knn(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn('question_text_vector', v_q, course)

In [22]:
question_text_vector_knn(
    dict(
        question='Are sessions recorded if I miss one?',
        course='machine-learning-zoomcamp'
    )
)

  response = es_client.search(index=index_name, body=search_query)


[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

## RAG Flow

In [30]:
def build_prompt(query, search_results):
    prompt_template = """
        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
        Use only the facts from the CONTEXT when answering the QUESTION.
        
        QUESTION: {question}
        
        CONTEXT:
        {context}
        """
    
    context = ""

    for doc in search_results:
        context += f"section: {doc['section']}\nquestion:{doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

In [24]:
from google import genai
import os

GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]

client = genai.Client(api_key=GEMINI_API_KEY)

In [25]:
def llm(prompt):
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        # config=types.GenerateContentConfig(
            # system_instruction="You are a cat. Your name is Neko."),
        contents=prompt
    )
    return response.text

In [31]:
def rag(query):
    search_results = question_text_vector_knn(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt)

    return answer

In [33]:
rag(ground_truth[10])

  response = es_client.search(index=index_name, body=search_query)


'Yes, everything is recorded, so you won’t miss anything. The office hour sessions are recorded too. You can see the pre-recorded course videos, as well as office hours in the course playlist on YouTube.\n'

In [37]:
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

## Cosine similarity metric

In [38]:
answer_original = doc_idx['5170565b']['text']
v_original = model.encode(answer_original)

answer_llm = rag(ground_truth[10])
v_llm = model.encode(answer_llm)



  response = es_client.search(index=index_name, body=search_query)


In [39]:
v_llm.dot(v_original)

np.float32(0.6928885)