# Setting up the environment

In [23]:
import requests
import pandas as pd
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from openai import OpenAI
from dotenv import load_dotenv
import os
from tqdm.auto import tqdm
import json

# Load settings from a .env file into the process environment so that the
# os.getenv(...) lookups used by the getter functions in later cells can find them.
load_dotenv()

True

# Load documents

In [3]:
# Download the FAQ documents (a JSON list of dicts, each already carrying an `id`).
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [4]:
# Peek at the first record to see which fields a document carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [5]:
# Tabular view of the FAQ documents; used later to look up an answer text by `id`.
df_documents = pd.DataFrame(data=documents)
# Sanity check: exactly one row per downloaded document.
assert len(df_documents) == len(documents)
df_documents.head()

Unnamed: 0,text,section,question,course,id
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp,c02e79ef
1,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,data-engineering-zoomcamp,1f6520ca
2,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,data-engineering-zoomcamp,7842b56a
3,You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,data-engineering-zoomcamp,0bbf41ec
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp,63394d91


# Load ground truth

In [6]:
# Ground-truth evaluation set: each row holds a question, its course and the
# id of the document that answers it.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
gt_url = f'{base_url}/{relative_url}?raw=1'
df_gt = pd.read_csv(gt_url)
# Keep only the machine-learning-zoomcamp questions for the evaluation below.
df_gt_ml = df_gt[df_gt['course'] == 'machine-learning-zoomcamp']

In [7]:
# Already-loaded models keyed by model name, so repeated calls reuse the instance.
_TRANSFORMER_MODELS = {}


def get_transformer_model():
    """Return the SentenceTransformer named by the TRANSFORMER_MODEL_NAME env var.

    `rag` asks for the model on every call, so each model is constructed only
    once per name; later calls return the cached instance instead of loading
    it again.

    Returns:
        The SentenceTransformer instance for the configured model name.
    """
    model_name = os.getenv("TRANSFORMER_MODEL_NAME")
    if model_name not in _TRANSFORMER_MODELS:
        _TRANSFORMER_MODELS[model_name] = SentenceTransformer(model_name)
    return _TRANSFORMER_MODELS[model_name]


model = get_transformer_model()

In [9]:
es_client = Elasticsearch(os.getenv("ES_CLIENT_URL"))

# Field mappings: the FAQ fields plus one dense vector used for kNN search.
# NOTE(review): "dims" must equal the embedding size of the configured
# sentence-transformer model — confirm 384 matches TRANSFORMER_MODEL_NAME.
field_mappings = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
    "id": {"type": "keyword"},
    "question_text_vector": {
        "type": "dense_vector",
        "dims": 384,
        "index": True,
        "similarity": "cosine",
    },
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

# Drop any previous copy of the index first so the cell can be re-run safely.
index_name = "course-questions"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
# Embed "question + answer text" for every document and index it.
# The vector is stored on the original dict, so `documents` itself is mutated.
# tqdm shows progress for this long-running loop.
for doc in tqdm(documents, desc="Indexing documents"):
    text = doc['text']
    question = doc['question']
    doc['question_text_vector'] = model.encode(question+' '+text)

    es_client.index(index=index_name, document=doc)

In [11]:
# One-off kNN query against the freshly built index, restricted to one course.
field = "question_text_vector"
vector = model.encode("Can I still join after the course beginning?")
course = "machine-learning-zoomcamp"
knn = {
    "field": field,
    "query_vector": vector,
    "k": 5,
    "num_candidates": 10000,
    "filter": {
        "term": {
            "course": course
        }
    }
}

search_query = {
    "knn": knn,
    # Return only the FAQ fields, not the stored embedding.
    "_source": ["text", "section", "question", "course", "id"]
}

es_results = es_client.search(index=index_name, body=search_query)

# Only a cell's last expression is displayed, so the former bare
# `es_results['hits']['hits']` line was a no-op and has been dropped.
es_results['hits']['hits'][0]['_source']

{'question': 'The course has already started. Can I still join it?',
 'course': 'machine-learning-zoomcamp',
 'section': 'General course-related questions',
 'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'id': 'ee58a693'}

In [12]:
def get_retrieval_client():
    """Create an Elasticsearch client for the URL in the ES_CLIENT_URL env var."""
    es_url = os.getenv("ES_CLIENT_URL")
    return Elasticsearch(es_url)

def get_index_name():
    """Return the value of the INDEX_NAME env var, or None when it is unset."""
    return os.environ.get("INDEX_NAME")

def elastic_knn_search(field, vector, course, es_client, index_name, k=5, num_candidates=10000):
    """Run a course-filtered kNN search and return the matching documents.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against `field`.
        course: Only documents whose `course` equals this value are considered.
        es_client: Client object exposing `search(index=..., body=...)`.
        index_name: Index to search.
        k: Number of nearest neighbours to return (default 5).
        num_candidates: Number of candidates requested for the kNN search
            (default 10000).

    Returns:
        A list with the `_source` dict of every hit, in the order returned.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        # Return only the FAQ fields, not the stored embedding.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

def retrieval_elastic_search_vector_knn(query, course, model, es_client, index_name):
    """Embed `query` with `model` and kNN-search the question+text vector field.

    Returns the list of documents produced by `elastic_knn_search` for the
    given course.
    """
    query_embedding = model.encode(query)
    return elastic_knn_search(
        field='question_text_vector',
        vector=query_embedding,
        course=course,
        es_client=es_client,
        index_name=index_name,
    )

# Smoke-test the retrieval helper with the same question used in the ad-hoc query.
query = "Can I still join after the course beginning?"
course = "machine-learning-zoomcamp"
retrieval_elastic_search_vector_knn(query, course, model, es_client, index_name)

[{'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'id': 'ee58a693'},
 {'question': 'How long is the course?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Approximately 4 months, but may take more if you want to do some extra activities (an extra project, an article, etc)',
  'id': '67e2fd13'},
 {'question': 'I just joined. What should I do next? How can I access course materials?',
  'course': 'machine-learning-zoomcamp',
  'section': 'Gene

In [13]:
PROMPT_TEMPLATE = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()


def build_prompt(query, search_results, prompt_template=PROMPT_TEMPLATE):
    """Fill the prompt template with the user question and the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for `{question}`.
        search_results: Iterable of dicts with "text", "section" and "question" keys.
        prompt_template: Template containing `{question}` and `{context}` placeholders.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    chunks = []
    for hit in search_results:
        text, section, question = hit["text"], hit["section"], hit["question"]
        chunks.append(f"section: {section}\nquestion: {question}\nanswer: {text}\n\n")

    context = "".join(chunks)
    return prompt_template.format(question=query, context=context).strip()

def get_llm_client():
    """Create an OpenAI client from the BASE_LLM_URL and API_KEY env vars."""
    settings = {
        "base_url": os.getenv("BASE_LLM_URL"),
        "api_key": os.getenv("API_KEY"),
    }
    return OpenAI(**settings)

def get_llm_model_name():
    """Return the value of the LLM_MODEL_NAME env var, or None when it is unset."""
    return os.environ.get("LLM_MODEL_NAME")

def llm_answer(prompt, client, llm_model_name):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        client: Object exposing `chat.completions.create(model=..., messages=...)`.
        llm_model_name: Model identifier passed through to the client.

    Returns:
        The `message.content` of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=llm_model_name,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [14]:
def rag(question, course, model=None, retrieval_client=None, index_name=None,
        llm_client=None, llm_model_name=None):
    """Answer `question` for `course`: retrieve FAQ entries, build a prompt, ask the LLM.

    Every dependency can be injected so callers evaluating many questions can
    reuse an already loaded embedding model and existing clients instead of
    re-creating them on each call. Any dependency left as None is obtained
    from the corresponding `get_*` helper, exactly as before.

    Args:
        question: The user's question.
        course: Course identifier used to filter the retrieval.
        model: Embedding model exposing `encode`; defaults to `get_transformer_model()`.
        retrieval_client: Search client; defaults to `get_retrieval_client()`.
        index_name: Index to search; defaults to `get_index_name()`.
        llm_client: Chat-completions client; defaults to `get_llm_client()`.
        llm_model_name: LLM identifier; defaults to `get_llm_model_name()`.

    Returns:
        The text of the LLM's answer.
    """
    if retrieval_client is None:
        retrieval_client = get_retrieval_client()
    if index_name is None:
        index_name = get_index_name()
    if llm_client is None:
        llm_client = get_llm_client()
    if llm_model_name is None:
        llm_model_name = get_llm_model_name()
    if model is None:
        model = get_transformer_model()

    search_results = retrieval_elastic_search_vector_knn(question, course, model, retrieval_client, index_name)
    prompt = build_prompt(question, search_results)
    return llm_answer(prompt, llm_client, llm_model_name)

# Cosine similarity metric

In [15]:
# Pick one ground-truth example and compare the reference FAQ answer with the
# RAG-generated answer in embedding space.
gt_row = df_gt_ml.iloc[10]
course = gt_row['course']
id_doc = gt_row['document']
question = gt_row['question']
print("Question:")
print(question)
# The ground truth stores only the document id; fetch the matching answer text.
true = df_documents.query(f'id=="{id_doc}"')['text'].iloc[0]
print("True answer:")
print(true)
generated = rag(question, course)
print("Generated answer:")
print(generated)
# Normalise both embeddings to unit length so that their dot product is a
# genuine cosine similarity, regardless of whether the configured model
# normalises its output by itself.
true_vec = model.encode(true, normalize_embeddings=True)
gen_vec = model.encode(generated, normalize_embeddings=True)
similarity_metric = true_vec.dot(gen_vec)
print("Similatity: ", similarity_metric)


Question:
Are sessions recorded if I miss one?
True answer:
Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.
Generated answer:
Yes, sessions are recorded if you miss one. Both the pre-recorded course videos and the live office hours sessions are recorded and available in the course playlist on YouTube. You won't miss anything, and you can ask questions in advance for office hours or via Slack.
Similatity:  0.6372177


# LLM as judge

## Evaluating responses

In [26]:
# Judge prompt comparing the generated answer with the original (reference) answer.
# Placeholders: {answer_orig}, {question}, {answer_llm}. The doubled braces are
# literal braces that survive str.format, so the JSON skeleton reaches the LLM intact.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using json code blocks, I want only a clean
answer:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip() 

In [30]:
# Ask the judge LLM how well the generated answer matches the reference answer.
llm_client = get_llm_client()
llm_model_name = get_llm_model_name()
prompt = prompt1_template.format(
    answer_orig=true,
    question=question,
    answer_llm=generated,
)
relevance = llm_answer(prompt, llm_client, llm_model_name)
# The prompt requests bare JSON, so the reply is parsed directly.
relevance_dict = json.loads(relevance)

In [31]:
# Show the judge's verdict and its reasoning for the answer-vs-reference comparison.
print("LLM True answer to generated answer Relevance: ", relevance_dict['Relevance'])
print("LLM explanation: ", relevance_dict['Explanation'])

LLM True answer to generated answer Relevance:  RELEVANT
LLM explanation:  The generated answer accurately reflects the information in the original answer, confirming that sessions are recorded and mentioning the availability of recordings, the ability to ask questions in advance, and the option to ask questions via Slack. The additional detail about the course playlist on YouTube does not detract from the relevance.


## Evaluating relevance of the RAG answer given a question

In [32]:
# Judge prompt rating the generated answer against the question alone (no reference answer).
# Placeholders: {question}, {answer_llm}. The doubled braces are literal braces
# that survive str.format, so the JSON skeleton reaches the LLM intact.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using json code blocks, I want only a clean
answer:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [33]:
# Ask the judge LLM whether the generated answer addresses the question.
# prompt2_template has only {question} and {answer_llm} placeholders, so the
# reference answer is no longer passed in (it was silently ignored before).
prompt = prompt2_template.format(question=question, answer_llm=generated)
llm_client = get_llm_client()
llm_model_name = get_llm_model_name()
relevance = llm_answer(prompt, llm_client, llm_model_name)
# The prompt requests bare JSON, so the reply is parsed directly.
relevance_dict = json.loads(relevance)

In [34]:
# Show the judge's verdict and its reasoning for the answer-vs-question comparison.
print("LLM question to generated answer Relevance: ", relevance_dict['Relevance'])
print("LLM explanation: ", relevance_dict['Explanation'])

LLM question to generated answer Relevance:  RELEVANT
LLM explanation:  The generated answer directly addresses the question by confirming that sessions are recorded if missed, and provides additional relevant details about where and how to access the recordings, as well as alternative ways to ask questions.
