In [1]:
# Evaluating retrievel
    # hitrate
    # mrr

# Evaluation
    # offline evaluation
        # cosine similarity (original ans <--> llm ans)
        # LLM as a judge
    # online evaluation
        # A/B tests
        # user feedback
    # monitoring
        # overall health of the sys
        # how good the answer is

In [2]:
## Load documents with IDs

import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [3]:
documents[10]

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.',
 'section': 'General course-related questions',
 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?',
 'course': 'data-engineering-zoomcamp',
 'id': 'ea739c65'}

In [4]:
## Load ground truth

import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [5]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [6]:
doc_idx = {d['id']: d for d in documents}
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

In [7]:
# Index data

from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [8]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 
es_client.info()

ObjectApiResponse({'name': 'd71b13e1f0dc', 'cluster_name': 'docker-cluster', 'cluster_uuid': '0Va1pTxQQFqru9PeiEW4oA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    doc['question_text_vector'] = model.encode(question + ' ' + text)

    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [01:20<00:00, 11.75it/s]


In [61]:
# Retrieval

def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

def question_text_vector_knn(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn('question_text_vector', v_q, course)

In [62]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

In [63]:
## The RAG flow

def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [64]:
from groq import Groq
from dotenv import load_dotenv
import os

load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
client = Groq(api_key=groq_api_key)

def llm(prompt, model='llama3-8b-8192'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [65]:
# previously: rag(query: str) -> str
def rag(query: dict, model='llama3-8b-8192') -> str:
    search_results = question_text_vector_knn(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt, model=model)
    return answer

In [66]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [67]:
rag(ground_truth[10])

'Based on the context, the QUESTION is: Are sessions recorded if I miss one?\n\nAccording to the context, the answer is: Yes, everything is recorded, so you won’t miss anything.'

In [68]:
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

## Cosine similarity metric

In [69]:
answer_org = 'Yes, everything is recorded, so you won’t miss anything.'
answer_llm = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

v_llm = model.encode(answer_llm)
v_org = model.encode(answer_org)

v_llm.dot(v_org)

0.5877092

In [70]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [71]:
answers = {}

for i, rec in enumerate(tqdm(ground_truth)):
    if i in answers:
        continue

    answer_llm = rag(rec)
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_org = ['text']

    answers[i] = {
        'answer_llm': answer_llm,
        'answer_org': answer_org,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [1:22:30<00:00,  2.71s/it]


In [74]:
import pandas as pd

df_groq_llama3 = pd.DataFrame(answers.values())

In [76]:
df_groq_llama3.sample(n=5).to_dict(orient='records')

[{'answer_llm': 'Based on the CONTEXT, if you encounter the issue "type InterpreterWrapper is already registered", you should:\n\n* Restart the kernel\n* Import only tflite_runtime.interpreter: "import tflite_runtime.interpreter as tflite"\n\nThis solution is provided in section 9. Serverless Deep Learning.',
  'answer_org': ['text'],
  'document': 'da2f1cf4',
  'question': "What should I do if I get 'type InterpreterWrapper is already registered'?",
  'course': 'machine-learning-zoomcamp'},
 {'answer_llm': 'Based on the CONTEXT from the FAQ database, I can help you with your QUESTION.\n\nThe QUESTION is: Where can I find the option to set IPv6 to Manually on MacOS Ventura?\n\nThe ANSWER is: You can find the option to set IPv6 to Manually in System Settings > Network > your network connection > Details > Configure IPv6 > set to Manually > OK.',
  'answer_org': ['text'],
  'document': 'efdb235f',
  'question': 'Where can I find the option to set IPv6 to Manually on MacOS Ventura?',
  'c

In [None]:
df_groq_llama3 = [None] * len(ground_truth)

for i, val in answers.items():
    results_groq_llama3[i] = val.copy()
    results_groq_llama3[i].update(ground_truth[i])

In [None]:
!mkdir data
df_groq_llama3.to_csv('data/results-groql3', index=False)

## Evaluating with mixtral-8x7b-32768

In [25]:
rag(ground_truth[10], model="mixtral-8x7b-32768")

'Yes, sessions are recorded and you can watch them if you miss a session. This includes both the pre-recorded course videos and the live office hours sessions where questions are answered. However, please note that if you miss the midterm project, you can still receive a certificate, as mentioned in a previous answer.'

In [60]:
# speeding things up --script--
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=1)

def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results

ValueError: max_workers must be greater than 0

In [57]:
def process_record(rec):

    model =  "mixtral-8x7b-32768"
    answer_llm = rag(rec, model=model)
    
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

In [58]:
process_record(ground_truth[10])

{'answer_llm': 'Yes, all sessions are recorded and will be available for you to watch if you miss one. This includes both the pre-recorded course videos and the live office hours sessions where questions are answered. Additionally, if you have questions that you would like to be addressed during the live stream, you can ask them in advance through Slack or during office hours.',
 'answer_orig': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
 'document': '5170565b',
 'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp'}

In [59]:
groq_mixtral = map_progress(pool, ground_truth, process_record)

df_groq_mixtral = pd.DataFrame(groq_mixtral.values())

df_groq_mixtral = [None] * len(ground_truth)

for i, val in answers.items():
    results_groq__mixtral[i] = val.copy()
    results_groq__mixtral[i].update(ground_truth[i])

df_groq_mixtral.to_csv('data/results-groqmxt', index=False)

  0%|                                                                                                                 | 1/1830 [00:16<8:26:42, 16.62s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `mixtral-8x7b-32768` in organization `org_01htv0tzqfe4qv39xgh74hej67` on tokens per minute (TPM): Limit 5000, Used 4749, Requested 666. Please try again in 4.978s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

## Cosine similarity

In [None]:
results_groql3 = results
record = results[2]
record

In [None]:
def compute_similarity(record):
    answer_org = record['answer_org']
    answer_llm = record['answer_llm']
    
    v_llm = model.encode(answer_llm)
    v_org = model.encode(answer_org)
    
    return v_llm.dot(v_org)
    

In [None]:
similarity = []

for record in tqdm(results_groql3):
    sim = compute_similarity(record)
    similarity.append(sim)

In [None]:
df_groq_llama3['cosine'] = similarity

In [None]:
import seaborn as sns

## LLM as a judge

In [None]:
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [None]:
df_sample = df_groq_llama3.sample(n=150, random_state=1)
sample = df_sample.to_dict(orient='record')

In [None]:
record = sample[0]
record

In [None]:
prompt = prompt1_template.format(**record)

In [None]:
answer = llm(prompt, model='llama3-8b-8192')

In [None]:
import json
json.loads(answer)

In [None]:
answer = []
for record in tqdm(samples):
    prompt = prompt1_template.format(**record)
    answer=llm(prompt, model='llama3-8b-8192')
    answer.append(answer)

In [None]:
json_answers = []
for i, str_answer in enumerate(answer):
    json_answer = json.loads(str_answer)
    json_answer.append(json_answer)

In [None]:
df_evaluations = pd.DataFrame(json_answers)
df_evaluations

In [None]:
df_evaluations.Relevance.value_counts()

In [None]:
df_evaluations[df_evaluations.Relevance == 'NON_RELEVANT'] #.to_dict(orient='records')

In [None]:
answer2 = []
for record in tqdm(samples):
    prompt = prompt2_template.format(**record)
    answer=llm(prompt, model='mixtral-8x7b-32768')
    answer2.append(answer)

json_answers2 = []
for i, str_answer in enumerate(answer):
    json_answer = json.loads(str_answer)
    json_answers2.append(json_answer)

df_evaluations2 = pd.DataFrame(json_answers)
df_evaluations2

df_evaluations2.Relevance.value_counts()

df_evaluations2[df_evaluations2.Relevance == 'NON_RELEVANT'] #.to_dict(orient='records')