## References



Hybrid

https://www.elastic.co/search-labs/blog/improving-information-retrieval-elastic-stack-hybrid

https://lazycoder.ro/posts/elasticsearch-hybrid-semantic-search-embeddings/


# Check Google Colab

In [1]:
# Detect Google Colab: the `google.colab` package is only importable there.
try:
  import google.colab
  from google.colab import drive
except ImportError:
  # Package missing -> we are not in Colab; use the local directory layout.
  IN_COLAB = False
else:
  # Mount Drive only once Colab is confirmed, so that a mount failure surfaces
  # as an error instead of being silently mistaken for "not in Colab".
  drive.mount('/content/drive')
  IN_COLAB = True
print(f"am I in Colab? {IN_COLAB}")

am I in Colab? False


# Import

In [2]:
import os
import json
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm.auto import tqdm
import re
from time import time

# Elasticsearch
from elasticsearch import Elasticsearch
from sklearn.model_selection import train_test_split

# Model
#from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
#from langchain_core.embeddings import Embeddings


from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import TokenTextSplitter


from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain_ollama import OllamaLLM
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Model
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.embeddings import Embeddings

# Settings

In [3]:
# Environment roots: Colab keeps backups on the mounted Drive, a local
# checkout keeps them one level above the notebooks directory.
BASE_DIR = "." if IN_COLAB else ".."
BACKUPS_DATA_DIR = (
  "/content/drive/MyDrive/Colab Notebooks/Dataclub/llm/data" if IN_COLAB else "../backups"
)

# Raw data (downloaded documents and playlist metadata)
RAW_DATA_DIR = f"{BASE_DIR}/data/raw"
RAW_DOCS_DATA_DIR = f"{RAW_DATA_DIR}/documents"
RAW_INFO_DATA_DIR = f"{RAW_DATA_DIR}/info"

# Preprocessed documents
PROCESSED_DATA_DIR = f"{BASE_DIR}/data/processed"
PROCESSED_DOCS_DATA_DIR = f"{PROCESSED_DATA_DIR}/documents"

# Documents prepared for indexing
INDEXING_DATA_DIR = f"{BASE_DIR}/data/indexing"
INDEXING_DOCS_DATA_DIR = f"{INDEXING_DATA_DIR}/documents"

# Test data and ground truth
TEST_DATA_DIR = f"{BASE_DIR}/data/test"
GROUND_TRUTH_DATA_DIR = f"{TEST_DATA_DIR}/ground_truth"
GROUND_TRUTH_DOCS_DATA_DIR = f"{GROUND_TRUTH_DATA_DIR}/documents"
GROUND_TRUTH_GEN_DATA_DIR = f"{GROUND_TRUTH_DATA_DIR}/generated"

# Evaluation results (retriever and RAG)
EVAL_DATA_DIR = f"{BASE_DIR}/data/evaluation"
EVAL_RETRIEVER_DATA_DIR = f"{EVAL_DATA_DIR}/retriever"
EVAL_RAG_DATA_DIR = f"{EVAL_DATA_DIR}/rag"

# Prompt templates shipped with the package
PROMPTS_CONFIG_DIR = f"{BASE_DIR}/cooking_recipe_assistant/config/prompts"

# Raw playlist metadata files
PLAYLIST_INFO_PATH = f"{RAW_INFO_DATA_DIR}/playlist_info.pkl"
VIDEO_PLAYLIST_MAP_PATH = f"{RAW_INFO_DATA_DIR}/video_playlist_map.pkl"

# Ground-truth questions used for retrieval evaluation
GROUND_TRUTH_PATH = f"{GROUND_TRUTH_DATA_DIR}/ground-truth-retrieval.csv"

# Results of the retriever hyper-parameter optimization, one file per search type
REST_OPT_ES_BM25_PATH = f"{EVAL_RETRIEVER_DATA_DIR}/res-opt-es-bm25.json"
REST_OPT_ES_HYBRID_PATH = f"{EVAL_RETRIEVER_DATA_DIR}/res-opt-es-hybrid.json"
REST_OPT_ES_HYBRID_RRF_PATH = f"{EVAL_RETRIEVER_DATA_DIR}/res-opt-es-hybrid-rrf.json"

# Warn when the raw data root is missing, then create the working directories
# (creating RAW_DOCS_DATA_DIR also creates the raw root as a side effect).
if not os.path.exists(RAW_DATA_DIR):
  print("Not exists dir: ", RAW_DATA_DIR)
os.makedirs(RAW_DOCS_DATA_DIR, exist_ok=True)
os.makedirs(RAW_INFO_DATA_DIR, exist_ok=True)
os.makedirs(PROCESSED_DOCS_DATA_DIR, exist_ok=True)
os.makedirs(TEST_DATA_DIR, exist_ok=True)
os.makedirs(EVAL_DATA_DIR, exist_ok=True)
os.makedirs(BACKUPS_DATA_DIR, exist_ok=True)

In [4]:
%load_ext autoreload
%autoreload 2
import os
import sys

# Make the project root importable; append it only if it is not already on sys.path
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# The project package can now be imported
import cooking_recipe_assistant
from cooking_recipe_assistant.commons.utils import (
    read_pickle, 
    read_document, 
    read_text,
    save_pickle, 
    save_document,
    save_text
)

from cooking_recipe_assistant.evaluation.retrievers import evaluate
from cooking_recipe_assistant.evaluation.optimization import run_hyperopt
from cooking_recipe_assistant.rags.retrievers.es_bm25 import es_bm25_query
from cooking_recipe_assistant.rags.retrievers.es_hybrid import es_hybrid_query
from cooking_recipe_assistant.rags.retrievers.es_hybrid_rrf import es_hybrid_rrf_query

In [5]:
ES_URL = "http://localhost:9200"
INDEX_NAME = "cooking-recipes"

In [6]:
ES_CLIENT = Elasticsearch(hosts=[ES_URL])

In [7]:
# NOTE(review): presumably set to silence the HuggingFace tokenizers fork warning — confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Name of the sentence-embedding model handed to HuggingFaceEmbeddings
MINILM_EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
#NMNET_EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
# NOTE(review): BATCH_SIZE is not used in the visible part of the notebook — confirm it is still needed
BATCH_SIZE = 100

In [8]:
EMBEDDINGS = HuggingFaceEmbeddings(model_name=MINILM_EMBEDDING_MODEL_NAME)
#EMBEDDINGS = HuggingFaceEmbeddings(model_name=NMNET_EMBEDDING_MODEL_NAME)

In [9]:
# LLM
OLLAMA_URL = 'http://localhost:11434'
PLAYLIST_TITLE = 'Imperial Stout'
# NOTE(review): MODEL_NAME is assigned twice; the second assignment wins, so the
# effective value is 'llama3'. Keep only the model you intend to use.
MODEL_NAME = 'gpt-4o-mini'
MODEL_NAME = 'llama3'

# Templates

In [10]:
!ls -lg "{PROMPTS_CONFIG_DIR}"

total 68
-rw-rw-r-- 1 aztleclan  109 oct 27 01:30 en_entry_template.txt
-rw-rw-r-- 1 aztleclan 1380 oct 18 17:24 en_prompt_template_blocks.txt
-rw-rw-r-- 1 aztleclan  831 oct 26 14:11 en_prompt_template_eval_rag_v1.txt
-rw-rw-r-- 1 aztleclan  643 oct 25 17:50 en_prompt_template_eval_rag_v2.txt
-rw-rw-r-- 1 aztleclan 1552 oct 18 20:04 en_prompt_template_extractions.txt
-rw-rw-r-- 1 aztleclan 1612 oct 25 15:44 en_prompt_template_ground_truth_v1.txt
-rw-rw-r-- 1 aztleclan 1534 oct 25 15:47 en_prompt_template_ground_truth_v2.txt
-rw-rw-r-- 1 aztleclan 1175 oct 23 18:16 en_prompt_template_questions.txt
-rw-rw-r-- 1 aztleclan  206 oct 24 16:33 en_prompt_template_rag_v1.txt
-rw-rw-r-- 1 aztleclan  206 oct 26 15:25 en_prompt_template_rag_v2.txt
-rw-rw-r-- 1 aztleclan  346 oct 25 02:00 en_prompt_template_system_assistent.txt
-rw-rw-r-- 1 aztleclan 1529 oct 18 17:24 es_prompt_template_blocks.txt
-rw-rw-r-- 1 aztleclan 1688 oct 18 23:07 es_prompt_template_extractions_v1.txt
-rw-rw-r-- 1 aztleclan

In [11]:
ENTRY_TEMPLATE = read_text(f"{PROMPTS_CONFIG_DIR}/en_entry_template.txt")
print(ENTRY_TEMPLATE)

meals: {meals}
title: {title}
ingredients: {ingredients}
summary: {summary}
instructions: {text}
tips: {tips}


In [12]:
TEMPLATE_RAG_V1 = read_text(f"{PROMPTS_CONFIG_DIR}/en_prompt_template_rag_v1.txt")
print(TEMPLATE_RAG_V1)

You are a cooking recipe asistente. Answer the QUESTION based on the CONTEXT of our recipe database. 
Use only the data in the CONTEXT when answering the QUESTION.

CONTEXT: 
{context}

QUESTION: {question}


In [13]:
TEMPLATE_RAG_V2 = read_text(f"{PROMPTS_CONFIG_DIR}/en_prompt_template_rag_v2.txt")
print(TEMPLATE_RAG_V2)

You are a cooking recipe asistente. Answer the QUESTION based on the CONTEXT of our recipe database. 
Use only the data in the CONTEXT when answering the QUESTION.

CONTEXT: 
{context}

QUESTION: {question}


In [14]:
TEMPLATE_LLM_JUDGE_V1 = read_text(f"{PROMPTS_CONFIG_DIR}/en_prompt_template_eval_rag_v1.txt")
print(TEMPLATE_LLM_JUDGE_V1)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}

Generated Answer: {answer_llm}


Please analyze the content and context of the generated answer in relation to the question. Provide the result in parsable JSON format without using code blocks. Do not generate solutions with additional text or json-style comments. Make sure the JSON is well-formed and has closing braces and brackets.
Please follow the following format strictly:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}


In [15]:
TEMPLATE_LLM_JUDGE_V2 = read_text(f"{PROMPTS_CONFIG_DIR}/en_prompt_template_eval_rag_v2.txt")
print(TEMPLATE_LLM_JUDGE_V2)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}


# Check ElasticSearch

In [16]:
# Fetch the cluster info and pretty-print it to confirm Elasticsearch is reachable
info_es = ES_CLIENT.info()
print(json.dumps(info_es.body, indent=4))

{
    "name": "1ce7214a22c5",
    "cluster_name": "docker-cluster",
    "cluster_uuid": "wg43N1DqSdy9g9z_pOLIDQ",
    "version": {
        "number": "8.4.3",
        "build_flavor": "default",
        "build_type": "docker",
        "build_hash": "42f05b9372a9a4a470db3b52817899b99a76ee73",
        "build_date": "2022-10-04T07:17:24.662462378Z",
        "build_snapshot": false,
        "lucene_version": "9.3.0",
        "minimum_wire_compatibility_version": "7.17.0",
        "minimum_index_compatibility_version": "7.0.0"
    },
    "tagline": "You Know, for Search"
}


In [17]:
# Pretty-print the index definition, but only when the index already exists.
# NOTE(review): the recorded output lists a `summart_vector` float field next to
# `summary_vector` (dense_vector); this looks like a misspelled field name that
# was dynamically mapped at indexing time — confirm and clean up the index.
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    info_indice = ES_CLIENT.indices.get(index=INDEX_NAME)
    print(json.dumps(info_indice.body, indent=4))

{
    "cooking-recipes": {
        "aliases": {},
        "mappings": {
            "properties": {
                "chunk_number": {
                    "type": "integer"
                },
                "doc_id": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "id": {
                    "type": "text"
                },
                "ingredients": {
                    "type": "keyword"
                },
                "meals": {
                    "type": "keyword"
                },
                "summart_vector": {
                    "type": "float"
                },
                "summary": {
                    "type": "text"
                },
                "summary_vector": {
                    "type": "dense_vector",
        

In [18]:
# Pretty-print the index settings, but only when the index already exists
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    settings = ES_CLIENT.indices.get_settings(index=INDEX_NAME)
    print(json.dumps(settings.body, indent=4))

{
    "cooking-recipes": {
        "settings": {
            "index": {
                "routing": {
                    "allocation": {
                        "include": {
                            "_tier_preference": "data_content"
                        }
                    }
                },
                "number_of_shards": "1",
                "provided_name": "cooking-recipes",
                "creation_date": "1729698947472",
                "number_of_replicas": "0",
                "uuid": "xRPfDileSMin8UmdVwEoOQ",
                "version": {
                    "created": "8040399"
                }
            }
        }
    }
}


In [19]:
# Report how many documents the index holds, but only when the index already exists
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    count = ES_CLIENT.count(index=INDEX_NAME)['count']
    print(f"Document Num:{count}")

Document Num:232


# Check Embedding

In [20]:
# Smoke test for the embedding model: embed one sentence and inspect the result
text = "LangChain is a framework for developing applications powered by language models."


embedding_vector = EMBEDDINGS.embed_query(text)
# Show the container type and vector length, then the first ten components
print(type(embedding_vector), len(embedding_vector))
print(embedding_vector[:10])

<class 'list'> 384
[-0.03306527063250542, -0.04929625988006592, 0.0011788202682510018, -0.052408862859010696, -0.037587061524391174, 0.025819718837738037, -0.03928518667817116, 0.05620156601071358, 0.0902889296412468, -0.052350059151649475]


# Code

In [21]:
def parse_evaluation(
    questions_str: str
):
    """Extract the judge verdict from the raw text returned by the LLM-as-judge.

    The text is expected to contain a JSON-like object with a "Relevance" field
    followed by an "Explanation" field. `//` comments and newlines are stripped
    first, then both fields are pulled out with a regular expression so that
    slightly malformed JSON (missing braces, trailing text) is still accepted.

    Args:
        questions_str: Raw judge output.

    Returns:
        A dict `{"relevance": ..., "explanation": ...}` holding the raw field
        contents (escape sequences are kept as written), or None when the two
        fields cannot be found or the input cannot be processed.
    """
    formatted_json = None
    try:
        # Drop `//` comments up to the end of their line, then flatten to one line
        clean_text = re.sub(r'//.*', '', questions_str)
        clean_text = clean_text.replace('\n', '')
        # A string body is any run of non-quote characters or backslash escapes,
        # so an escaped quote (\") inside the explanation no longer ends the match.
        string_body = r'((?:[^"\\]|\\.)*)'
        pattern = re.compile(
            r'"Relevance"\s*:\s*"' + string_body + r'"\s*,?\s*'
            r'"Explanation"\s*:\s*"' + string_body + r'"'
        )
        match = pattern.search(clean_text)
        if match:
            relevance, explanation = match.groups()
            formatted_json = {
                "relevance": relevance,
                "explanation": explanation
            }
    except Exception:
        # Best effort: show the offending text and report "not parsed" via None
        print(questions_str)
        print("="*100)

    return formatted_json

In [22]:
def build_evaluations(
    ground_truth_path: str,
    gen_data_dir: str
):
    """Collect generated answers and judge verdicts into evaluation CSV files.

    Walks `<gen_data_dir>/rag` for `*.txt` answers, pairs each one with the file
    of the same name under `<gen_data_dir>/llm-as-judge`, parses the verdict and
    joins the result with the ground-truth questions on
    (doc_id, chunk_number, number). File names are expected to look like
    `<doc_id>@<chunk_number>@<question_number>.txt`.

    Writes `<gen_data_dir>/rag-evaluation.csv` and, when `*_stats.json` files
    exist for both the answer and the verdict, `<gen_data_dir>/rag-evaluation-stats.csv`.
    Token and cost fields are optional in the stats files: a missing cost counts
    as 0 and missing token columns are left empty in the stats CSV.

    Args:
        ground_truth_path: CSV with the ground-truth questions.
        gen_data_dir: Directory holding the `rag` and `llm-as-judge` sub-directories.
    """
    print(f"[BUILD-EVALUATIONS] ground_truth_path : {ground_truth_path}")
    print(f"[BUILD-EVALUATIONS] gen_data_dir      : {gen_data_dir}")
    generated_rag_dir = os.path.join(gen_data_dir, 'rag')
    generated_llm_as_judge_dir = os.path.join(gen_data_dir, 'llm-as-judge')
    join_keys = ['doc_id', 'chunk_number', 'number']

    stats = []
    results = []
    all_docs_with_errors = []
    for root, _, files in os.walk(generated_rag_dir):
        for filename in tqdm(files):
            # Only answer files; this also skips the *_stats.json companions
            if not filename.endswith(".txt"):
                continue

            # Doc id
            es_doc_id = os.path.splitext(filename)[0]

            # Read generated answer
            generated_answer_path = os.path.join(root, filename)
            generated_answer_str = read_text(generated_answer_path)

            # Read generated judge (answers without a verdict are skipped)
            generated_judge_path = os.path.join(generated_llm_as_judge_dir, filename)
            if not os.path.exists(generated_judge_path):
                continue
            generated_judge_str = read_text(generated_judge_path)

            # Parse generated judge
            generated_judge_json = parse_evaluation(generated_judge_str)
            if generated_judge_json is None:
                all_docs_with_errors.append(es_doc_id)
                continue

            # Add result
            [doc_id, chunk_number, q_number] = es_doc_id.split('@')
            generated_judge_json['doc_id'] = doc_id
            generated_judge_json['chunk_number'] = int(chunk_number)
            generated_judge_json['number'] = int(q_number)
            generated_judge_json['answer'] = generated_answer_str
            results.append(generated_judge_json)

            # Read stats (only when both the answer and the verdict have them)
            stats_filename = f"{es_doc_id}_stats.json"
            gen_answer_stats_path = os.path.join(root, stats_filename)
            gen_judge_stats_path = os.path.join(generated_llm_as_judge_dir, stats_filename)
            if os.path.exists(gen_answer_stats_path) and os.path.exists(gen_judge_stats_path):
                gen_answer_stats = read_document(gen_answer_stats_path)
                gen_judge_stats = read_document(gen_judge_stats_path)
                answer_data = gen_answer_stats | gen_judge_stats
                answer_data['doc_id'] = doc_id
                answer_data['chunk_number'] = int(chunk_number)
                answer_data['number'] = int(q_number)
                # Cost fields are only written for OpenAI models, so default to 0
                answer_data["openai_cost"] = (
                    answer_data.get("total_cost", 0) + answer_data.pop("eval_total_cost", 0)
                )
                # Total latency = answer generation + judge evaluation
                answer_data["response_time"] = (
                    answer_data.get("response_time", 0) + answer_data.pop("eval_response_time", 0)
                )
                stats.append(answer_data)

    # Generate results
    print(f"Generated questions with errors: {len(all_docs_with_errors)}")
    if not results:
        # An empty frame has no join columns, so the merge below would fail
        print("No parsed evaluations found; nothing to save.")
        return
    judge_df = pd.DataFrame(results)

    # Join the verdicts with the ground-truth questions
    questions_df = pd.read_csv(ground_truth_path)
    evaluations_df = pd.merge(questions_df, judge_df, on=join_keys, how='inner')

    # Save results
    evaluation_path = os.path.join(gen_data_dir, 'rag-evaluation.csv')
    evaluations_df = evaluations_df[[
        'id', 'doc_id', 'chunk_number', 'number',
        'answer', 'question', 'relevance', 'explanation'
    ]]
    evaluations_df.to_csv(evaluation_path, index=False)

    if len(stats) > 0:
        stats_df = pd.DataFrame(stats)
        print(f"stats: {len(stats_df)}")
        eval_stats_df = pd.merge(stats_df, evaluations_df, on=join_keys, how='inner')
        eval_stats_df = eval_stats_df.rename(columns={'explanation': 'relevance_explanation'})
        # reindex (not [[...]]) so token columns absent for non-OpenAI models
        # become empty columns instead of raising KeyError
        eval_stats_df = eval_stats_df.reindex(columns=[
            "question", "answer", "model_used", "response_time", "relevance", "relevance_explanation",
            "prompt_tokens", "completion_tokens", "total_tokens",
            "eval_prompt_tokens", "eval_completion_tokens", "eval_total_tokens",
            "openai_cost"
        ])
        print(f"eval_stats: {len(eval_stats_df)}")
        # Save results
        eval_stats_path = os.path.join(gen_data_dir, 'rag-evaluation-stats.csv')
        eval_stats_df.to_csv(eval_stats_path, index=False)

In [23]:
def build_recipe_context(search_results, entry_template):
    """Render search hits into a single context string for the prompt.

    For every hit the list fields are flattened to text: `meals` and
    `ingredients` are comma-joined, `tips` are space-joined with a trailing
    period added to any tip that lacks one. The hit is then rendered with
    `entry_template.format(**fields)` and the entries are joined with a dashed
    separator line.

    The hits themselves are not modified, so the same results can be rendered
    more than once.

    Args:
        search_results: Iterable of dicts with at least `meals`, `ingredients`
            and `tips` (iterables of strings) plus whatever other keys the
            template references.
        entry_template: `str.format` template for one entry.

    Returns:
        The rendered entries joined by the separator, with surrounding
        whitespace stripped.
    """
    separator = "\n-----------\n"
    formatted_docs = []
    for doc in search_results:
        # Shallow copy: overwriting the list fields in place would corrupt the
        # caller's hit and break a second rendering of the same results.
        fields = dict(doc)
        fields['meals'] = ', '.join(doc['meals'])
        fields['ingredients'] = ', '.join(doc['ingredients'])
        fields['tips'] = ' '.join(t if t.endswith('.') else t + "." for t in doc['tips'])
        formatted_docs.append(entry_template.format(**fields))
    return separator.join(formatted_docs).strip()

In [24]:
def build_retriever(es_cnf:dict, entry_template):
    """Build a `query -> context string` retriever for the configured search type.

    Args:
        es_cnf: Retriever configuration. Keys read here: `type`, `url`,
            `index_name`, `boosting` and, for the hybrid variants,
            `embedding.model_name` and `vector_field`.
        entry_template: Template passed to `build_recipe_context` to render
            each hit.

    Returns:
        A callable taking the query text and returning the rendered context.

    Raises:
        ValueError: If `es_cnf['type']` is not a supported search type.
    """
    search_type = es_cnf['type']
    # NOTE(review): the original had two identical `'hybrid'` branches, which made
    # the RRF retriever unreachable. The config value that names the RRF variant
    # is not visible here, so both spellings are accepted — confirm against the configs.
    rrf_types = ('hybrid_rrf', 'hybrid-rrf')
    if search_type not in ('bm25', 'hybrid') + rrf_types:
        # Validate before opening a connection or loading an embedding model
        raise ValueError(f"Not found search type: {search_type}")

    # Create connection
    es_client = Elasticsearch(hosts=[es_cnf["url"]])
    boosting = es_cnf["boosting"]
    index_name = es_cnf["index_name"]
    print(f"search_type: {search_type}")

    if search_type == 'bm25':
        return lambda query: build_recipe_context(
            es_bm25_query(es_client, index_name, query, boosting),
            entry_template
        )

    # Both hybrid variants need the query embedded and the vector field name
    embeddings = HuggingFaceEmbeddings(model_name=es_cnf["embedding"]["model_name"])
    vector_field = es_cnf["vector_field"]
    hybrid_query = es_hybrid_rrf_query if search_type in rrf_types else es_hybrid_query
    return lambda query: build_recipe_context(
        hybrid_query(es_client, index_name, query, embeddings, vector_field, boosting),
        entry_template
    )


In [25]:
def build_llm(
    model_name:str,
):
    """Instantiate the chat model selected by the prefix of `model_name`.

    Names starting with "llama" yield a `ChatOllama`, names starting with
    "gpt" a `ChatOpenAI`; any other name raises an `Exception`.
    """
    use_ollama = model_name.startswith("llama")
    if not use_ollama and not model_name.startswith("gpt"):
        raise Exception(f"Not found model name: {model_name}")

    print(f"model: {model_name}")
    if use_ollama:
        return ChatOllama(model=model_name)
    return ChatOpenAI(model_name=model_name)

In [26]:
def build_eval(
    model_name:str,
    template:str
):
    """Assemble the LLM-as-judge chain: prompt -> chat model -> string output.

    Args:
        model_name: Model identifier forwarded to `build_llm`.
        template: Prompt template text for the judge.

    Returns:
        The composed chain.
    """
    judge_prompt = PromptTemplate.from_template(template=template)
    judge_llm = build_llm(model_name)
    return judge_prompt | judge_llm | StrOutputParser()

In [27]:
def build_rag(
    model_name:str,
    template:str,
    retriever
):
    """Assemble the RAG chain: retrieve context -> prompt -> chat model -> string.

    The first step maps `context` to `retriever` and `question` to a
    `RunnablePassthrough()`, so the prompt receives both keys.

    Args:
        model_name: Model identifier forwarded to `build_llm`.
        template: Prompt template text using `context` and `question`.
        retriever: Callable used to produce the `context` value.

    Returns:
        The composed chain.
    """
    rag_prompt = PromptTemplate.from_template(template=template)
    rag_llm = build_llm(model_name)
    chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
    return chain_inputs | rag_prompt | rag_llm | StrOutputParser()

In [28]:
def evaluate_rag_colab(
    sample, 
    rag_chain,
    eval_chain,
    src_data_dir,
    out_data_dir,
    max_generated=100,
):
    """Generate RAG answers and judge verdicts from pre-computed contexts.

    For every record the context is read from
    `<src_data_dir>/<id>@<number>_context.txt`, so no retriever is needed at
    run time. Answers go to `<out_data_dir>/rag` and verdicts to
    `<out_data_dir>/llm-as-judge`; files that already exist are reused, which
    makes the run resumable.

    Args:
        sample: Sized iterable of records with `question`, `id` and `number` keys.
        rag_chain: Chain invoked with `{"question": ..., "context": ...}`.
        eval_chain: Chain invoked with `{"question": ..., "answer_llm": ...}`.
        src_data_dir: Directory with the pre-computed `*_context.txt` files.
        out_data_dir: Output root directory.
        max_generated: Stop once this many new RAG answers were generated in
            this call; None disables the limit.
    """
    print(f"[EVAL-RAG] sample       : {len(sample)}")
    print(f"[EVAL-RAG] src_data_dir : {src_data_dir}")
    print(f"[EVAL-RAG] out_data_dir : {out_data_dir}")
    generated_rag_dir = os.path.join(out_data_dir, 'rag')
    generated_llm_as_judge_dir = os.path.join(out_data_dir, 'llm-as-judge')
    os.makedirs(generated_rag_dir, exist_ok=True)
    os.makedirs(generated_llm_as_judge_dir, exist_ok=True)
    count_generated = 0
    for record in tqdm(sample):
        question = record['question']
        doc_id = record['id']
        question__num = record['number']
        eval_id = f'{doc_id}@{question__num:03d}'
        # RAG
        rag_answer_path = os.path.join(generated_rag_dir, f'{eval_id}.txt')
        if not os.path.exists(rag_answer_path): 
            print( f'Generating RAG {eval_id}')
            es_ctx_path = os.path.join(src_data_dir, f'{eval_id}_context.txt')
            es_context = read_text(es_ctx_path)
            answer_llm = rag_chain.invoke({"question": question, "context": es_context}) 
            # Save results
            save_text(rag_answer_path, answer_llm)
            count_generated += 1
        else:
            answer_llm = read_text(rag_answer_path)

        # LLM-as-judge
        llm_as_judge_answer_path = os.path.join(generated_llm_as_judge_dir, f'{eval_id}.txt')
        if not os.path.exists(llm_as_judge_answer_path):
            print( f'Generating JUDGE {eval_id}')
            evaluation = eval_chain.invoke({"question": question, "answer_llm": answer_llm})
            # Save results
            save_text(llm_as_judge_answer_path, evaluation)

        # Stop after the verdict for the last generated answer has been stored.
        # The counter is already 1-based, so compare it directly with the limit.
        if max_generated is not None and count_generated >= max_generated:
            print("Stop generating questions...")
            break

In [29]:
def _invoke_with_stats(chain, chain_input, model_name, prefix=""):
    """Invoke `chain` and return `(output, stats)` with timing and token usage.

    Token and cost fields are only collected for models whose name starts with
    "gpt". Token, cost and response-time keys are prefixed with `prefix`;
    `model_used` is not.
    """
    use_openai = model_name.startswith("gpt")
    if use_openai:
        # Imported lazily (and before timing starts): only needed for OpenAI models
        from langchain.callbacks import get_openai_callback

    stats = {}
    t0 = time()
    if use_openai:
        with get_openai_callback() as cb:
            output = chain.invoke(chain_input)
        stats = {
            f"{prefix}prompt_tokens": cb.prompt_tokens,
            f"{prefix}completion_tokens": cb.completion_tokens,
            f"{prefix}total_tokens": cb.total_tokens,
            f"{prefix}total_cost": cb.total_cost
        }
    else:
        output = chain.invoke(chain_input)
    stats["model_used"] = model_name
    stats[f"{prefix}response_time"] = time() - t0
    return output, stats


def evaluate_rag(
    sample, 
    model_name,
    rag_chain,
    eval_chain,
    output_data_dir,
):
    """Generate RAG answers and judge verdicts, recording per-call statistics.

    Answers and their `*_stats.json` go to `<output_data_dir>/rag`, verdicts
    and their stats (keys prefixed with `eval_`) to
    `<output_data_dir>/llm-as-judge`. Existing answer/verdict files are reused
    instead of regenerated, which makes the run resumable.

    Args:
        sample: Sized iterable of records with `question`, `id` and `number` keys.
        model_name: Model identifier; names starting with "gpt" additionally
            get OpenAI token and cost statistics.
        rag_chain: Chain invoked with the question text.
        eval_chain: Chain invoked with `{"question": ..., "answer_llm": ...}`.
        output_data_dir: Output root directory.
    """
    print(f"[EVAL-RAG] sample          : {len(sample)}")
    print(f"[EVAL-RAG] model_name      : {model_name}")
    print(f"[EVAL-RAG] output_data_dir : {output_data_dir}")
    generated_rag_dir = os.path.join(output_data_dir, 'rag')
    generated_llm_as_judge_dir = os.path.join(output_data_dir, 'llm-as-judge')
    os.makedirs(generated_rag_dir, exist_ok=True)
    os.makedirs(generated_llm_as_judge_dir, exist_ok=True)

    for record in tqdm(sample):
        question = record['question']
        doc_id = record['id']
        question__num = record['number']
        eval_id = f'{doc_id}@{question__num:03d}'
        # RAG
        rag_answer_path = os.path.join(generated_rag_dir, f'{eval_id}.txt')
        rag_stats_path = os.path.join(generated_rag_dir, f'{eval_id}_stats.json')
        if not os.path.exists(rag_answer_path):
            answer_llm, token_stats = _invoke_with_stats(rag_chain, question, model_name)
            # Save results
            save_text(rag_answer_path, answer_llm)
            save_document(rag_stats_path, token_stats)
        else:
            answer_llm = read_text(rag_answer_path)

        # LLM-as-judge
        llm_as_judge_answer_path = os.path.join(generated_llm_as_judge_dir, f'{eval_id}.txt')
        llm_as_judge_stats_path = os.path.join(generated_llm_as_judge_dir, f'{eval_id}_stats.json')
        if not os.path.exists(llm_as_judge_answer_path):
            evaluation, eval_token_stats = _invoke_with_stats(
                eval_chain,
                {"question": question, "answer_llm": answer_llm},
                model_name,
                prefix="eval_",
            )
            # Save results
            save_text(llm_as_judge_answer_path, evaluation)
            save_document(llm_as_judge_stats_path, eval_token_stats)

# Ground-truth

In [30]:
!ls -lh '{RAW_DATA_DIR}' 2>/dev/null | grep pkl  2>/dev/null | head -5

-rw-rw-r-- 1 aztleclan aztleclan 2,2K oct 12 22:44 playlist_info.pkl
-rw-rw-r-- 1 aztleclan aztleclan  11K oct 12 22:44 video_playlist_map.pkl


In [31]:
!ls -lh '{PROCESSED_DATA_DIR}/documents' 2>/dev/null | grep json 2>/dev/null | head -5

-rw-rw-r-- 1 aztleclan aztleclan 3,3K oct 23 17:55 086AnjxzAfg.json
-rw-rw-r-- 1 aztleclan aztleclan 3,0K oct 23 17:55 0iZUayL1RQ0.json
-rw-rw-r-- 1 aztleclan aztleclan 3,1K oct 23 17:55 0X7I-vr2oaM.json
-rw-rw-r-- 1 aztleclan aztleclan 3,1K oct 23 17:55 1WAbPmolGqY.json
-rw-rw-r-- 1 aztleclan aztleclan 3,3K oct 23 17:55 2CxQTUGD-5E.json


In [32]:
!ls -lh "{GROUND_TRUTH_PATH}"

-rw-rw-r-- 1 aztleclan aztleclan 169K oct 26 23:21 ../data/test/ground_truth/ground-truth-retrieval.csv


## Read Questions

In [33]:
df_ground_truth = pd.read_csv(GROUND_TRUTH_PATH)

In [34]:
df_ground_truth.head(5)

Unnamed: 0,id,doc_id,chunk_number,number,question
0,erjXeb0Hscw@000,erjXeb0Hscw,0,1,How do I achieve crispy and golden bacon for m...
1,erjXeb0Hscw@000,erjXeb0Hscw,0,2,What is the recommended size of cheese cubes f...
2,erjXeb0Hscw@000,erjXeb0Hscw,0,3,Can I cook the spaghetti longer than eight min...
3,erjXeb0Hscw@000,erjXeb0Hscw,0,4,How do I enhance the creaminess of the cheese ...
4,erjXeb0Hscw@000,erjXeb0Hscw,0,5,Is my Creamy Spaghetti with Cheese and Bacon c...


In [93]:
df_ground_truth.iloc[0].question

'How do I achieve crispy and golden bacon for my Creamy Spaghetti with Cheese and Bacon?'

In [92]:
df_ground_truth.iloc[1].question

Core version: 11.0.0
Pillow version: 10.4.0
[autoreload of PIL.Image failed: Traceback (most recent call last):
  File "/home/aztleclan/.local/share/virtualenvs/zoomcamp-llm-v-cF8mf4/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/home/aztleclan/.local/share/virtualenvs/zoomcamp-llm-v-cF8mf4/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
  File "/usr/lib/python3.10/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/home/aztleclan/.local/share/virtualenvs/zoomcamp-llm-v-cF8mf4/lib/python3.10/site-packages/PIL/Image.py", line 108, in <module>
    raise ImportError(msg)
ImportError: The _

'What is the recommended size of cheese cubes for the semi-cured sheep cheese, aged cheese, and cheddar cheese in this recipe?'

# Generate prompts

In [46]:
def genarate_prompt_ctx(
    ground_truth,
    template:str,
    retriever,
    out_data_dir:str
):
    """Build the RAG prompt for every ground-truth question and save it with its context.

    For each record the retriever is called with the question, the template is
    filled with that question and the retrieved context, and both texts are
    written with `save_text` into `out_data_dir` as
    `<id>@<number:03d>_prompt.txt` and `<id>@<number:03d>_context.txt`.

    Args:
        ground_truth: Iterable of records exposing "id", "number" and "question".
        template: Template text formatted with `question` and `context`.
        retriever: Callable invoked as `retriever(question)`; its result is used
            as the context.
        out_data_dir: Directory in which the two files per record are written.
    """
    rag_prompt = PromptTemplate.from_template(template=template)

    for record in tqdm(ground_truth):
        doc_key, number, query = record["id"], record["number"], record["question"]
        retrieved_ctx = retriever(query)
        rendered_prompt = rag_prompt.format(question=query, context=retrieved_ctx)

        # Both files share the "<id>@<number>" stem (number zero-padded to 3 digits).
        sample_key = f"{doc_key}@{number:03d}"
        for suffix, payload in (("prompt", rendered_prompt), ("context", retrieved_ctx)):
            target_path = os.path.join(out_data_dir, f"{sample_key}_{suffix}.txt")
            save_text(target_path, payload)

In [47]:
!ls -lh "{GROUND_TRUTH_DATA_DIR}"
!ls -lh "{EVAL_RETRIEVER_DATA_DIR}"

total 512K
drwxr-xr-x 3 aztleclan aztleclan 4,0K oct 25 16:16 generated
-rw-rw-r-- 1 aztleclan aztleclan 169K oct 26 23:21 ground-truth-retrieval.csv
-rw-r--r-- 1 aztleclan aztleclan 167K oct 25 16:58 ground-truth-retrieval-ok.csv
-rw-r--r-- 1 aztleclan aztleclan 167K oct 25 16:58 ground-truth-retrieval-OK.csv
total 12K
-rw-rw-r-- 1 aztleclan aztleclan 676 oct 26 23:00 res-opt-es-bm25.json
-rw-rw-r-- 1 aztleclan aztleclan 723 oct 26 21:54 res-opt-es-hybrid.json
-rw-rw-r-- 1 aztleclan aztleclan 724 oct 26 22:20 res-opt-es-hybrid-rrf.json


## BM25

In [48]:
%%time
ground_truth = df_ground_truth.to_dict(orient='records')

CPU times: user 4.65 ms, sys: 108 μs, total: 4.76 ms
Wall time: 4.73 ms


In [49]:
LLAMA_MODEL_NAME = 'llama3'

In [73]:
# Retriever
# Elasticsearch endpoint, index and client used for BM25 retrieval.
es_url = "http://localhost:9200"
index_name = "cooking-recipes"
es_client = Elasticsearch(hosts=[es_url])
# NOTE(review): vector_field is not passed to the BM25 retriever below;
# presumably it is only needed by the hybrid cells — confirm.
vector_field='text_vector'
# Read the stored BM25 tuning result and keep its 'best_boosts' entry.
opt_es_bm25 = read_document(REST_OPT_ES_BM25_PATH)
print(REST_OPT_ES_BM25_PATH)
boosts_bm25 = opt_es_bm25['best_boosts']
print(json.dumps(boosts_bm25, indent=4))

# Single-argument callable (query -> context), the shape genarate_prompt_ctx calls.
retriever_bm25 = lambda query: es_retriever_bm25(
        es_client,
        index_name,
        query,
        boosts_bm25
)

../data/evaluation/retriever/res-opt-es-bm25.json
{
    "meals": 3.3780065729545834,
    "title": 2.350519748557916,
    "ingredients": 1.8480562377707013,
    "summary": 4.142904195381872,
    "text": 3.8827414870597465,
    "tips": 3.2750400070515573
}


In [51]:
# Output dir
# Evaluation folder for the llama3 + BM25 run, with a sub-folder for the prompts.
EVAL_RAG_LLAMA3_BM25_DATA_DIR = f"{EVAL_RAG_DATA_DIR}/{LLAMA_MODEL_NAME}_bm25"
EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR = f"{EVAL_RAG_LLAMA3_BM25_DATA_DIR}/prompts_rag"
print(EVAL_RAG_LLAMA3_BM25_DATA_DIR)
print(EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR)
# exist_ok=True keeps the cell re-runnable; missing parent folders are created too.
os.makedirs(EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR, exist_ok=True)

../data/evaluation/rag/llama3_bm25
../data/evaluation/rag/llama3_bm25/prompts_rag


In [52]:
# Save one prompt file and one context file per ground-truth question,
# retrieving the context with the BM25 retriever.
genarate_prompt_ctx(
    ground_truth=ground_truth,
    template=TEMPLATE_RAG_V1,
    retriever=retriever_bm25,
    out_data_dir=EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR
)

  0%|          | 0/1160 [00:00<?, ?it/s]

In [53]:
!ls -lh "{EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR}" 2>/dev/null | grep '.*\_context\.txt' | wc -l
!ls -lh "{EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR}" 2>/dev/null | grep '.*\_prompt\.txt' | wc -l

1160
1160


In [54]:
!ls -lh "{EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR}" 2>/dev/null | head -5
#!cat "{EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR}/086AnjxzAfg@000@001_context.txt"
#!cat "{EVAL_RAG_LLAMA3_BM25_PROMPTS_DATA_DIR}/086AnjxzAfg@000@001_prompt.txt"

total 26M
-rw-rw-r-- 1 aztleclan aztleclan 8,3K oct 26 23:35 086AnjxzAfg@000@001_context.txt
-rw-rw-r-- 1 aztleclan aztleclan 8,6K oct 26 23:35 086AnjxzAfg@000@001_prompt.txt
-rw-rw-r-- 1 aztleclan aztleclan 8,0K oct 26 23:35 086AnjxzAfg@000@002_context.txt
-rw-rw-r-- 1 aztleclan aztleclan 8,3K oct 26 23:35 086AnjxzAfg@000@002_prompt.txt


## Hybrid

In [64]:
# Retriever
# Elasticsearch endpoint, index and client used for hybrid retrieval.
es_url = "http://localhost:9200"
index_name = "cooking-recipes"
es_client = Elasticsearch(hosts=[es_url])
# Embedding object and name of the vector field, both passed to the retriever below.
embeddings=EMBEDDINGS
vector_field='text_vector'
print(REST_OPT_ES_HYBRID_PATH)
# Read the stored hybrid tuning result and keep its 'best_boosts' entry.
opt_es_hybrid = read_document(REST_OPT_ES_HYBRID_PATH)
boosts_hybrid = opt_es_hybrid['best_boosts']
print(json.dumps(boosts_hybrid, indent=4))

# Single-argument callable (query -> context), the shape genarate_prompt_ctx calls.
retriever_hydrid = lambda query: es_retriever_hybrid(
        es_client,
        index_name,
        query,
        embeddings,
        vector_field,
        boosts_hybrid
)

../data/evaluation/retriever/res-opt-es-hybrid.json
{
    "meals": 1.3267880385949111,
    "title": 1.8744388428292256,
    "ingredients": 1.5107278257070114,
    "summary": 3.3330549166389503,
    "text": 2.9015143483052235,
    "tips": 2.619409172670707,
    "vector_boost": 0.8992024772398135
}


In [65]:
# Output dir
# Evaluation folder for the llama3 + hybrid run, with a sub-folder for the prompts.
EVAL_RAG_LLAMA3_HYBRID_DATA_DIR = f"{EVAL_RAG_DATA_DIR}/{LLAMA_MODEL_NAME}_hybrid"
EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR = f"{EVAL_RAG_LLAMA3_HYBRID_DATA_DIR}/prompts_rag"
print(EVAL_RAG_LLAMA3_HYBRID_DATA_DIR)
print(EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR)
# exist_ok=True keeps the cell re-runnable; missing parent folders are created too.
os.makedirs(EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR, exist_ok=True)

../data/evaluation/rag/llama3_hybrid
../data/evaluation/rag/llama3_hybrid/prompts_rag


In [66]:
# Save one prompt file and one context file per ground-truth question,
# retrieving the context with the hybrid retriever.
genarate_prompt_ctx(
    ground_truth=ground_truth,
    template=TEMPLATE_RAG_V1,
    retriever=retriever_hydrid,
    out_data_dir=EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR
)

  0%|          | 0/1160 [00:00<?, ?it/s]

In [105]:
!ls -lh "{EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR}" 2>/dev/null | grep '.*\_context\.txt' | wc -l
!ls -lh "{EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR}" 2>/dev/null | grep '.*\_prompt\.txt' | wc -l

1160
1160


In [106]:
!ls -lh "{EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR}" 2>/dev/null | head -5
#!cat "{EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR}/086AnjxzAfg@000@001_context.txt"
#!cat "{EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR}/086AnjxzAfg@000@001_prompt.txt"

total 26M
-rw-rw-r-- 1 aztleclan aztleclan 8,5K oct 26 18:09 086AnjxzAfg@000@001_context.txt
-rw-rw-r-- 1 aztleclan aztleclan 8,8K oct 26 18:09 086AnjxzAfg@000@001_prompt.txt
-rw-rw-r-- 1 aztleclan aztleclan 8,7K oct 26 18:09 086AnjxzAfg@000@002_context.txt
-rw-rw-r-- 1 aztleclan aztleclan 9,0K oct 26 18:09 086AnjxzAfg@000@002_prompt.txt


## Hybrid RRF

In [67]:
# Retriever
# Elasticsearch endpoint, index and client used for the hybrid-RRF retrieval run.
es_url = "http://localhost:9200"
index_name = "cooking-recipes"
es_client = Elasticsearch(hosts=[es_url])
# Embedding object and name of the vector field, both passed to the retriever below.
embeddings=EMBEDDINGS
vector_field='text_vector'
print(REST_OPT_ES_HYBRID_RRF_PATH)
# Read the stored hybrid-RRF tuning result and keep its 'best_boosts' entry.
opt_es_hybrid_rrf = read_document(REST_OPT_ES_HYBRID_RRF_PATH)
boosts_hybrid_rrf = opt_es_hybrid_rrf['best_boosts']
print(json.dumps(boosts_hybrid_rrf, indent=4))

# The boosts loaded in THIS cell must be used here: passing `boosts_hybrid`
# would silently reuse the plain-hybrid boosts (or whatever an earlier cell left
# in the kernel) and make the RRF run indistinguishable from the hybrid run.
# NOTE(review): this still calls es_retriever_hybrid; confirm whether an
# RRF-specific retriever function should be called instead.
retriever_hydrid_rrf = lambda query: es_retriever_hybrid(
        es_client,
        index_name,
        query,
        embeddings,
        vector_field,
        boosts_hybrid_rrf
)

../data/evaluation/retriever/res-opt-es-hybrid-rrf.json
{
    "meals": 1.4671273591308425,
    "title": 1.47987055694058,
    "ingredients": 4.882772492268252,
    "summary": 4.882702537681318,
    "text": 4.368904374010877,
    "tips": 1.2126808445885127,
    "vector_boost": 0.10557689395629086
}


In [68]:
# Output dir
# Evaluation folder for the llama3 + hybrid-RRF run, with a sub-folder for the prompts.
EVAL_RAG_LLAMA3_HYBRID_RRF_DATA_DIR = f"{EVAL_RAG_DATA_DIR}/{LLAMA_MODEL_NAME}_hybrid_rrf"
EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR = f"{EVAL_RAG_LLAMA3_HYBRID_RRF_DATA_DIR}/prompts_rag"
print(EVAL_RAG_LLAMA3_HYBRID_RRF_DATA_DIR)
print(EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR)
# exist_ok=True keeps the cell re-runnable; missing parent folders are created too.
os.makedirs(EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR, exist_ok=True)

../data/evaluation/rag/llama3_hybrid_rrf
../data/evaluation/rag/llama3_hybrid_rrf/prompts_rag


In [69]:
# Save one prompt file and one context file per ground-truth question,
# retrieving the context with the hybrid-RRF retriever.
genarate_prompt_ctx(
    ground_truth=ground_truth,
    template=TEMPLATE_RAG_V1,
    retriever=retriever_hydrid_rrf,
    out_data_dir=EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR
)

  0%|          | 0/1160 [00:00<?, ?it/s]

In [70]:
!ls -lh "{EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR}" 2>/dev/null | grep '.*\_context\.txt' | wc -l
!ls -lh "{EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR}" 2>/dev/null | grep '.*\_prompt\.txt' | wc -l

1160
1160


In [71]:
!ls -lh "{EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR}" 2>/dev/null | head -5
#!cat "{EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR}/086AnjxzAfg@000@001_context.txt"
#!cat "{EVAL_RAG_LLAMA3_HYBRID_RRF_PROMPTS_DATA_DIR}/086AnjxzAfg@000@001_prompt.txt"

total 26M
-rw-rw-r-- 1 aztleclan aztleclan 8,3K oct 26 23:40 086AnjxzAfg@000@001_context.txt
-rw-rw-r-- 1 aztleclan aztleclan 8,6K oct 26 23:40 086AnjxzAfg@000@001_prompt.txt
-rw-rw-r-- 1 aztleclan aztleclan 8,5K oct 26 23:40 086AnjxzAfg@000@002_context.txt
-rw-rw-r-- 1 aztleclan aztleclan 8,8K oct 26 23:40 086AnjxzAfg@000@002_prompt.txt


# Evaluation

In [40]:
from dotenv import load_dotenv
load_dotenv()

True

In [90]:
#os.environ["OPENAI_API_KEY"]

In [34]:
!ls -lh "{EVAL_RETRIEVER_DATA_DIR}"

total 12K
-rw-rw-r-- 1 aztleclan aztleclan 676 oct 26 23:00 res-opt-es-bm25.json
-rw-rw-r-- 1 aztleclan aztleclan 723 oct 26 21:54 res-opt-es-hybrid.json
-rw-rw-r-- 1 aztleclan aztleclan 724 oct 26 22:20 res-opt-es-hybrid-rrf.json


In [35]:
!ls -lh "{EVAL_RAG_DATA_DIR}"

total 12K
drwxrwxr-x 5 aztleclan aztleclan 4,0K oct 26 16:49 llama3_bm25
drwxrwxr-x 5 aztleclan aztleclan 4,0K oct 27 17:16 llama3_hybrid
drwxrwxr-x 3 aztleclan aztleclan 4,0K oct 26 18:08 llama3_hybrid_rrf


In [44]:
%%time
# Generate Data Dict
ground_truth = df_ground_truth.to_dict(orient='records')

CPU times: user 7.03 ms, sys: 0 ns, total: 7.03 ms
Wall time: 8.94 ms


## LLAMA3

In [74]:
# Model name
LLAMA_MODEL_NAME = 'llama3'
LLAMA_MODEL_NAME

'llama3'

### Setting

In [75]:
# Retriever
vector_field = "text_vector"
print(REST_OPT_ES_HYBRID_PATH)
opt_es_hybrid = read_document(REST_OPT_ES_HYBRID_PATH)
print(json.dumps(opt_es_hybrid, indent=4))

# Hybrid retriever configuration: the boosts come from the tuning result read
# above; the vector field reuses the name assigned at the top of this cell.
es_hybrid_conf = {
    "url": ES_URL,
    "index_name": INDEX_NAME,
    "type": "hybrid",
    "vector_field": vector_field,
    "boosting": opt_es_hybrid["best_boosts"],
    "embedding": {"model_name": MINILM_EMBEDDING_MODEL_NAME},
}
retriever_hydrid = build_retriever(es_hybrid_conf, ENTRY_TEMPLATE)

../data/evaluation/retriever/res-opt-es-hybrid.json
{
    "method": "es_hybrid",
    "best_boosts": {
        "meals": 1.3267880385949111,
        "title": 1.8744388428292256,
        "ingredients": 1.5107278257070114,
        "summary": 3.3330549166389503,
        "text": 2.9015143483052235,
        "tips": 2.619409172670707,
        "vector_boost": 0.8992024772398135
    },
    "best_mrr": 0.9194683908045982,
    "base_train_hit_rate": 0.9181034482758621,
    "base_train_mrr": 0.8942887931034492,
    "base_valid_hit_rate": 0.9310344827586207,
    "base_valid_mrr": 0.8894396551724137,
    "boost_train_hit_rate": 0.9224137931034483,
    "boost_train_mrr": 0.9194683908045982,
    "boost_valid_hit_rate": 0.9396551724137931,
    "boost_valid_mrr": 0.9077586206896553
}
search_type: hybrid


In [76]:
# Output dir
# Evaluation folder for the llama3 + hybrid run, with a sub-folder for the prompts.
EVAL_RAG_LLAMA3_HYBRID_DATA_DIR = f"{EVAL_RAG_DATA_DIR}/{LLAMA_MODEL_NAME}_hybrid"
EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR = f"{EVAL_RAG_LLAMA3_HYBRID_DATA_DIR}/prompts_rag"
print(EVAL_RAG_LLAMA3_HYBRID_DATA_DIR)
print(EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR)
# exist_ok=True keeps the cell re-runnable; missing parent folders are created too.
os.makedirs(EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR, exist_ok=True)

../data/evaluation/rag/llama3_hybrid
../data/evaluation/rag/llama3_hybrid/prompts_rag


In [77]:
# RAG
# NOTE(review): the RAG chain is built with build_eval (no retriever attached),
# presumably because the prompts already contain the retrieved context from
# the prompts_rag folder — confirm. The commented alternative below wires a
# retriever in, but references retriever_bm25 inside the hybrid section.
#rag_chain = build_rag(LLAMA_MODEL_NAME, TEMPLATE_RAG_V1, retriever_bm25)
rag_chain = build_eval(LLAMA_MODEL_NAME, TEMPLATE_RAG_V1)

# LLM-AS-JUDGE
# Same model, driven by the judge template instead of the RAG template.
eval_chain = build_eval(LLAMA_MODEL_NAME, TEMPLATE_LLM_JUDGE_V1)

model: llama3
model: llama3


### Run Evaluation

In [60]:
# Evaluate RAG Colab
# Arguments in order: ground-truth records, RAG chain, judge chain, folder with
# the pre-generated prompts, and output folder for this model/retriever pair.
# NOTE(review): presumably reads prompts from the fourth argument and writes
# results under the fifth — confirm in evaluate_rag_colab.
results = evaluate_rag_colab(
    ground_truth, 
    rag_chain,
    eval_chain,
    EVAL_RAG_LLAMA3_HYBRID_PROMPTS_DATA_DIR,
    EVAL_RAG_LLAMA3_HYBRID_DATA_DIR
)

[EVAL-RAG] sample       : 1160
[EVAL-RAG] src_data_dir : ../data/evaluation/rag/llama3_hybrid/prompts_rag
[EVAL-RAG] out_data_dir : ../data/evaluation/rag/llama3_hybrid


  0%|          | 0/1160 [00:00<?, ?it/s]

[BUILD-EVALUATIONS] src_data_dir : ../data/evaluation/rag/llama3_hybrid/llm-as-judge
../data/evaluation/rag/llama3_hybrid/llm-as-judge ../data/evaluation/rag/llama3_hybrid/llm-as-judge 1160


  0%|          | 0/1160 [00:00<?, ?it/s]

Generated questions with errors: 0


In [80]:
%%time
# Evaluate RAG
# Arguments in order: ground-truth records, model name, RAG chain, judge chain,
# and output folder for the llama3 + hybrid run.
# NOTE(review): presumably writes per-question results under the output folder
# — confirm in evaluate_rag.
results = evaluate_rag(
    ground_truth, 
    LLAMA_MODEL_NAME,
    rag_chain,
    eval_chain,
    EVAL_RAG_LLAMA3_HYBRID_DATA_DIR
)

[EVAL-RAG] sample          : 1160
[EVAL-RAG] model_name      : llama3
[EVAL-RAG] output_data_dir : ../data/evaluation/rag/llama3_hybrid


  0%|          | 0/1160 [00:00<?, ?it/s]

CPU times: user 59.5 ms, sys: 19.9 ms, total: 79.3 ms
Wall time: 77.1 ms


In [81]:
%%time
# Build evaluation
# NOTE(review): presumably combines the ground truth with the generated results
# in gen_data_dir into the rag-evaluation.csv read in "Show Results" — confirm
# in build_evaluations.
build_evaluations(
    ground_truth_path=GROUND_TRUTH_PATH,
    gen_data_dir=EVAL_RAG_LLAMA3_HYBRID_DATA_DIR
)

[BUILD-EVALUATIONS] ground_truth_path : ../data/test/ground_truth/ground-truth-retrieval.csv
[BUILD-EVALUATIONS] gen_data_dir      : ../data/evaluation/rag/llama3_hybrid


  0%|          | 0/1160 [00:00<?, ?it/s]

Generated questions with errors: 0
CPU times: user 111 ms, sys: 56.7 ms, total: 167 ms
Wall time: 164 ms


### Show Results

In [82]:
!ls -lh "{EVAL_RAG_LLAMA3_HYBRID_DATA_DIR}"

total 1,3M
drwxr-xr-x 2 aztleclan aztleclan  60K oct 27 02:55 llm-as-judge
drwxrwxr-x 2 aztleclan aztleclan 128K oct 26 23:37 prompts_rag
drwxr-xr-x 2 aztleclan aztleclan  60K oct 27 02:55 rag
-rw-r--r-- 1 aztleclan aztleclan 1,1M oct 28 01:10 rag-evaluation.csv


In [83]:
df_eval_rag_hybrid_llam3 = pd.read_csv(f"{EVAL_RAG_LLAMA3_HYBRID_DATA_DIR}/rag-evaluation.csv")

In [84]:
df_eval_rag_hybrid_llam3

Unnamed: 0,id,doc_id,chunk_number,number,answer,question,relevance,explanation
0,erjXeb0Hscw@000,erjXeb0Hscw,0,1,I think there may be some confusion! The recip...,How do I achieve crispy and golden bacon for m...,PARTLY_RELEVANT,The generated answer provides tips on how to a...
1,erjXeb0Hscw@000,erjXeb0Hscw,0,2,According to the instructions in the Creamy Sp...,What is the recommended size of cheese cubes f...,RELEVANT,The generated answer directly addresses the qu...
2,erjXeb0Hscw@000,erjXeb0Hscw,0,3,"According to the recipe ""Creamy Spaghetti with...",Can I cook the spaghetti longer than eight min...,RELEVANT,The generated answer directly addresses the qu...
3,erjXeb0Hscw@000,erjXeb0Hscw,0,4,A great question!\n\nTo enhance the creaminess...,How do I enhance the creaminess of the cheese ...,RELEVANT,The generated answer directly addresses the qu...
4,erjXeb0Hscw@000,erjXeb0Hscw,0,5,"According to the Tips for the ""Creamy Spaghett...",Is my Creamy Spaghetti with Cheese and Bacon c...,RELEVANT,The generated answer directly addresses the qu...
...,...,...,...,...,...,...,...,...
1155,9DIMOoVbyyE@000,9DIMOoVbyyE,0,1,"I'm happy to help! However, I must point out t...",How do I prepare the pre-ferment for Roscón de...,PARTLY_RELEVANT,The generated answer does not directly provide...
1156,9DIMOoVbyyE@000,9DIMOoVbyyE,0,2,"I'm happy to help!\n\nHowever, I must point ou...",What type of flour should I use to make the st...,NON_RELEVANT,The generated answer does not address the spec...
1157,9DIMOoVbyyE@000,9DIMOoVbyyE,0,3,According to the context of our recipe databas...,"In the instructions for Roscón de Reyes, it me...",RELEVANT,The generated answer directly addresses the qu...
1158,9DIMOoVbyyE@000,9DIMOoVbyyE,0,4,"According to our recipe database, for Roscón d...",Can you clarify how to shape the dough into a ...,RELEVANT,The generated answer directly addresses the qu...


In [85]:
df_eval_rag_hybrid_llam3[["answer", "question", "relevance", "explanation"]].head(5)

Unnamed: 0,answer,question,relevance,explanation
0,I think there may be some confusion! The recip...,How do I achieve crispy and golden bacon for m...,PARTLY_RELEVANT,The generated answer provides tips on how to a...
1,According to the instructions in the Creamy Sp...,What is the recommended size of cheese cubes f...,RELEVANT,The generated answer directly addresses the qu...
2,"According to the recipe ""Creamy Spaghetti with...",Can I cook the spaghetti longer than eight min...,RELEVANT,The generated answer directly addresses the qu...
3,A great question!\n\nTo enhance the creaminess...,How do I enhance the creaminess of the cheese ...,RELEVANT,The generated answer directly addresses the qu...
4,"According to the Tips for the ""Creamy Spaghett...",Is my Creamy Spaghetti with Cheese and Bacon c...,RELEVANT,The generated answer directly addresses the qu...


In [86]:
df_eval_rag_hybrid_llam3.relevance.value_counts()

relevance
RELEVANT           570
NON_RELEVANT       367
PARTLY_RELEVANT    223
Name: count, dtype: int64

In [87]:
df_eval_rag_hybrid_llam3.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.491379
NON_RELEVANT       0.316379
PARTLY_RELEVANT    0.192241
Name: proportion, dtype: float64

## gpt-4o-mini

In [58]:
# Model name
GPT_4O_MINI_MODEL_NAME = 'gpt-4o-mini'
GPT_4O_MINI_MODEL_NAME

'gpt-4o-mini'

### Setting

In [59]:
# Retriever
vector_field = "text_vector"
print(REST_OPT_ES_HYBRID_PATH)
opt_es_hybrid = read_document(REST_OPT_ES_HYBRID_PATH)
print(json.dumps(opt_es_hybrid, indent=4))

# Hybrid retriever configuration: the boosts come from the tuning result read
# above; the vector field reuses the name assigned at the top of this cell.
es_hybrid_conf = {
    "url": ES_URL,
    "index_name": INDEX_NAME,
    "type": "hybrid",
    "vector_field": vector_field,
    "boosting": opt_es_hybrid["best_boosts"],
    "embedding": {"model_name": MINILM_EMBEDDING_MODEL_NAME},
}
retriever_hydrid = build_retriever(es_hybrid_conf, ENTRY_TEMPLATE)

../data/evaluation/retriever/res-opt-es-hybrid.json
{
    "method": "es_hybrid",
    "best_boosts": {
        "meals": 1.3267880385949111,
        "title": 1.8744388428292256,
        "ingredients": 1.5107278257070114,
        "summary": 3.3330549166389503,
        "text": 2.9015143483052235,
        "tips": 2.619409172670707,
        "vector_boost": 0.8992024772398135
    },
    "best_mrr": 0.9194683908045982,
    "base_train_hit_rate": 0.9181034482758621,
    "base_train_mrr": 0.8942887931034492,
    "base_valid_hit_rate": 0.9310344827586207,
    "base_valid_mrr": 0.8894396551724137,
    "boost_train_hit_rate": 0.9224137931034483,
    "boost_train_mrr": 0.9194683908045982,
    "boost_valid_hit_rate": 0.9396551724137931,
    "boost_valid_mrr": 0.9077586206896553
}
search_type: hybrid


In [60]:
# Output dir
# Evaluation folder for the gpt-4o-mini + hybrid run, with a sub-folder for the prompts.
EVAL_RAG_GPT_4O_MINI_HYBRID_DATA_DIR = f"{EVAL_RAG_DATA_DIR}/{GPT_4O_MINI_MODEL_NAME}_hybrid"
EVAL_RAG_GPT_4O_MINI_HYBRID_PROMPTS_DATA_DIR = f"{EVAL_RAG_GPT_4O_MINI_HYBRID_DATA_DIR}/prompts_rag"
print(EVAL_RAG_GPT_4O_MINI_HYBRID_DATA_DIR)
print(EVAL_RAG_GPT_4O_MINI_HYBRID_PROMPTS_DATA_DIR)
# exist_ok=True keeps the cell re-runnable; missing parent folders are created too.
os.makedirs(EVAL_RAG_GPT_4O_MINI_HYBRID_PROMPTS_DATA_DIR, exist_ok=True)

../data/evaluation/rag/gpt-4o-mini_hybrid
../data/evaluation/rag/gpt-4o-mini_hybrid/prompts_rag


In [61]:
# RAG
# The chain is wired to the hybrid retriever, so context is fetched at call time.
gpt_4o_mini_rag_chain = build_rag(GPT_4O_MINI_MODEL_NAME, TEMPLATE_RAG_V1, retriever_hydrid)
# (A stale commented-out alternative that built this chain with build_eval and
# LLAMA_MODEL_NAME was dropped: it referred to a different model.)

# LLM-AS-JUDGE
# Same model, driven by the judge template instead of the RAG template.
gpt_4o_mini_eval_chain = build_eval(GPT_4O_MINI_MODEL_NAME, TEMPLATE_LLM_JUDGE_V1)

model: gpt-4o-mini
model: gpt-4o-mini


### Run Evaluation

In [269]:
# Evaluate RAG
# Arguments in order: ground-truth records, model name, RAG chain, judge chain,
# and output folder for the gpt-4o-mini + hybrid run.
# NOTE(review): presumably writes per-question results under the output folder
# — confirm in evaluate_rag.
results = evaluate_rag(
    ground_truth, 
    GPT_4O_MINI_MODEL_NAME,
    gpt_4o_mini_rag_chain,
    gpt_4o_mini_eval_chain,
    EVAL_RAG_GPT_4O_MINI_HYBRID_DATA_DIR
)

[EVAL-RAG] sample          : 1160
[EVAL-RAG] output_data_dir : ../data/evaluation/rag/gpt-4o-mini_hybrid


  0%|          | 0/1160 [00:00<?, ?it/s]

In [62]:
%%time
# Build evaluation
# NOTE(review): presumably combines the ground truth with the generated results
# in gen_data_dir into the rag-evaluation*.csv files read in "Show Results" —
# confirm in build_evaluations.
build_evaluations(
    ground_truth_path=GROUND_TRUTH_PATH,
    gen_data_dir=EVAL_RAG_GPT_4O_MINI_HYBRID_DATA_DIR
)

[BUILD-EVALUATIONS] ground_truth_path : ../data/test/ground_truth/ground-truth-retrieval.csv
[BUILD-EVALUATIONS] gen_data_dir      : ../data/evaluation/rag/gpt-4o-mini_hybrid


  0%|          | 0/2320 [00:00<?, ?it/s]

Generated questions with errors: 0
stats: 1160
eval_stats: 1160
CPU times: user 327 ms, sys: 170 ms, total: 497 ms
Wall time: 908 ms


### Show Results

In [63]:
!ls -lh "{EVAL_RAG_GPT_4O_MINI_HYBRID_DATA_DIR}"

total 1,9M
drwxrwxr-x 2 aztleclan aztleclan 120K oct 27 23:16 llm-as-judge
drwxrwxr-x 2 aztleclan aztleclan 4,0K oct 27 19:12 prompts_rag
drwxrwxr-x 2 aztleclan aztleclan 120K oct 27 23:16 rag
-rw-rw-r-- 1 aztleclan aztleclan 817K oct 28 01:07 rag-evaluation.csv
-rw-rw-r-- 1 aztleclan aztleclan 863K oct 28 01:07 rag-evaluation-stats.csv


In [64]:
df_eval_rag_hybrid_gpt_4o_mini = pd.read_csv(f"{EVAL_RAG_GPT_4O_MINI_HYBRID_DATA_DIR}/rag-evaluation.csv")

In [65]:
df_eval_rag_hybrid_gpt_4o_mini

Unnamed: 0,id,doc_id,chunk_number,number,answer,question,relevance,explanation
0,erjXeb0Hscw@000,erjXeb0Hscw,0,1,To achieve crispy and golden bacon for your Cr...,How do I achieve crispy and golden bacon for m...,RELEVANT,The generated answer provides specific instruc...
1,erjXeb0Hscw@000,erjXeb0Hscw,0,2,The recommended size for the cheese cubes for ...,What is the recommended size of cheese cubes f...,RELEVANT,The generated answer directly addresses the qu...
2,erjXeb0Hscw@000,erjXeb0Hscw,0,3,"Yes, you can cook the spaghetti longer than ei...",Can I cook the spaghetti longer than eight min...,RELEVANT,The generated answer directly addresses the qu...
3,erjXeb0Hscw@000,erjXeb0Hscw,0,4,To enhance the creaminess of the cheese sauce ...,How do I enhance the creaminess of the cheese ...,RELEVANT,The generated answer directly addresses the qu...
4,erjXeb0Hscw@000,erjXeb0Hscw,0,5,The Creamy Spaghetti with Cheese and Bacon is ...,Is my Creamy Spaghetti with Cheese and Bacon c...,RELEVANT,The generated answer directly addresses the qu...
...,...,...,...,...,...,...,...,...
1155,9DIMOoVbyyE@000,9DIMOoVbyyE,0,1,To prepare the pre-ferment for Roscón de Reyes...,How do I prepare the pre-ferment for Roscón de...,RELEVANT,The generated answer provides a clear method f...
1156,9DIMOoVbyyE@000,9DIMOoVbyyE,0,2,"You should use strong flour, which is typicall...",What type of flour should I use to make the st...,RELEVANT,The generated answer directly addresses the qu...
1157,9DIMOoVbyyE@000,9DIMOoVbyyE,0,3,"To soften butter without melting it, you can f...","In the instructions for Roscón de Reyes, it me...",RELEVANT,The generated answer directly addresses the qu...
1158,9DIMOoVbyyE@000,9DIMOoVbyyE,0,4,To shape the dough for Roscón de Reyes into a ...,Can you clarify how to shape the dough into a ...,RELEVANT,The generated answer directly addresses both p...


In [66]:
df_eval_rag_hybrid_gpt_4o_mini[["answer", "question", "relevance", "explanation"]].head(5)

Unnamed: 0,answer,question,relevance,explanation
0,To achieve crispy and golden bacon for your Cr...,How do I achieve crispy and golden bacon for m...,RELEVANT,The generated answer provides specific instruc...
1,The recommended size for the cheese cubes for ...,What is the recommended size of cheese cubes f...,RELEVANT,The generated answer directly addresses the qu...
2,"Yes, you can cook the spaghetti longer than ei...",Can I cook the spaghetti longer than eight min...,RELEVANT,The generated answer directly addresses the qu...
3,To enhance the creaminess of the cheese sauce ...,How do I enhance the creaminess of the cheese ...,RELEVANT,The generated answer directly addresses the qu...
4,The Creamy Spaghetti with Cheese and Bacon is ...,Is my Creamy Spaghetti with Cheese and Bacon c...,RELEVANT,The generated answer directly addresses the qu...


In [67]:
df_eval_rag_hybrid_gpt_4o_mini.relevance.value_counts()

relevance
RELEVANT           1023
PARTLY_RELEVANT     116
NON_RELEVANT         21
Name: count, dtype: int64

In [68]:
df_eval_rag_hybrid_gpt_4o_mini.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.881897
PARTLY_RELEVANT    0.100000
NON_RELEVANT       0.018103
Name: proportion, dtype: float64

In [69]:
df_eval_rag_hybrid_gpt_4o_mini_stats = pd.read_csv(
    f"{EVAL_RAG_GPT_4O_MINI_HYBRID_DATA_DIR}/rag-evaluation-stats.csv"
)

In [70]:
df_eval_rag_hybrid_gpt_4o_mini_stats.head(5)

Unnamed: 0,question,answer,model_used,response_time,relevance,relevance_explanation,prompt_tokens,completion_tokens,total_tokens,eval_prompt_tokens,eval_completion_tokens,eval_total_tokens,openai_cost
0,What is the recommended temperature and time t...,The recommended temperature to bake the rolled...,gpt-4o-mini,1.349952,RELEVANT,The generated answer provides the exact temper...,2254,43,2297,258,47,305,0.000431
1,What's the best way to achieve a slightly liqu...,To achieve a slightly liquid consistency for t...,gpt-4o-mini,1.939975,RELEVANT,The generated answer directly addresses the qu...,2267,78,2345,288,58,346,0.000465
2,Can I adjust the cooking time of the millefeui...,"The cooking time for the Eggplant, Tomato, and...",gpt-4o-mini,2.05544,RELEVANT,The generated answer directly addresses the qu...,2178,79,2257,294,49,343,0.000448
3,How do I grease the mold for the Creamy Fried ...,To grease the mold for the Creamy Fried Milk d...,gpt-4o-mini,1.479998,RELEVANT,The generated answer directly addresses the qu...,1972,59,2031,270,52,322,0.000403
4,What is the ideal temperature and timing to ch...,The filled chicken in the Chicken Villaray rec...,gpt-4o-mini,1.629993,RELEVANT,The generated answer provides both the timing ...,1971,56,2027,262,60,322,0.000405


## gpt-4o

In [37]:
# Model name
GPT_4O_MODEL_NAME = 'gpt-4o'
GPT_4O_MODEL_NAME

'gpt-4o'

### Setting

In [35]:
# Retriever
vector_field = "text_vector"
print(REST_OPT_ES_HYBRID_PATH)
opt_es_hybrid = read_document(REST_OPT_ES_HYBRID_PATH)
print(json.dumps(opt_es_hybrid, indent=4))

# Hybrid retriever configuration: the boosts come from the tuning result read
# above; the vector field reuses the name assigned at the top of this cell.
es_hybrid_conf = {
    "url": ES_URL,
    "index_name": INDEX_NAME,
    "type": "hybrid",
    "vector_field": vector_field,
    "boosting": opt_es_hybrid["best_boosts"],
    "embedding": {"model_name": MINILM_EMBEDDING_MODEL_NAME},
}
retriever_hydrid = build_retriever(es_hybrid_conf, ENTRY_TEMPLATE)

../data/evaluation/retriever/res-opt-es-hybrid.json
{
    "method": "es_hybrid",
    "best_boosts": {
        "meals": 1.3267880385949111,
        "title": 1.8744388428292256,
        "ingredients": 1.5107278257070114,
        "summary": 3.3330549166389503,
        "text": 2.9015143483052235,
        "tips": 2.619409172670707,
        "vector_boost": 0.8992024772398135
    },
    "best_mrr": 0.9194683908045982,
    "base_train_hit_rate": 0.9181034482758621,
    "base_train_mrr": 0.8942887931034492,
    "base_valid_hit_rate": 0.9310344827586207,
    "base_valid_mrr": 0.8894396551724137,
    "boost_train_hit_rate": 0.9224137931034483,
    "boost_train_mrr": 0.9194683908045982,
    "boost_valid_hit_rate": 0.9396551724137931,
    "boost_valid_mrr": 0.9077586206896553
}
search_type: hybrid


In [38]:
# Output dir
# Evaluation folder for the gpt-4o + hybrid run, with a sub-folder for the prompts.
EVAL_RAG_GPT_4O_HYBRID_DATA_DIR = f"{EVAL_RAG_DATA_DIR}/{GPT_4O_MODEL_NAME}_hybrid"
EVAL_RAG_GPT_4O_HYBRID_PROMPTS_DATA_DIR = f"{EVAL_RAG_GPT_4O_HYBRID_DATA_DIR}/prompts_rag"
print(EVAL_RAG_GPT_4O_HYBRID_DATA_DIR)
print(EVAL_RAG_GPT_4O_HYBRID_PROMPTS_DATA_DIR)
# exist_ok=True keeps the cell re-runnable; missing parent folders are created too.
os.makedirs(EVAL_RAG_GPT_4O_HYBRID_PROMPTS_DATA_DIR, exist_ok=True)

../data/evaluation/rag/gpt-4o_hybrid
../data/evaluation/rag/gpt-4o_hybrid/prompts_rag


In [42]:
# RAG
# The chain is wired to the hybrid retriever, so context is fetched at call time.
gpt_4o_rag_chain = build_rag(GPT_4O_MODEL_NAME, TEMPLATE_RAG_V1, retriever_hydrid)
# (A stale commented-out line copied from another section, which assigned
# gpt_4o_mini_rag_chain using LLAMA_MODEL_NAME, was dropped.)

# LLM-AS-JUDGE
# Same model, driven by the judge template instead of the RAG template.
gpt_4o_eval_chain = build_eval(GPT_4O_MODEL_NAME, TEMPLATE_LLM_JUDGE_V1)

model: gpt-4o
model: gpt-4o


### Run Evaluation

In [45]:
%%time
# Evaluate RAG
# Only the first 700 ground-truth records are evaluated here, so the gpt-4o
# figures cover a subset of the questions.
# NOTE(review): the 700 cut-off is presumably a cost/time limit — confirm, and
# consider naming it as a constant in the settings cell.
results = evaluate_rag(
    ground_truth[:700], 
    GPT_4O_MODEL_NAME,
    gpt_4o_rag_chain,
    gpt_4o_eval_chain,
    EVAL_RAG_GPT_4O_HYBRID_DATA_DIR
)

[EVAL-RAG] sample          : 700
[EVAL-RAG] model_name      : gpt-4o
[EVAL-RAG] output_data_dir : ../data/evaluation/rag/gpt-4o_hybrid


  0%|          | 0/700 [00:00<?, ?it/s]

CPU times: user 45.4 ms, sys: 34.1 ms, total: 79.5 ms
Wall time: 138 ms


In [47]:
%%time
# Build evaluation
# NOTE(review): presumably combines the ground truth with the generated results
# in gen_data_dir into the rag-evaluation*.csv files read in "Show Results" —
# confirm in build_evaluations.
build_evaluations(
    ground_truth_path=GROUND_TRUTH_PATH,
    gen_data_dir=EVAL_RAG_GPT_4O_HYBRID_DATA_DIR
)

[BUILD-EVALUATIONS] ground_truth_path : ../data/test/ground_truth/ground-truth-retrieval.csv
[BUILD-EVALUATIONS] gen_data_dir      : ../data/evaluation/rag/gpt-4o_hybrid


  0%|          | 0/1400 [00:00<?, ?it/s]

Generated questions with errors: 0
stats: 700
eval_stats: 700
CPU times: user 238 ms, sys: 112 ms, total: 350 ms
Wall time: 557 ms


### Show Results

In [49]:
!ls -lh "{EVAL_RAG_GPT_4O_HYBRID_DATA_DIR}"

total 1,2M
drwxrwxr-x 2 aztleclan aztleclan  72K oct 28 00:58 llm-as-judge
drwxrwxr-x 2 aztleclan aztleclan 4,0K oct 27 23:56 prompts_rag
drwxrwxr-x 2 aztleclan aztleclan  72K oct 28 00:58 rag
-rw-rw-r-- 1 aztleclan aztleclan 511K oct 28 01:05 rag-evaluation.csv
-rw-rw-r-- 1 aztleclan aztleclan 532K oct 28 01:05 rag-evaluation-stats.csv


In [50]:
df_eval_rag_hybrid_gpt_4o = pd.read_csv(f"{EVAL_RAG_GPT_4O_HYBRID_DATA_DIR}/rag-evaluation.csv")

In [51]:
df_eval_rag_hybrid_gpt_4o

Unnamed: 0,id,doc_id,chunk_number,number,answer,question,relevance,explanation
0,erjXeb0Hscw@000,erjXeb0Hscw,0,1,To achieve crispy and golden bacon for your Cr...,How do I achieve crispy and golden bacon for m...,RELEVANT,The generated answer directly addresses the qu...
1,erjXeb0Hscw@000,erjXeb0Hscw,0,2,The recommended size for the cheese cubes of s...,What is the recommended size of cheese cubes f...,RELEVANT,The generated answer directly addresses the qu...
2,erjXeb0Hscw@000,erjXeb0Hscw,0,3,"Yes, you can cook the spaghetti longer than ei...",Can I cook the spaghetti longer than eight min...,RELEVANT,The generated answer directly addresses the qu...
3,erjXeb0Hscw@000,erjXeb0Hscw,0,4,To enhance the creaminess of the cheese sauce ...,How do I enhance the creaminess of the cheese ...,RELEVANT,The generated answer directly addresses the qu...
4,erjXeb0Hscw@000,erjXeb0Hscw,0,5,According to the Tips in the recipe for Creamy...,Is my Creamy Spaghetti with Cheese and Bacon c...,RELEVANT,The generated answer directly addresses the qu...
...,...,...,...,...,...,...,...,...
695,zzNpW51VfpQ@000,zzNpW51VfpQ,0,1,"To prepare the gazpacho, start by washing all ...",What steps should I take to prepare the gazpac...,RELEVANT,The generated answer directly addresses the qu...
696,zzNpW51VfpQ@000,zzNpW51VfpQ,0,2,"If your gazpacho is too thick, you can adjust ...",How do I know if my gazpacho is too thick and ...,PARTLY_RELEVANT,The answer provides a solution to adjust the c...
697,zzNpW51VfpQ@000,zzNpW51VfpQ,0,3,To adjust the amount of vinegar in traditional...,"In traditional Andalusian Gazpacho, how do I a...",RELEVANT,The generated answer provides a clear and dire...
698,zzNpW51VfpQ@000,zzNpW51VfpQ,0,4,"Yes, you can garnish your Andalusian Gazpacho ...",Can I garnish my Andalusian Gazpacho with any ...,RELEVANT,The generated answer directly addresses the qu...


In [52]:
df_eval_rag_hybrid_gpt_4o[["answer", "question", "relevance", "explanation"]].head(5)

Unnamed: 0,answer,question,relevance,explanation
0,To achieve crispy and golden bacon for your Cr...,How do I achieve crispy and golden bacon for m...,RELEVANT,The generated answer directly addresses the qu...
1,The recommended size for the cheese cubes of s...,What is the recommended size of cheese cubes f...,RELEVANT,The generated answer directly addresses the qu...
2,"Yes, you can cook the spaghetti longer than ei...",Can I cook the spaghetti longer than eight min...,RELEVANT,The generated answer directly addresses the qu...
3,To enhance the creaminess of the cheese sauce ...,How do I enhance the creaminess of the cheese ...,RELEVANT,The generated answer directly addresses the qu...
4,According to the Tips in the recipe for Creamy...,Is my Creamy Spaghetti with Cheese and Bacon c...,RELEVANT,The generated answer directly addresses the qu...


In [53]:
df_eval_rag_hybrid_gpt_4o.relevance.value_counts()

relevance
RELEVANT           544
PARTLY_RELEVANT    143
NON_RELEVANT        13
Name: count, dtype: int64

In [54]:
df_eval_rag_hybrid_gpt_4o.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.777143
PARTLY_RELEVANT    0.204286
NON_RELEVANT       0.018571
Name: proportion, dtype: float64

In [88]:
df_eval_rag_hybrid_gpt_4o_stats = pd.read_csv(
    f"{EVAL_RAG_GPT_4O_HYBRID_DATA_DIR}/rag-evaluation-stats.csv"
)

In [217]:
import os
from langchain_openai import ChatOpenAI
from langchain.callbacks import get_openai_callback

In [89]:
df_eval_rag_hybrid_gpt_4o_stats.head(5)

Unnamed: 0,question,answer,model_used,response_time,relevance,relevance_explanation,prompt_tokens,completion_tokens,total_tokens,eval_prompt_tokens,eval_completion_tokens,eval_total_tokens,openai_cost
0,Is the Shrimp Cream Soup a hot dish or can it ...,The Shrimp Cream Soup is a hot dish. It takes ...,gpt-4o,3.536518,RELEVANT,The generated answer directly addresses both a...,1453,21,1474,230,64,294,0.005058
1,Is it necessary to add the nutmeg mentioned in...,The nutmeg is mentioned as part of the instruc...,gpt-4o,6.142909,PARTLY_RELEVANT,The generated answer addresses the question ab...,2253,85,2338,294,67,361,0.007888
2,How do I prevent the shortcrust pastry from pu...,To prevent the shortcrust pastry from puffing ...,gpt-4o,3.306008,RELEVANT,The generated answer directly addresses the qu...,2145,43,2188,248,68,316,0.007092
3,"In the Octopus Galician Style recipe, what's t...","In the Octopus Galician Style recipe, the best...",gpt-4o,6.080312,RELEVANT,The generated answer directly addresses both p...,2131,63,2194,272,63,335,0.007267
4,What are some tips for serving the Chickpeas w...,"To serve the Chickpeas with Prawns dish, you c...",gpt-4o,5.282053,RELEVANT,The generated answer provides specific tips fo...,2044,143,2187,347,58,405,0.007987
