# Install

In [1]:
!pip -qq install elasticsearch

In [2]:
!pip -qq install tqdm

In [157]:
!pip -qq install hyperopt

In [152]:
!pip -qq install --upgrade ipywidgets

# Check Google Colab

In [1]:
# Detect Google Colab by the availability of its helper package.
# Only a missing package means "not Colab"; any other error (for example a failed
# Drive mount) is left to surface instead of silently switching to the local paths.
try:
  from google.colab import drive
except ImportError:
  IN_COLAB = False
else:
  IN_COLAB = True
  # Backups are stored on Google Drive when running on Colab.
  drive.mount('/content/drive')
print(f"am I in Colab? {IN_COLAB}")

am I in Colab? False


# Import

In [2]:
import os
import json
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

# Elasticsearch
from elasticsearch import Elasticsearch
from sklearn.model_selection import train_test_split

# Model
#from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
#from langchain_core.embeddings import Embeddings

# Elasticsearch
from elasticsearch import Elasticsearch

In [3]:
# Display the NumPy version active in this kernel.
np.__version__

'1.26.4'

# Settings

In [4]:
# Environment roots: Colab keeps the backups on the mounted Drive, a local run next to the repo.
if not IN_COLAB:
  # Local
  BASE_DIR, BACKUPS_DATA_DIR = "..", "../backups"
else:
  # Colab
  BASE_DIR, BACKUPS_DATA_DIR = ".", "/content/drive/MyDrive/Colab Notebooks/Dataclub/llm/data"

# Raw data
RAW_DATA_DIR      = f"{BASE_DIR}/data/raw"
RAW_DOCS_DATA_DIR = f"{RAW_DATA_DIR}/documents"
RAW_INFO_DATA_DIR = f"{RAW_DATA_DIR}/info"
# Preprocessed data
PROCESSED_DATA_DIR      = f"{BASE_DIR}/data/processed"
PROCESSED_DOCS_DATA_DIR = f"{PROCESSED_DATA_DIR}/documents"
# Test data (ground truth)
TEST_DATA_DIR              = f"{BASE_DIR}/data/test"
GROUND_TRUTH_DATA_DIR      = f"{TEST_DATA_DIR}/ground_truth"
GROUND_TRUTH_DOCS_DATA_DIR = f"{GROUND_TRUTH_DATA_DIR}/documents"
GROUND_TRUTH_GEN_DATA_DIR  = f"{GROUND_TRUTH_DATA_DIR}/generated"
# Evaluation results
EVAL_DATA_DIR           = f"{BASE_DIR}/data/evaluation"
EVAL_RETRIEVER_DATA_DIR = f"{EVAL_DATA_DIR}/retriever"
EVAL_RAG_DATA_DIR       = f"{EVAL_DATA_DIR}/rag"

# Raw info files
PLAYLIST_INFO_PATH      = f"{RAW_INFO_DATA_DIR}/playlist_info.pkl"
VIDEO_PLAYLIST_MAP_PATH = f"{RAW_INFO_DATA_DIR}/video_playlist_map.pkl"

# Ground-truth questions used for retrieval evaluation
GROUND_TRUTH_PATH = f"{GROUND_TRUTH_DATA_DIR}/ground-truth-retrieval.csv"

# Optimization results, one JSON file per retriever
REST_OPT_ES_BM25_PATH       = f"{EVAL_RETRIEVER_DATA_DIR}/res-opt-es-bm25.json"
REST_OPT_ES_HYBRID_PATH     = f"{EVAL_RETRIEVER_DATA_DIR}/res-opt-es-hybrid.json"
REST_OPT_ES_HYBRID_RRF_PATH = f"{EVAL_RETRIEVER_DATA_DIR}/res-opt-es-hybrid-rrf.json"

# Warn when the raw data root is missing, then create every working directory.
# The ground-truth directories are not created here.
if not os.path.exists(RAW_DATA_DIR):
  print("Not exists dir: ", RAW_DATA_DIR)
for _required_dir in (
  RAW_DOCS_DATA_DIR,
  RAW_INFO_DATA_DIR,
  PROCESSED_DOCS_DATA_DIR,
  TEST_DATA_DIR,
  EVAL_DATA_DIR,
  EVAL_RETRIEVER_DATA_DIR,
  EVAL_RAG_DATA_DIR,
  BACKUPS_DATA_DIR,
):
  os.makedirs(_required_dir, exist_ok=True)

In [5]:
%load_ext autoreload
%autoreload 2
import os
import sys

# Add the project root to sys.path only if it is not already there,
# so re-running this cell does not append duplicates.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# The project package can now be imported
import cooking_recipe_assistant
from cooking_recipe_assistant.commons.utils import (
    read_pickle, 
    read_document, 
    read_text,
    save_pickle, 
    save_document,
    save_text
)

from cooking_recipe_assistant.evaluation.retrievers import evaluate
from cooking_recipe_assistant.evaluation.optimization import run_hyperopt
from cooking_recipe_assistant.rags.retrievers.es_bm25 import es_bm25_query
from cooking_recipe_assistant.rags.retrievers.es_hybrid import es_hybrid_query
from cooking_recipe_assistant.rags.retrievers.es_hybrid_rrf import es_hybrid_rrf_query

In [6]:
# Elasticsearch connection settings.
# Both values can be overridden through environment variables (for example when the
# cluster is not on localhost); without them the original defaults are used.
ES_URL = os.environ.get("ES_URL", "http://localhost:9200")
INDEX_NAME = os.environ.get("ES_INDEX_NAME", "cooking-recipes")

In [7]:
# Elasticsearch client for the cluster at ES_URL; passed to the retriever and index-check calls in this notebook.
ES_CLIENT = Elasticsearch(hosts=[ES_URL])

In [8]:
# Set before the embedding model is instantiated.
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Model name handed to HuggingFaceEmbeddings.
MINILM_EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
#NMNET_EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
# NOTE(review): BATCH_SIZE is not referenced in the visible part of this notebook.
BATCH_SIZE = 100

In [9]:
# Embedding model object passed to the hybrid retriever calls; built from the MiniLM model name.
EMBEDDINGS = HuggingFaceEmbeddings(model_name=MINILM_EMBEDDING_MODEL_NAME)
#EMBEDDINGS = HuggingFaceEmbeddings(model_name=NMNET_EMBEDDING_MODEL_NAME)

In [10]:
# Show the GPU, driver and memory state of the machine running this kernel.
!nvidia-smi

Sat Oct 26 21:28:52 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.107.02             Driver Version: 550.107.02     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1050        Off |   00000000:01:00.0 Off |                  N/A |
| N/A   46C    P0             N/A / ERR!  |     153MiB /   4096MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Check ElasticSearch

In [12]:
# Connectivity check: fetch the cluster metadata and pretty-print the response body.
info_es = ES_CLIENT.info()
print(json.dumps(info_es.body, indent=4))

{
    "name": "bd9dd9bd70f5",
    "cluster_name": "docker-cluster",
    "cluster_uuid": "wg43N1DqSdy9g9z_pOLIDQ",
    "version": {
        "number": "8.4.3",
        "build_flavor": "default",
        "build_type": "docker",
        "build_hash": "42f05b9372a9a4a470db3b52817899b99a76ee73",
        "build_date": "2022-10-04T07:17:24.662462378Z",
        "build_snapshot": false,
        "lucene_version": "9.3.0",
        "minimum_wire_compatibility_version": "7.17.0",
        "minimum_index_compatibility_version": "7.0.0"
    },
    "tagline": "You Know, for Search"
}


In [13]:
# Print the index definition (aliases, mappings, ...) when the index exists; print nothing otherwise.
# NOTE(review): the recorded mapping lists a float field `summart_vector` next to the
# dense_vector `summary_vector` — looks like a misspelt field name written at indexing time; confirm.
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    info_indice = ES_CLIENT.indices.get(index=INDEX_NAME)
    print(json.dumps(info_indice.body, indent=4))

{
    "cooking-recipes": {
        "aliases": {},
        "mappings": {
            "properties": {
                "chunk_number": {
                    "type": "integer"
                },
                "doc_id": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "id": {
                    "type": "text"
                },
                "ingredients": {
                    "type": "keyword"
                },
                "meals": {
                    "type": "keyword"
                },
                "summart_vector": {
                    "type": "float"
                },
                "summary": {
                    "type": "text"
                },
                "summary_vector": {
                    "type": "dense_vector",
        

In [14]:
# Print the index settings when the index exists; print nothing otherwise.
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    settings = ES_CLIENT.indices.get_settings(index=INDEX_NAME)
    print(json.dumps(settings.body, indent=4))

{
    "cooking-recipes": {
        "settings": {
            "index": {
                "routing": {
                    "allocation": {
                        "include": {
                            "_tier_preference": "data_content"
                        }
                    }
                },
                "number_of_shards": "1",
                "provided_name": "cooking-recipes",
                "creation_date": "1729698947472",
                "number_of_replicas": "0",
                "uuid": "xRPfDileSMin8UmdVwEoOQ",
                "version": {
                    "created": "8040399"
                }
            }
        }
    }
}


In [15]:
# Print the number of documents stored in the index when it exists.
if ES_CLIENT.indices.exists(index=INDEX_NAME):
    count = ES_CLIENT.count(index=INDEX_NAME)['count']
    print(count)

232


# Check Embedding

In [16]:
# Smoke-test the embedding model on one sentence: print the result's type and length,
# then its first ten values.
text = "LangChain is a framework for developing applications powered by language models."


embedding_vector = EMBEDDINGS.embed_query(text)
print(type(embedding_vector), len(embedding_vector))
print(embedding_vector[:10])

<class 'list'> 384
[-0.0330653041601181, -0.04929623752832413, 0.0011787894181907177, -0.052408911287784576, -0.03758711367845535, 0.025819668546319008, -0.03928522393107414, 0.05620158091187477, 0.09028893709182739, -0.052350003272295]


# Ground-truth

## Read Questions

In [117]:
# List up to five pickle files under RAW_DATA_DIR; shell errors are discarded.
# NOTE(review): the pickles loaded later are read from RAW_INFO_DATA_DIR — confirm which directory should be listed.
#!ls -lh "{RAW_DATA_DIR}" 2>null | grep json 2>null| head -5
!ls -lh '{RAW_DATA_DIR}' 2>/dev/null | grep pkl  2>/dev/null | head -5

-rw-rw-r-- 1 aztleclan aztleclan 2,2K oct 12 22:44 playlist_info.pkl
-rw-rw-r-- 1 aztleclan aztleclan  11K oct 12 22:44 video_playlist_map.pkl


In [118]:
!ls -lh '{PROCESSED_DATA_DIR}/documents' 2>/dev/null | grep json 2>/dev/null | head -5

-rw-rw-r-- 1 aztleclan aztleclan 3,3K oct 23 17:55 086AnjxzAfg.json
-rw-rw-r-- 1 aztleclan aztleclan 3,0K oct 23 17:55 0iZUayL1RQ0.json
-rw-rw-r-- 1 aztleclan aztleclan 3,1K oct 23 17:55 0X7I-vr2oaM.json
-rw-rw-r-- 1 aztleclan aztleclan 3,1K oct 23 17:55 1WAbPmolGqY.json
-rw-rw-r-- 1 aztleclan aztleclan 3,3K oct 23 17:55 2CxQTUGD-5E.json


In [119]:
# Confirm the ground-truth CSV exists and show its size and timestamp.
!ls -lh "{GROUND_TRUTH_PATH}"

-rw-rw-r-- 1 aztleclan aztleclan 169K oct 26 23:21 ../data/test/ground_truth/ground-truth-retrieval.csv


In [120]:
# Load the playlist metadata and the video/playlist mapping from their pickle files.
# NOTE(review): unpickling can execute code — only load files produced by this project.
playlist_info = read_pickle(PLAYLIST_INFO_PATH)
video_playlist_map = read_pickle(VIDEO_PLAYLIST_MAP_PATH)

In [121]:
# Tabulate the playlist metadata and preview the id and English title of the first five playlists.
playlist_info_df = pd.DataFrame.from_dict(playlist_info)
playlist_info_df[["playlist_id", "en_playlist_title"]].head(5)

Unnamed: 0,playlist_id,en_playlist_title
0,PLoyFGpU_IasVb0QvKn2IgbPZ_gKNGzP7Z,Stews and spooning
1,PLoyFGpU_IasWzT5nS-qF-gI2K5eJURRsV,Paste
2,PLoyFGpU_IasX1xkQfxcAw3alSibGj3Mg_,Rices
3,PLoyFGpU_IasXD955aZhg9aZ9_UPtGIrE4,Meats and poultry
4,PLoyFGpU_IasUmRDsq3r_M9rqZcrtH4im5,Christmas recipes


In [122]:
# Load the ground-truth questions used to evaluate the retrievers.
df_ground_truth = pd.read_csv(GROUND_TRUTH_PATH)

In [123]:
# Preview the first five ground-truth rows.
df_ground_truth.head(5)

Unnamed: 0,id,doc_id,chunk_number,number,question
0,erjXeb0Hscw@000,erjXeb0Hscw,0,1,How do I achieve crispy and golden bacon for m...
1,erjXeb0Hscw@000,erjXeb0Hscw,0,2,What is the recommended size of cheese cubes f...
2,erjXeb0Hscw@000,erjXeb0Hscw,0,3,Can I cook the spaghetti longer than eight min...
3,erjXeb0Hscw@000,erjXeb0Hscw,0,4,How do I enhance the creaminess of the cheese ...
4,erjXeb0Hscw@000,erjXeb0Hscw,0,5,Is my Creamy Spaghetti with Cheese and Bacon c...


## Divide Training and Test

In [124]:
# 80/20 split of the ground-truth questions, stratified by source document so every
# document contributes questions to both parts; the seed is fixed for repeatability.
doc_ids = df_ground_truth['doc_id'].to_numpy()
df_ground_truth_train, df_ground_truth_val = train_test_split(
    df_ground_truth, test_size=0.2, random_state=42, shuffle=True, stratify=doc_ids
)

In [125]:
# Report the size of the full dataset and of each split with its share of the total.
# Each line carries its own label so the train and validation rows can be told apart.
n_total = len(df_ground_truth)
print(f"Dataset: {n_total}")
print(f"Train  : {len(df_ground_truth_train)}({100*len(df_ground_truth_train)/n_total}%)")
print(f"Valid  : {len(df_ground_truth_val)}({100*len(df_ground_truth_val)/n_total}%)")

Dataset: 1160
Dataset: 928(80.0%)
Dataset: 232(20.0%)


# Text Search

## Test Query

In [126]:
# Pick one ground-truth row as a manual test query.
# NOTE(review): `doc_id` is filled from the `id` column (the value printed ends in "@000"),
# not from the `doc_id` column — the name and the printed label are misleading.
query_test = df_ground_truth.iloc[10].question
doc_id = df_ground_truth.iloc[10].id
print(f"Doc_id: {doc_id}, Query : {query_test}")

Doc_id: PSd2SN2rx6k@000, Query : How do I properly prepare the shrimp heads for frying in the Noodles with Gambones recipe?


In [127]:
# Run the BM25 retriever for the test query (no boosts passed).
res_es_bm25 = es_bm25_query(
    es_client=ES_CLIENT,
    index_name=INDEX_NAME,
    query=query_test,
)

In [128]:
# Show the query, the expected id and the number of hits, then each hit's
# document id, title and the first 1000 characters of its text.
separator = "=" * 100
print(f"Query : {query_test}")
print(f"Doc_id: {doc_id}")
print(f"Results: {len(res_es_bm25)}")
print(separator)
for r in res_es_bm25:
    print(f"doc_id: {r['doc_id']}")
    print(f"title : {r['title']}")
    print(f"text  : {r['text'][:1000]}")
    print(separator)

Query : How do I properly prepare the shrimp heads for frying in the Noodles with Gambones recipe?
Doc_id: PSd2SN2rx6k@000
Results: 5
doc_id: PSd2SN2rx6k
title : Noodles with Gambones
text  : heat oil in a pan and add the shrimp heads. fry them until they release their juices. strain the oil into a pot for the stew. in the same pot, add shrimp heads, fish bones, a halved tomato, water, onion, and noras. cook for 30 to 40 minutes. after 40 minutes, strain the broth and keep it warm. chop the red and green bell peppers, and set aside. remove the skin from the tomato and dice it. finely chop the onion and set it aside. in a mortar, mash three garlic cloves with parsley until it forms a paste. add red wine and saffron to the garlic mixture and reserve it. cut the gambones into two or three pieces and set them aside. in the pot with the reserved oil, add the chopped bell peppers and onion, cooking them over medium heat with a pinch of salt. once softened, add the diced tomato, salt, and a b

In [31]:
# NOTE(review): duplicate of the previous cell; its execution count and recorded output
# (a different query) come from an earlier run. Candidate for removal.
print(f"Query : {query_test}")
print(f"Doc_id: {doc_id}")
print(f"Results: {len(res_es_bm25)}")
print("="*100)
for r in res_es_bm25:
    print(f"doc_id: {r['doc_id']}")
    print(f"title : {r['title']}")
    print(f"text  : {r['text'][:1000]}")
    print("="*100)

Query : What steps should I take to prepare the 'Shrimp Cream Soup' recipe, specifically for the step where I blend everything together until smooth?
Doc_id: uB7o1ZfSW9Q@001
Results: 5
doc_id: uB7o1ZfSW9Q
title : Shrimp Cream Soup
text  : to medium - high for about 22 minutes until the cream reaches a delicious color. turn off the heat and blend everything together until smooth. for added richness, mix in a bit of heavy cream and blend again. taste and adjust the salt if necessary. serve hot, garnished with the reserved shrimp, and enjoy!
doc_id: uB7o1ZfSW9Q
title : Shrimp Cream Soup
text  : start by peeling the shrimp, setting aside some for decoration. heat a good amount of extra virgin olive oil in a pan and fry the shrimp reserved for decoration with a pinch of salt until cooked to your liking. remove and set aside. in the same pan, fry the remaining shrimp for the soup base with some salt to infuse the oil with flavor. lower the heat and add the leek, onion, and red bell pepper. c

## Evaluate

In [129]:
def question_text_bm25(q):
    """Search callback for `evaluate`: BM25 retrieval without boosts.

    Args:
        q: one ground-truth record; only its 'question' entry is read.

    Returns:
        Whatever `es_bm25_query` returns for that question, using the
        notebook-level ES_CLIENT and INDEX_NAME.
    """
    return es_bm25_query(ES_CLIENT, INDEX_NAME, q['question'])

In [130]:
%%time
ground_truth = df_ground_truth.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth)}")
eval_data_es_bm25 =  evaluate(
    ground_truth, 
    question_text_bm25)

Dataset size: 1160
CPU times: user 927 ms, sys: 47.4 ms, total: 974 ms
Wall time: 6.79 s


In [131]:
# Hit rate and MRR of un-boosted BM25 on the full ground-truth set.
print(f"[FULL-DATA] Text : {eval_data_es_bm25}")

[FULL-DATA] Text : {'hit_rate': 0.9146551724137931, 'mrr': 0.8774425287356341}


In [97]:
# NOTE(review): duplicate of the previous cell with an older execution count; candidate for removal.
print(f"[FULL-DATA] Text : {eval_data_es_bm25}")

[FULL-DATA] Text : {'hit_rate': 0.9146551724137931, 'mrr': 0.8774425287356338}


Train Evaluation

In [98]:
%%time
ground_truth_train = df_ground_truth_train.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_train)}")
eval_train_es_bm25 =  evaluate(
    ground_truth_train, 
    question_text_bm25)

Dataset size: 928
CPU times: user 761 ms, sys: 46.6 ms, total: 808 ms
Wall time: 5.77 s


In [99]:
# Hit rate and MRR of un-boosted BM25 on the training split.
print(f"[EVAL-TRAIN] Text : {eval_train_es_bm25}")

[EVAL-TRAIN] Text : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}


Test Evaluation

In [100]:
%%time
ground_truth_val = df_ground_truth_val.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_val)}")
eval_valid_es_bm25 =  evaluate(
    ground_truth_val, 
    question_text_bm25)

Dataset size: 232
CPU times: user 174 ms, sys: 27.8 ms, total: 202 ms
Wall time: 1.48 s


In [101]:
# Side-by-side baseline metrics for the training and validation splits.
print(f"[EVAL-TRAIN] Text : {eval_train_es_bm25}")
print(f"[EVAL-VALID] Text : {eval_valid_es_bm25}")

[EVAL-TRAIN] Text : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-VALID] Text : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}


## Optimize

Search optimization on the training dataset

In [102]:
%%time
# Search for the best BM25 field boosts on the training split (50 trials; the recorded run took about five minutes).
# The call yields a pair: the best boosts and the MRR they achieved.
# NOTE(review): no seed is passed here and the recorded runs in this notebook report different
# best boosts — results are presumably not reproducible run to run; confirm in run_hyperopt.
best_es_bm25_boosts, best_es_bm25_mrr = run_hyperopt(
    df=df_ground_truth_train, 
    es=ES_CLIENT, 
    index=INDEX_NAME,
    es_with_boost=es_bm25_query,
    max_evals=50)

100%|████████████████████████████████████████████████████████| 50/50 [05:03<00:00,  6.08s/trial, best loss: 0.10520833333333257]
Mejores parámetros encontrados:
{'ingredients': 2.591815470947524, 'meals': 0.8854500618094214, 'summary': 4.060097538845653, 'text': 3.7200158260107226, 'tips': 4.469334105172512, 'title': 2.7630515885634166, 'vector_boost': 0.47080059231565463}
Boosts optimizados:
{'meals': 0.8854500618094214, 'title': 2.7630515885634166, 'ingredients': 2.591815470947524, 'summary': 4.060097538845653, 'text': 3.7200158260107226, 'tips': 4.469334105172512}
El mejor valor de MRR es: 0.8947916666666674
CPU times: user 39 s, sys: 1.94 s, total: 41 s
Wall time: 5min 3s


In [103]:
# Best MRR found by the optimisation and the boosts that produced it.
# The metric is MRR (mean reciprocal rank); the label previously read "mmr".
print(f"[OPT] Text mrr  : {best_es_bm25_mrr}")
print(f"[OPT] Text Boost: {best_es_bm25_boosts}")

[OPT] Text mmr  : 0.8947916666666674
[OPT] Text Boost: {'meals': 0.8854500618094214, 'title': 2.7630515885634166, 'ingredients': 2.591815470947524, 'summary': 4.060097538845653, 'text': 3.7200158260107226, 'tips': 4.469334105172512}


Train Evaluation with boosting

In [104]:
def question_text_bm25_boosts(q):
    """Search callback for `evaluate`: BM25 retrieval with the optimised boosts.

    Args:
        q: one ground-truth record; only its 'question' entry is read.

    Returns:
        Whatever `es_bm25_query` returns for that question.

    Note:
        `best_es_bm25_boosts` is looked up when the function is called, so the
        boosts currently bound to that notebook variable are the ones applied.
    """
    return es_bm25_query(ES_CLIENT, INDEX_NAME, q['question'], best_es_bm25_boosts)

In [105]:
%%time
ground_truth_train = df_ground_truth_train.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_val)}")
eval_train_es_bm25_boosts = evaluate(
        ground_truth_train, 
        question_text_bm25_boosts)

Dataset size: 232
CPU times: user 694 ms, sys: 38.3 ms, total: 733 ms
Wall time: 5.39 s


In [106]:
# Training split: baseline BM25 versus boosted BM25.
print(f"[EVAL-TRAIN] Text      : {eval_train_es_bm25}")
print(f"[EVAL-TRAIN] Text Boost: {eval_train_es_bm25_boosts}")

[EVAL-TRAIN] Text      : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-TRAIN] Text Boost: {'hit_rate': 0.9094827586206896, 'mrr': 0.8947916666666674}


Test Evaluation with boosting

In [107]:
%%time
ground_truth_val = df_ground_truth_val.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_val)}")
eval_valid_es_bm25_boosts = evaluate(
        ground_truth_val, 
        question_text_bm25_boosts)

Dataset size: 232
CPU times: user 203 ms, sys: 8.15 ms, total: 211 ms
Wall time: 1.42 s


In [108]:
# Validation split: baseline BM25 versus boosted BM25.
print(f"[EVAL-VALID] Text      : {eval_valid_es_bm25}")
print(f"[EVAL-VALID] Text Boost: {eval_valid_es_bm25_boosts}")

[EVAL-VALID] Text      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}
[EVAL-VALID] Text Boost: {'hit_rate': 0.9267241379310345, 'mrr': 0.8870689655172416}


## Show Results

In [109]:
# Summary of the BM25 metrics: baseline and boosted, on both splits.
print(f"[EVAL-TRAIN] Text      : {eval_train_es_bm25}")
print(f"[EVAL-TRAIN] Text Boost: {eval_train_es_bm25_boosts}")
print(f"[EVAL-VALID] Text      : {eval_valid_es_bm25}")
print(f"[EVAL-VALID] Text Boost: {eval_valid_es_bm25_boosts}")

[EVAL-TRAIN] Text      : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-TRAIN] Text Boost: {'hit_rate': 0.9094827586206896, 'mrr': 0.8947916666666674}
[EVAL-VALID] Text      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}
[EVAL-VALID] Text Boost: {'hit_rate': 0.9267241379310345, 'mrr': 0.8870689655172416}


In [172]:
# NOTE(review): duplicate of the summary cell above; its execution count and recorded
# metrics come from a different run. Candidate for removal.
print(f"[EVAL-TRAIN] Text      : {eval_train_es_bm25}")
print(f"[EVAL-TRAIN] Text Boost: {eval_train_es_bm25_boosts}")
print(f"[EVAL-VALID] Text      : {eval_valid_es_bm25}")
print(f"[EVAL-VALID] Text Boost: {eval_valid_es_bm25_boosts}")

[EVAL-TRAIN] Text      : {'hit_rate': 0.8351293103448276, 'mrr': 0.7390804597701152}
[EVAL-TRAIN] Text Boost: {'hit_rate': 0.8599137931034483, 'mrr': 0.7617636494252882}
[EVAL-VALID] Text      : {'hit_rate': 0.8017241379310345, 'mrr': 0.714080459770115}
[EVAL-VALID] Text Boost: {'hit_rate': 0.8103448275862069, 'mrr': 0.727801724137931}


In [43]:
# NOTE(review): another duplicate of the summary cell; its execution count and recorded
# metrics come from a different run. Candidate for removal.
print(f"[EVAL-TRAIN] Text      : {eval_train_es_bm25}")
print(f"[EVAL-TRAIN] Text Boost: {eval_train_es_bm25_boosts}")
print(f"[EVAL-VALID] Text      : {eval_valid_es_bm25}")
print(f"[EVAL-VALID] Text Boost: {eval_valid_es_bm25_boosts}")

[EVAL-TRAIN] Text      : {'hit_rate': 0.8351293103448276, 'mrr': 0.7390804597701152}
[EVAL-TRAIN] Text Boost: {'hit_rate': 0.8588362068965517, 'mrr': 0.7581896551724147}
[EVAL-VALID] Text      : {'hit_rate': 0.8017241379310345, 'mrr': 0.714080459770115}
[EVAL-VALID] Text Boost: {'hit_rate': 0.8103448275862069, 'mrr': 0.7213362068965516}


## Save results

In [94]:
# Collect everything to persist for the BM25 retriever: the best boosts and their MRR,
# plus hit rate / MRR for the baseline and the boosted variant on both splits.
# NOTE(review): the recorded execution count of this cell is lower than that of the
# optimisation cell, and the recorded output shows different boosts than the latest
# optimisation output — re-run this cell before saving so the file matches the latest run.
results_opt_es_bm25 = {
    'method': 'es_bm25',
    'best_boosts': best_es_bm25_boosts,
    'best_mrr'   : best_es_bm25_mrr,
    # Base
    'base_train_hit_rate': eval_train_es_bm25['hit_rate'],
    'base_train_mrr'     : eval_train_es_bm25['mrr'],
    'base_valid_hit_rate': eval_valid_es_bm25['hit_rate'],
    'base_valid_mrr'     : eval_valid_es_bm25['mrr'],
    # Boost
    'boost_train_hit_rate' : eval_train_es_bm25_boosts['hit_rate'],
    'boost_train_mrr'      : eval_train_es_bm25_boosts['mrr'],
    'boost_valid_hit_rate' : eval_valid_es_bm25_boosts['hit_rate'],
    'boost_valid_mrr'      : eval_valid_es_bm25_boosts['mrr'],
}
results_opt_es_bm25

{'method': 'es_bm25',
 'best_boosts': {'meals': 3.3780065729545834,
  'title': 2.350519748557916,
  'ingredients': 1.8480562377707013,
  'summary': 4.142904195381872,
  'text': 3.8827414870597465,
  'tips': 3.2750400070515573},
 'best_mrr': 0.8933369252873572,
 'base_train_hit_rate': 0.9137931034482759,
 'base_train_mrr': 0.8790050287356329,
 'base_valid_hit_rate': 0.9181034482758621,
 'base_valid_mrr': 0.8711925287356324,
 'boost_train_hit_rate': 0.9137931034482759,
 'boost_train_mrr': 0.8933369252873572,
 'boost_valid_hit_rate': 0.9310344827586207,
 'boost_valid_mrr': 0.8790229885057472}

In [110]:
# Persist the BM25 optimisation results to REST_OPT_ES_BM25_PATH.
save_document(REST_OPT_ES_BM25_PATH, results_opt_es_bm25)

In [111]:
# List the saved result files of the retriever evaluations.
!ls -lh "{EVAL_RETRIEVER_DATA_DIR}"

total 12K
-rw-rw-r-- 1 aztleclan aztleclan 676 oct 26 23:00 res-opt-es-bm25.json
-rw-rw-r-- 1 aztleclan aztleclan 723 oct 26 21:54 res-opt-es-hybrid.json
-rw-rw-r-- 1 aztleclan aztleclan 724 oct 26 22:20 res-opt-es-hybrid-rrf.json


In [112]:
# Read the persisted BM25 results back and display them (round-trip check).
results_opt_es_bm25 = read_document(REST_OPT_ES_BM25_PATH)
results_opt_es_bm25

{'method': 'es_bm25',
 'best_boosts': {'meals': 3.3780065729545834,
  'title': 2.350519748557916,
  'ingredients': 1.8480562377707013,
  'summary': 4.142904195381872,
  'text': 3.8827414870597465,
  'tips': 3.2750400070515573},
 'best_mrr': 0.8933369252873572,
 'base_train_hit_rate': 0.9137931034482759,
 'base_train_mrr': 0.8790050287356329,
 'base_valid_hit_rate': 0.9181034482758621,
 'base_valid_mrr': 0.8711925287356324,
 'boost_train_hit_rate': 0.9137931034482759,
 'boost_train_mrr': 0.8933369252873572,
 'boost_valid_hit_rate': 0.9310344827586207,
 'boost_valid_mrr': 0.8790229885057472}

In [93]:
# Restore the optimisation outputs from the persisted results instead of re-running the search.
# NOTE(review): this overwrites the in-memory best boosts and boosted metrics with the
# values stored in the file — make sure the file is the one you intend to use.
best_es_bm25_boosts = results_opt_es_bm25['best_boosts']
best_es_bm25_mrr = results_opt_es_bm25['best_mrr']
eval_train_es_bm25_boosts = {    
    'hit_rate': results_opt_es_bm25['boost_train_hit_rate'],
    'mrr': results_opt_es_bm25['boost_train_mrr'],
}
eval_valid_es_bm25_boosts = {
    'hit_rate': results_opt_es_bm25['boost_valid_hit_rate'], 
    'mrr': results_opt_es_bm25['boost_valid_mrr'],
}

# Search Hybrid

https://www.elastic.co/search-labs/blog/elasticsearch-knn-and-num-candidates-strategies

## Test Query

In [20]:
# Pick another ground-truth row as the manual test query for the hybrid retriever.
# NOTE(review): `doc_id` is filled from the `id` column (the value printed ends in "@000").
query_test = df_ground_truth.iloc[50].question
doc_id = df_ground_truth.iloc[50].id
print(f"Doc_id: {doc_id}, Query : {query_test}")
# Vector field handed to the hybrid queries; the evaluation callbacks below read this
# notebook-level name, so this cell must run before them.
VECTOR_FIELD='text_vector'

Doc_id: 0X7I-vr2oaM@000, Query : How do I prepare the Cold Canapés with Sliced Bread, considering it needs to be refrigerated for at least 4 hours or overnight?


In [21]:
# Run the hybrid retriever for the test query (no boosts passed).
res_es_hybrid = es_hybrid_query(
    es_client=ES_CLIENT,
    index_name=INDEX_NAME,
    query=query_test,
    embeddings=EMBEDDINGS,
    field=VECTOR_FIELD, 
)

In [22]:
# Show the query, the expected id and the number of hits, then each hit's
# document id, title and the first 1000 characters of its text.
separator = "=" * 100
print(f"Query : {query_test}")
print(f"Doc_id: {doc_id}")
print(f"Results: {len(res_es_hybrid)}")
print(separator)
for r in res_es_hybrid:
    print(f"doc_id: {r['doc_id']}")
    print(f"title : {r['title']}")
    print(f"text  : {r['text'][:1000]}")
    print(separator)

Query : How do I prepare the Cold Canapés with Sliced Bread, considering it needs to be refrigerated for at least 4 hours or overnight?
Doc_id: 0X7I-vr2oaM@000
Results: 5
doc_id: 0X7I-vr2oaM
title : Cold Canapés with Sliced Bread
text  : start by spreading a generous layer of pate on one slice of bread, then add slices of serrano ham, ensuring they fit within the bread ' s edges. place a second slice of bread on top, align the edges, cover with plastic wrap, and refrigerate for at least 4 hours, preferably overnight. for the second canape, spread cream cheese on two slices of bread, layer with sweet ham and optional cheddar slices, then cover with another slice of bread and refrigerate for 4 hours or overnight. for the third canape, use three slices of bread and spread cream cheese on each. add smoked salmon between the layers, cover, wrap in plastic, and refrigerate for 4 to 5 hours or overnight. for the last canape, finely chop canned mussels and mix with 100 ml of cream cheese until

## Evaluate

Train Evaluation

In [23]:
def question_text_hybrid(q):
    """Search callback for `evaluate`: hybrid retrieval without boosts.

    Args:
        q: one ground-truth record; only its 'question' entry is read.

    Returns:
        Whatever `es_hybrid_query` returns for that question, using the
        notebook-level ES_CLIENT, INDEX_NAME, EMBEDDINGS and VECTOR_FIELD.
    """
    return es_hybrid_query(ES_CLIENT, INDEX_NAME, q['question'], EMBEDDINGS, VECTOR_FIELD)

In [24]:
%%time
ground_truth_train = df_ground_truth_train.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_train)}")
eval_train_es_hybrid = evaluate(
    ground_truth_train, 
    question_text_hybrid
)

Dataset size: 928
CPU times: user 7.07 s, sys: 82.5 ms, total: 7.16 s
Wall time: 15.1 s


In [25]:
# Hit rate and MRR of the un-boosted hybrid retriever on the training split.
print(f"[EVAL-TRAIN] Hybrid : {eval_train_es_hybrid}")

[EVAL-TRAIN] Hybrid : {'hit_rate': 0.9181034482758621, 'mrr': 0.8942887931034492}


Test Evaluation

In [26]:
%%time
ground_truth_val = df_ground_truth_val.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_val)}")
eval_valid_es_hybrid = evaluate(
    ground_truth_val, 
    question_text_hybrid)

Dataset size: 232
CPU times: user 1.65 s, sys: 10.5 ms, total: 1.66 s
Wall time: 3.2 s


In [27]:
# Side-by-side hybrid metrics for the training and validation splits.
print(f"[EVAL-TRAIN] Hybrid : {eval_train_es_hybrid}")
print(f"[EVAL-VALID] Hybrid : {eval_valid_es_hybrid}")

[EVAL-TRAIN] Hybrid : {'hit_rate': 0.9181034482758621, 'mrr': 0.8942887931034492}
[EVAL-VALID] Hybrid : {'hit_rate': 0.9310344827586207, 'mrr': 0.8894396551724137}


Show results: Text + Hybrid

In [34]:
# Compare the un-boosted BM25 ("TEXT") and hybrid retrievers on both splits,
# each group under its own banner.
rule = "=" * 90
print(rule)
print("= TEXT")
print(rule)
print(f"[EVAL-TRAIN] Text   : {eval_train_es_bm25}")
print(f"[EVAL-VALID] Text   : {eval_valid_es_bm25}")
print(rule)
print("= HYBRID")
print(rule)
print(f"[EVAL-TRAIN] Hybrid : {eval_train_es_hybrid}")
print(f"[EVAL-VALID] Hybrid : {eval_valid_es_hybrid}")

= TEXT
[EVAL-TRAIN] Text   : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-VALID] Text   : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}
= HYBRID
[EVAL-TRAIN] Hybrid : {'hit_rate': 0.9181034482758621, 'mrr': 0.8942887931034492}
[EVAL-VALID] Hybrid : {'hit_rate': 0.9310344827586207, 'mrr': 0.8894396551724137}


## Optimize

Search optimization on the training dataset

In [38]:
%%time
best_es_hybrid_boosts, best_es_hybrid_mrr = run_hyperopt(
    df=df_ground_truth_train, 
    es=ES_CLIENT, 
    index=INDEX_NAME,
    es_with_boost=es_hybrid_query,
    embeddings=EMBEDDINGS,
    max_evals=50)

100%|████████████████████████████████████████████████████████| 50/50 [09:45<00:00, 11.70s/trial, best loss: 0.08053160919540181]
Mejores parámetros encontrados:
{'ingredients': 1.5107278257070114, 'meals': 1.3267880385949111, 'summary': 3.3330549166389503, 'text': 2.9015143483052235, 'tips': 2.619409172670707, 'title': 1.8744388428292256, 'vector_boost': 0.8992024772398135}
Boosts optimizados:
{'meals': 1.3267880385949111, 'title': 1.8744388428292256, 'ingredients': 1.5107278257070114, 'summary': 3.3330549166389503, 'text': 2.9015143483052235, 'tips': 2.619409172670707, 'vector_boost': 0.8992024772398135}
El mejor valor de MRR es: 0.9194683908045982
CPU times: user 5min 21s, sys: 2.87 s, total: 5min 24s
Wall time: 9min 45s


In [39]:
# Best MRR found by the optimisation and the boosts that produced it.
# The metric is MRR (mean reciprocal rank); the label previously read "mmr".
print(f"[OPT] Hybrid mrr  : {best_es_hybrid_mrr}")
print(f"[OPT] Hybrid Boost: {best_es_hybrid_boosts}")

[OPT] Hybrid mmr  : 0.9194683908045982
[OPT] Hybrid Boost: {'meals': 1.3267880385949111, 'title': 1.8744388428292256, 'ingredients': 1.5107278257070114, 'summary': 3.3330549166389503, 'text': 2.9015143483052235, 'tips': 2.619409172670707, 'vector_boost': 0.8992024772398135}


In [64]:
# Best MRR found by the optimisation and the boosts that produced it.
# The metric is MRR (mean reciprocal rank); the label previously read "mmr".
# NOTE(review): this cell repeats the previous one and carries output from an earlier run; candidate for removal.
print(f"[OPT] Hybrid mrr  : {best_es_hybrid_mrr}")
print(f"[OPT] Hybrid Boost: {best_es_hybrid_boosts}")

[OPT] Hybrid mmr  : 0.8987607758620697
[OPT] Hybrid Boost: {'meals': 2.460981535145023, 'title': 2.382239923997698, 'ingredients': 0.053659830493543836, 'summary': 3.4243613112336804, 'text': 3.2992663636996666, 'tips': 2.749609181985322}


Train Evaluation with Hybrid + Boosting

In [40]:
def question_text_hybrid_boosts(q):
    """Run the boosted hybrid search for one ground-truth record.

    Reads ``q['question']`` and forwards it to ``es_hybrid_query`` together with
    the notebook-level client, index name, embeddings, vector field and the
    boosts currently held in ``best_es_hybrid_boosts`` (looked up at call time).

    Returns whatever ``es_hybrid_query`` returns.
    """
    text = q['question']
    search_args = (
        ES_CLIENT,
        INDEX_NAME,
        text,
        EMBEDDINGS,
        VECTOR_FIELD,
        best_es_hybrid_boosts,
    )
    return es_hybrid_query(*search_args)

In [41]:
%%time
ground_truth_train = df_ground_truth_train.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_train)}")
eval_train_es_hybrid_boosts = evaluate(
    ground_truth_train, 
    question_text_hybrid_boosts
)

Dataset size: 928
CPU times: user 6.62 s, sys: 47.7 ms, total: 6.67 s
Wall time: 12 s


In [42]:
# Train split: baseline hybrid search next to the boosted variant.
print(
    f"[EVAL-TRAIN] Hybrid      : {eval_train_es_hybrid}",
    f"[EVAL-TRAIN] Hybrid Boost: {eval_train_es_hybrid_boosts}",
    sep="\n",
)

[EVAL-TRAIN] Hybrid      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8942887931034492}
[EVAL-TRAIN] Hybrid Boost: {'hit_rate': 0.9224137931034483, 'mrr': 0.9194683908045982}


In [67]:
# NOTE(review): repeats the previous cell; the recorded output belongs to a
# different execution (count 67) and differs from the current results.
print(
    f"[EVAL-TRAIN] Hybrid      : {eval_train_es_hybrid}",
    f"[EVAL-TRAIN] Hybrid Boost: {eval_train_es_hybrid_boosts}",
    sep="\n",
)

[EVAL-TRAIN] Hybrid      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8963182471264378}
[EVAL-TRAIN] Hybrid Boost: {'hit_rate': 0.9137931034482759, 'mrr': 0.8987607758620697}


Test Evaluation with Hybrid + Boosting

In [43]:
%%time
ground_truth_val = df_ground_truth_val.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_val)}")
eval_valid_es_hybrid_boosts = evaluate(
    ground_truth_val, 
    question_text_hybrid_boosts
)

Dataset size: 232
CPU times: user 1.74 s, sys: 28.3 ms, total: 1.77 s
Wall time: 3.13 s


In [44]:
# Validation split: baseline hybrid search next to the boosted variant.
# These are the eval_valid_* metrics, so the tag is [EVAL-VALID]
# (it previously read [EVAL-TRAIN]).
print(f"[EVAL-VALID] Hybrid      : {eval_valid_es_hybrid}")
print(f"[EVAL-VALID] Hybrid Boost: {eval_valid_es_hybrid_boosts}")

[EVAL-TRAIN] Hybrid      : {'hit_rate': 0.9310344827586207, 'mrr': 0.8894396551724137}
[EVAL-TRAIN] Hybrid Boost: {'hit_rate': 0.9396551724137931, 'mrr': 0.9077586206896553}


In [69]:
# NOTE(review): repeats the previous cell; the recorded output belongs to a
# different execution (count 69) and differs from the current results.
# These are the eval_valid_* metrics, so the tag is [EVAL-VALID]
# (it previously read [EVAL-TRAIN]).
print(f"[EVAL-VALID] Hybrid      : {eval_valid_es_hybrid}")
print(f"[EVAL-VALID] Hybrid Boost: {eval_valid_es_hybrid_boosts}")

[EVAL-TRAIN] Hybrid      : {'hit_rate': 0.9267241379310345, 'mrr': 0.8914511494252874}
[EVAL-TRAIN] Hybrid Boost: {'hit_rate': 0.9267241379310345, 'mrr': 0.8833333333333336}


Show Results: Text + Hybrid + Boosting

In [52]:
# Side-by-side summary: text (BM25) and hybrid search, each with and without boosts.
print(
    "="*90,
    "= TEXT",
    "="*90,
    f"[EVAL-TRAIN] Text        : {eval_train_es_bm25}",
    f"[EVAL-TRAIN] Text Boost  : {eval_train_es_bm25_boosts}",
    f"[EVAL-VALID] Text        : {eval_valid_es_bm25}",
    f"[EVAL-VALID] Text Boost  : {eval_valid_es_bm25_boosts}",
    "="*90,
    "= HYBRID",
    "="*90,
    f"[EVAL-TRAIN] Hybrid      : {eval_train_es_hybrid}",
    f"[EVAL-TRAIN] Hybrid Boost: {eval_train_es_hybrid_boosts}",
    f"[EVAL-VALID] Hybrid      : {eval_valid_es_hybrid}",
    f"[EVAL-VALID] Hybrid Boost: {eval_valid_es_hybrid_boosts}",
    sep="\n",
)

= TEXT
[EVAL-TRAIN] Text        : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-TRAIN] Text Boost  : {'hit_rate': 0.9137931034482759, 'mrr': 0.8933369252873572}
[EVAL-VALID] Text        : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}
[EVAL-VALID] Text Boost  : {'hit_rate': 0.9310344827586207, 'mrr': 0.8790229885057472}
= HYBRID
[EVAL-TRAIN] Hybrid      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8942887931034492}
[EVAL-TRAIN] Hybrid Boost: {'hit_rate': 0.9224137931034483, 'mrr': 0.9194683908045982}
[EVAL-VALID] Hybrid      : {'hit_rate': 0.9310344827586207, 'mrr': 0.8894396551724137}
[EVAL-VALID] Hybrid Boost: {'hit_rate': 0.9396551724137931, 'mrr': 0.9077586206896553}


In [71]:
# NOTE(review): repeats the previous summary cell; the recorded output belongs
# to a different execution (count 71) and differs from the current results.
print(
    "="*90,
    "= TEXT",
    "="*90,
    f"[EVAL-TRAIN] Text        : {eval_train_es_bm25}",
    f"[EVAL-TRAIN] Text Boost  : {eval_train_es_bm25_boosts}",
    f"[EVAL-VALID] Text        : {eval_valid_es_bm25}",
    f"[EVAL-VALID] Text Boost  : {eval_valid_es_bm25_boosts}",
    "="*90,
    "= HYBRID",
    "="*90,
    f"[EVAL-TRAIN] Hybrid      : {eval_train_es_hybrid}",
    f"[EVAL-TRAIN] Hybrid Boost: {eval_train_es_hybrid_boosts}",
    f"[EVAL-VALID] Hybrid      : {eval_valid_es_hybrid}",
    f"[EVAL-VALID] Hybrid Boost: {eval_valid_es_hybrid_boosts}",
    sep="\n",
)

= TEXT
[EVAL-TRAIN] Text        : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-TRAIN] Text Boost  : {'hit_rate': 0.9137931034482759, 'mrr': 0.8933369252873572}
[EVAL-VALID] Text        : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}
[EVAL-VALID] Text Boost  : {'hit_rate': 0.9310344827586207, 'mrr': 0.8790229885057472}
= HYBRID
[EVAL-TRAIN] Hybrid      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8963182471264378}
[EVAL-TRAIN] Hybrid Boost: {'hit_rate': 0.9137931034482759, 'mrr': 0.8987607758620697}
[EVAL-VALID] Hybrid      : {'hit_rate': 0.9267241379310345, 'mrr': 0.8914511494252874}
[EVAL-VALID] Hybrid Boost: {'hit_rate': 0.9267241379310345, 'mrr': 0.8833333333333336}


## Save results

In [53]:
# Gather the tuned boosts plus baseline and boosted hybrid metrics (train and
# validation) into one record so it can be saved and compared later.
results_opt_es_hybrid = dict(
    method='es_hybrid',
    best_boosts=best_es_hybrid_boosts,
    best_mrr=best_es_hybrid_mrr,
    # Base
    base_train_hit_rate=eval_train_es_hybrid['hit_rate'],
    base_train_mrr=eval_train_es_hybrid['mrr'],
    base_valid_hit_rate=eval_valid_es_hybrid['hit_rate'],
    base_valid_mrr=eval_valid_es_hybrid['mrr'],
    # Boost
    boost_train_hit_rate=eval_train_es_hybrid_boosts['hit_rate'],
    boost_train_mrr=eval_train_es_hybrid_boosts['mrr'],
    boost_valid_hit_rate=eval_valid_es_hybrid_boosts['hit_rate'],
    boost_valid_mrr=eval_valid_es_hybrid_boosts['mrr'],
)
results_opt_es_hybrid

{'method': 'es_hybrid',
 'best_boosts': {'meals': 1.3267880385949111,
  'title': 1.8744388428292256,
  'ingredients': 1.5107278257070114,
  'summary': 3.3330549166389503,
  'text': 2.9015143483052235,
  'tips': 2.619409172670707,
  'vector_boost': 0.8992024772398135},
 'best_mrr': 0.9194683908045982,
 'base_train_hit_rate': 0.9181034482758621,
 'base_train_mrr': 0.8942887931034492,
 'base_valid_hit_rate': 0.9310344827586207,
 'base_valid_mrr': 0.8894396551724137,
 'boost_train_hit_rate': 0.9224137931034483,
 'boost_train_mrr': 0.9194683908045982,
 'boost_valid_hit_rate': 0.9396551724137931,
 'boost_valid_mrr': 0.9077586206896553}

In [54]:
# Persist the hybrid optimisation record to REST_OPT_ES_HYBRID_PATH.
save_document(REST_OPT_ES_HYBRID_PATH, results_opt_es_hybrid)

In [55]:
# List the retriever-evaluation folder to check which result files exist.
# NOTE(review): the recorded output includes the local user name.
!ls -lh "{EVAL_RETRIEVER_DATA_DIR}"

total 12K
-rw-rw-r-- 1 aztleclan aztleclan 676 oct 26 01:53 res-opt-es-bm25.json
-rw-rw-r-- 1 aztleclan aztleclan 723 oct 26 21:54 res-opt-es-hybrid.json
-rw-rw-r-- 1 aztleclan aztleclan 682 oct 26 02:23 res-opt-es-hybrid-rrf.json


In [56]:
# Read the record back from the same path used for saving and display it.
results_opt_es_hybrid = read_document(REST_OPT_ES_HYBRID_PATH)
results_opt_es_hybrid

{'method': 'es_hybrid',
 'best_boosts': {'meals': 1.3267880385949111,
  'title': 1.8744388428292256,
  'ingredients': 1.5107278257070114,
  'summary': 3.3330549166389503,
  'text': 2.9015143483052235,
  'tips': 2.619409172670707,
  'vector_boost': 0.8992024772398135},
 'best_mrr': 0.9194683908045982,
 'base_train_hit_rate': 0.9181034482758621,
 'base_train_mrr': 0.8942887931034492,
 'base_valid_hit_rate': 0.9310344827586207,
 'base_valid_mrr': 0.8894396551724137,
 'boost_train_hit_rate': 0.9224137931034483,
 'boost_train_mrr': 0.9194683908045982,
 'boost_valid_hit_rate': 0.9396551724137931,
 'boost_valid_mrr': 0.9077586206896553}

# Reranking

## Test Query

In [57]:
# Take one ground-truth record (row 20) as a single test query for the RRF search.
query_test = df_ground_truth.iloc[20].question
doc_id = df_ground_truth.iloc[20].id
print(f"Doc_id: {doc_id}, Query : {query_test}")
# NOTE(review): VECTOR_FIELD is already read by question_text_hybrid_boosts in an
# earlier cell, so it is presumably defined before this point. Confirm this
# re-assignment does not change which vector field the later cells search.
VECTOR_FIELD='text_vector'

Doc_id: svBQ246217s@000, Query : What type of oil should I use to fry the chicken wings in the Delicious Mixed Rice recipe?


In [58]:
# Single-query check of the RRF hybrid search before running the full evaluation.
res_es_hybrid_rff = es_hybrid_rrf_query(
    es_client=ES_CLIENT,
    index_name=INDEX_NAME,
    query=query_test,
    embeddings=EMBEDDINGS,
    field=VECTOR_FIELD,
)

In [59]:
# Print the test query, the expected document id and each returned hit
# (text truncated to the first 1000 characters).
print(
    f"Query : {query_test}",
    f"Doc_id: {doc_id}",
    f"Results: {len(res_es_hybrid_rff)}",
    "="*100,
    sep="\n",
)
for r in res_es_hybrid_rff:
    print(
        f"doc_id: {r['doc_id']}",
        f"title : {r['title']}",
        f"text  : {r['text'][:1000]}",
        "="*100,
        sep="\n",
    )

Query : What type of oil should I use to fry the chicken wings in the Delicious Mixed Rice recipe?
Doc_id: svBQ246217s@000
Results: 5
doc_id: svBQ246217s
title : Delicious Mixed Rice
text  : in a large paella pan, heat a generous amount of oil and fry the chicken wings, seasoning with salt until marked, then set aside. add chopped onion, red and green bell peppers to the pan and saute over medium heat until softened, then add minced garlic, being careful not to burn it. once the vegetables are cooked, incorporate the dried mushrooms and saute until they are well - fried before adding the fresh tomato. after a few minutes, return the chicken to the pan and mix all ingredients thoroughly, then pour in a glass of cider to deglaze. allow everything to cook together for a few minutes before adding the rice and stirring for one minute, then add food coloring for color. pour in hot broth and turn on the heat ; once it starts to boil, increase the heat for one minute and then reduce it. add th

## Evaluate

Train Evaluation with Hybrid RRF

In [60]:
def question_text_hybrid_rrf(q):
    """Run the un-boosted RRF hybrid search for one ground-truth record.

    Reads ``q['question']`` and forwards it to ``es_hybrid_rrf_query`` together
    with the notebook-level client, index name, embeddings and vector field.
    No boosts are passed, so the query function's own defaults apply.

    Returns whatever ``es_hybrid_rrf_query`` returns.
    """
    text = q['question']
    search_args = (ES_CLIENT, INDEX_NAME, text, EMBEDDINGS, VECTOR_FIELD)
    return es_hybrid_rrf_query(*search_args)

In [61]:
%%time
ground_truth_train = df_ground_truth_train.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_train)}")
eval_train_es_hybrid_rrf = evaluate(
    ground_truth_train, 
    question_text_hybrid_rrf)

Dataset size: 928
CPU times: user 16 s, sys: 271 ms, total: 16.3 s
Wall time: 27.7 s


In [62]:
# Train metrics for the RRF hybrid search (label typo "rff" corrected to "rrf").
print(f"[EVAL-TRAIN] Hybrid-rrf : {eval_train_es_hybrid_rrf}")

[EVAL-TRAIN] Hybrid-rff : {'hit_rate': 0.9181034482758621, 'mrr': 0.8894576149425297}


Test Evaluation with Hybrid RRF

In [63]:
%%time
ground_truth_val = df_ground_truth_val.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_val)}")
eval_valid_es_hybrid_rrf = evaluate(
    ground_truth_val, 
    question_text_hybrid_rrf)

Dataset size: 232
CPU times: user 3.96 s, sys: 55.1 ms, total: 4.01 s
Wall time: 6.73 s


In [64]:
# Train and validation metrics for the RRF hybrid search
# (label typo "rff" corrected to "rrf").
print(f"[EVAL-TRAIN] Hybrid-rrf : {eval_train_es_hybrid_rrf}")
print(f"[EVAL-VALID] Hybrid-rrf : {eval_valid_es_hybrid_rrf}")

[EVAL-TRAIN] Hybrid-rff : {'hit_rate': 0.9181034482758621, 'mrr': 0.8894576149425297}
[EVAL-VALID] Hybrid-rff : {'hit_rate': 0.9310344827586207, 'mrr': 0.8849137931034482}


Show results: Text + Hybrid + Hybrid RRF

In [65]:
# Baseline comparison (no boosts): text (BM25), hybrid and hybrid RRF.
print(
    "="*90,
    "= TEXT",
    "="*90,
    f"[EVAL-TRAIN] Text       : {eval_train_es_bm25}",
    f"[EVAL-VALID] Text       : {eval_valid_es_bm25}",
    "="*90,
    "= HYBRID",
    "="*90,
    f"[EVAL-TRAIN] Hybrid     : {eval_train_es_hybrid}",
    f"[EVAL-VALID] Hybrid     : {eval_valid_es_hybrid}",
    "="*90,
    "= HYBRID RRF",
    "="*90,
    f"[EVAL-TRAIN] Hybrid-rrf : {eval_train_es_hybrid_rrf}",
    f"[EVAL-VALID] Hybrid-rrf : {eval_valid_es_hybrid_rrf}",
    sep="\n",
)

= TEXT
[EVAL-TRAIN] Text       : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-VALID] Text       : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}
= HYBRID
[EVAL-TRAIN] Hybrid     : {'hit_rate': 0.9181034482758621, 'mrr': 0.8942887931034492}
[EVAL-VALID] Hybrid     : {'hit_rate': 0.9310344827586207, 'mrr': 0.8894396551724137}
= HYBRID RRF
[EVAL-TRAIN] Hybrid-rrf : {'hit_rate': 0.9181034482758621, 'mrr': 0.8894576149425297}
[EVAL-VALID] Hybrid-rrf : {'hit_rate': 0.9310344827586207, 'mrr': 0.8849137931034482}


## Optimize

Search optimization on the train dataset

In [66]:
%%time
best_es_hybrid_rrf_boosts, best_es_hybrid_rff_mrr = run_hyperopt(
    df=df_ground_truth_train, 
    es=ES_CLIENT, 
    index=INDEX_NAME,
    es_with_boost=es_hybrid_rrf_query,
    embeddings=EMBEDDINGS,
    max_evals=50)

100%|████████████████████████████████████████████████████████| 50/50 [21:37<00:00, 25.95s/trial, best loss: 0.09163074712643593]        
Mejores parámetros encontrados:
{'ingredients': 4.882772492268252, 'meals': 1.4671273591308425, 'summary': 4.882702537681318, 'text': 4.368904374010877, 'tips': 1.2126808445885127, 'title': 1.47987055694058, 'vector_boost': 0.10557689395629086}
Boosts optimizados:
{'meals': 1.4671273591308425, 'title': 1.47987055694058, 'ingredients': 4.882772492268252, 'summary': 4.882702537681318, 'text': 4.368904374010877, 'tips': 1.2126808445885127, 'vector_boost': 0.10557689395629086}
El mejor valor de MRR es: 0.9083692528735641
CPU times: user 12min 43s, sys: 13.2 s, total: 12min 56s
Wall time: 21min 37s


In [67]:
# Show the best MRR (mean reciprocal rank) found for the RRF hybrid search and
# the boosts that produced it. The label previously read "mmr", which is a typo.
print(f"[OPT] Hybrid-rrf mrr  : {best_es_hybrid_rff_mrr}")
print(f"[OPT] Hybrid-rrf boost: {best_es_hybrid_rrf_boosts}")

[OPT] Hybrid-rrf mmr  : 0.9083692528735641
[OPT] Hybrid-rrf boost: {'meals': 1.4671273591308425, 'title': 1.47987055694058, 'ingredients': 4.882772492268252, 'summary': 4.882702537681318, 'text': 4.368904374010877, 'tips': 1.2126808445885127, 'vector_boost': 0.10557689395629086}


In [95]:
# NOTE(review): repeats the previous cell; the recorded output belongs to a
# different execution (count 95) and differs from the current results.
# The label previously read "mmr", a typo for MRR (mean reciprocal rank).
print(f"[OPT] Hybrid-rrf mrr  : {best_es_hybrid_rff_mrr}")
print(f"[OPT] Hybrid-rrf boost: {best_es_hybrid_rrf_boosts}")

[OPT] Hybrid-rrf mmr  : 0.8948096264367823
[OPT] Hybrid-rrf boost: {'meals': 3.3780065729545834, 'title': 2.350519748557916, 'ingredients': 1.8480562377707013, 'summary': 4.142904195381872, 'text': 3.8827414870597465, 'tips': 3.2750400070515573}


Train Evaluation with Hybrid RRF + Boosting

In [68]:
def question_text_hybrid_rrf_boosts(q):
    """Run the boosted RRF hybrid search for one ground-truth record.

    Reads ``q['question']`` and forwards it to ``es_hybrid_rrf_query`` together
    with the notebook-level client, index name, embeddings, vector field and the
    boosts currently held in ``best_es_hybrid_rrf_boosts`` (looked up at call
    time).

    Returns whatever ``es_hybrid_rrf_query`` returns.
    """
    text = q['question']
    search_args = (
        ES_CLIENT,
        INDEX_NAME,
        text,
        EMBEDDINGS,
        VECTOR_FIELD,
        best_es_hybrid_rrf_boosts,
    )
    return es_hybrid_rrf_query(*search_args)

In [69]:
%%time
ground_truth_train = df_ground_truth_train.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_train)}")
eval_train_es_hybrid_rrf_boosts = evaluate(
    ground_truth_train, 
    question_text_hybrid_rrf_boosts
)

Dataset size: 928
CPU times: user 15.2 s, sys: 289 ms, total: 15.5 s
Wall time: 26 s


In [70]:
# Train split: baseline RRF hybrid search next to the boosted variant
# (label typo "rff" corrected to "rrf").
print(f"[EVAL-TRAIN] Hybrid-rrf      : {eval_train_es_hybrid_rrf}")
print(f"[EVAL-TRAIN] Hybrid-rrf Boost: {eval_train_es_hybrid_rrf_boosts}")

[EVAL-TRAIN] Hybrid-rrf      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8894576149425297}
[EVAL-TRAIN] Hybrid-rff Boost: {'hit_rate': 0.9224137931034483, 'mrr': 0.9083692528735641}


In [98]:
# NOTE(review): repeats the previous cell; the recorded output belongs to a
# different execution (count 98) and differs from the current results.
# Label typo "rff" corrected to "rrf".
print(f"[EVAL-TRAIN] Hybrid-rrf      : {eval_train_es_hybrid_rrf}")
print(f"[EVAL-TRAIN] Hybrid-rrf Boost: {eval_train_es_hybrid_rrf_boosts}")

[EVAL-TRAIN] Hybrid-rrf      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8894576149425297}
[EVAL-TRAIN] Hybrid-rff Boost: {'hit_rate': 0.9148706896551724, 'mrr': 0.8948096264367823}


Test Evaluation with Hybrid RRF + Boosting

In [71]:
%%time
ground_truth_val = df_ground_truth_val.to_dict(orient='records')
print(f"Dataset size: {len(ground_truth_val)}")
eval_valid_es_hybrid_rrf_boosts = evaluate(
    ground_truth_val, 
    question_text_hybrid_rrf_boosts)

Dataset size: 232
CPU times: user 3.84 s, sys: 35.6 ms, total: 3.87 s
Wall time: 6.51 s


In [72]:
# Validation split: baseline RRF hybrid search next to the boosted variant.
# These are the eval_valid_* metrics, so the tag is [EVAL-VALID] (it previously
# read [EVAL-TRAIN]); the "rff" typo is also corrected to "rrf".
print(f"[EVAL-VALID] Hybrid-rrf      : {eval_valid_es_hybrid_rrf}")
print(f"[EVAL-VALID] Hybrid-rrf Boost: {eval_valid_es_hybrid_rrf_boosts}")

[EVAL-TRAIN] Hybrid-rff      : {'hit_rate': 0.9310344827586207, 'mrr': 0.8849137931034482}
[EVAL-TRAIN] Hybrid-rff Boost: {'hit_rate': 0.9396551724137931, 'mrr': 0.8985632183908048}


In [100]:
# NOTE(review): repeats the previous cell; the recorded output belongs to a
# different execution (count 100) and differs from the current results.
# These are the eval_valid_* metrics, so the tag is [EVAL-VALID] (it previously
# read [EVAL-TRAIN]); the "rff" typo is also corrected to "rrf".
print(f"[EVAL-VALID] Hybrid-rrf      : {eval_valid_es_hybrid_rrf}")
print(f"[EVAL-VALID] Hybrid-rrf Boost: {eval_valid_es_hybrid_rrf_boosts}")

[EVAL-TRAIN] Hybrid-rff      : {'hit_rate': 0.9310344827586207, 'mrr': 0.8849137931034482}
[EVAL-TRAIN] Hybrid-rff Boost: {'hit_rate': 0.9353448275862069, 'mrr': 0.8829741379310347}


Show Results: Text + Hybrid + Hybrid RRF + Boosting

In [73]:
# Full summary: text (BM25), hybrid and hybrid RRF, each with and without boosts,
# on the train and validation splits.
print("="*90)
print("= TEXT")
print("="*90)
print(f"[EVAL-TRAIN] Text            : {eval_train_es_bm25}")
print(f"[EVAL-TRAIN] Text Boost      : {eval_train_es_bm25_boosts}")
print(f"[EVAL-VALID] Text            : {eval_valid_es_bm25}")
print(f"[EVAL-VALID] Text Boost      : {eval_valid_es_bm25_boosts}")
print("="*90)
print("= HYBRID")
print("="*90)
print(f"[EVAL-TRAIN] Hybrid          : {eval_train_es_hybrid}")
print(f"[EVAL-TRAIN] Hybrid Boost    : {eval_train_es_hybrid_boosts}")
print(f"[EVAL-VALID] Hybrid          : {eval_valid_es_hybrid}")
print(f"[EVAL-VALID] Hybrid Boost    : {eval_valid_es_hybrid_boosts}")
print("="*90)
# Section header corrected from "HYBRID-RFF" to "HYBRID-RRF".
print("= HYBRID-RRF")
print("="*90)
print(f"[EVAL-TRAIN] Hybrid-rrf      : {eval_train_es_hybrid_rrf}")
print(f"[EVAL-TRAIN] Hybrid-rrf Boost: {eval_train_es_hybrid_rrf_boosts}")
print(f"[EVAL-VALID] Hybrid-rrf      : {eval_valid_es_hybrid_rrf}")
print(f"[EVAL-VALID] Hybrid-rrf Boost: {eval_valid_es_hybrid_rrf_boosts}")

= TEXT
[EVAL-TRAIN] Text            : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-TRAIN] Text Boost      : {'hit_rate': 0.9137931034482759, 'mrr': 0.8933369252873572}
[EVAL-VALID] Text            : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}
[EVAL-VALID] Text Boost      : {'hit_rate': 0.9310344827586207, 'mrr': 0.8790229885057472}
= HYBRID
[EVAL-TRAIN] Hybrid          : {'hit_rate': 0.9181034482758621, 'mrr': 0.8942887931034492}
[EVAL-TRAIN] Hybrid Boost    : {'hit_rate': 0.9224137931034483, 'mrr': 0.9194683908045982}
[EVAL-VALID] Hybrid          : {'hit_rate': 0.9310344827586207, 'mrr': 0.8894396551724137}
[EVAL-VALID] Hybrid Boost    : {'hit_rate': 0.9396551724137931, 'mrr': 0.9077586206896553}
= HYBRID-RFF
[EVAL-TRAIN] Hybrid-rrf      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8894576149425297}
[EVAL-TRAIN] Hybrid-rrf Boost: {'hit_rate': 0.9224137931034483, 'mrr': 0.9083692528735641}
[EVAL-VALID] Hybrid-rrf      : {'hit_rate': 0.931034482758620

In [102]:
# NOTE(review): repeats the previous summary cell; the recorded output belongs
# to a different execution (count 102) and differs from the current results.
print("="*90)
print("= TEXT")
print("="*90)
print(f"[EVAL-TRAIN] Text            : {eval_train_es_bm25}")
print(f"[EVAL-TRAIN] Text Boost      : {eval_train_es_bm25_boosts}")
print(f"[EVAL-VALID] Text            : {eval_valid_es_bm25}")
print(f"[EVAL-VALID] Text Boost      : {eval_valid_es_bm25_boosts}")
print("="*90)
print("= HYBRID")
print("="*90)
print(f"[EVAL-TRAIN] Hybrid          : {eval_train_es_hybrid}")
print(f"[EVAL-TRAIN] Hybrid Boost    : {eval_train_es_hybrid_boosts}")
print(f"[EVAL-VALID] Hybrid          : {eval_valid_es_hybrid}")
print(f"[EVAL-VALID] Hybrid Boost    : {eval_valid_es_hybrid_boosts}")
print("="*90)
# Section header corrected from "HYBRID-RFF" to "HYBRID-RRF".
print("= HYBRID-RRF")
print("="*90)
print(f"[EVAL-TRAIN] Hybrid-rrf      : {eval_train_es_hybrid_rrf}")
print(f"[EVAL-TRAIN] Hybrid-rrf Boost: {eval_train_es_hybrid_rrf_boosts}")
# This row prints the RRF validation metrics; it was mislabelled "Hybrid".
print(f"[EVAL-VALID] Hybrid-rrf      : {eval_valid_es_hybrid_rrf}")
print(f"[EVAL-VALID] Hybrid-rrf Boost: {eval_valid_es_hybrid_rrf_boosts}")

= TEXT
[EVAL-TRAIN] Text            : {'hit_rate': 0.9137931034482759, 'mrr': 0.8790050287356329}
[EVAL-TRAIN] Text Boost      : {'hit_rate': 0.9137931034482759, 'mrr': 0.8933369252873572}
[EVAL-VALID] Text            : {'hit_rate': 0.9181034482758621, 'mrr': 0.8711925287356324}
[EVAL-VALID] Text Boost      : {'hit_rate': 0.9310344827586207, 'mrr': 0.8790229885057472}
= HYBRID
[EVAL-TRAIN] Hybrid          : {'hit_rate': 0.9181034482758621, 'mrr': 0.8963182471264378}
[EVAL-TRAIN] Hybrid Boost    : {'hit_rate': 0.9137931034482759, 'mrr': 0.8987607758620697}
[EVAL-VALID] Hybrid          : {'hit_rate': 0.9267241379310345, 'mrr': 0.8914511494252874}
[EVAL-VALID] Hybrid Boost    : {'hit_rate': 0.9267241379310345, 'mrr': 0.8833333333333336}
= HYBRID-RFF
[EVAL-TRAIN] Hybrid-rrf      : {'hit_rate': 0.9181034482758621, 'mrr': 0.8894576149425297}
[EVAL-TRAIN] Hybrid-rrf Boost: {'hit_rate': 0.9148706896551724, 'mrr': 0.8948096264367823}
[EVAL-VALID] Hybrid          : {'hit_rate': 0.931034482758620

## Save results

In [77]:
# Gather the tuned boosts plus baseline and boosted RRF hybrid metrics (train and
# validation) into one record so it can be saved and compared later.
results_opt_es_hybrid_rrf = dict(
    method='es_hybrid_rrf',
    best_boosts=best_es_hybrid_rrf_boosts,
    best_mrr=best_es_hybrid_rff_mrr,
    # Base
    base_train_hit_rate=eval_train_es_hybrid_rrf['hit_rate'],
    base_train_mrr=eval_train_es_hybrid_rrf['mrr'],
    base_valid_hit_rate=eval_valid_es_hybrid_rrf['hit_rate'],
    base_valid_mrr=eval_valid_es_hybrid_rrf['mrr'],
    # Boost
    boost_train_hit_rate=eval_train_es_hybrid_rrf_boosts['hit_rate'],
    boost_train_mrr=eval_train_es_hybrid_rrf_boosts['mrr'],
    boost_valid_hit_rate=eval_valid_es_hybrid_rrf_boosts['hit_rate'],
    boost_valid_mrr=eval_valid_es_hybrid_rrf_boosts['mrr'],
)
results_opt_es_hybrid_rrf

{'method': 'es_hybrid_rrf',
 'best_boosts': {'meals': 1.4671273591308425,
  'title': 1.47987055694058,
  'ingredients': 4.882772492268252,
  'summary': 4.882702537681318,
  'text': 4.368904374010877,
  'tips': 1.2126808445885127,
  'vector_boost': 0.10557689395629086},
 'best_mrr': 0.9083692528735641,
 'base_train_hit_rate': 0.9181034482758621,
 'base_train_mrr': 0.8894576149425297,
 'base_valid_hit_rate': 0.9310344827586207,
 'base_valid_mrr': 0.8849137931034482,
 'boost_train_hit_rate': 0.9224137931034483,
 'boost_train_mrr': 0.9083692528735641,
 'boost_valid_hit_rate': 0.9396551724137931,
 'boost_valid_mrr': 0.8985632183908048}

In [78]:
# Persist the RRF hybrid optimisation record to REST_OPT_ES_HYBRID_RRF_PATH.
save_document(REST_OPT_ES_HYBRID_RRF_PATH, results_opt_es_hybrid_rrf)

In [79]:
# List the retriever-evaluation folder to check the RRF result file was written.
# NOTE(review): the recorded output includes the local user name.
!ls -lh "{EVAL_RETRIEVER_DATA_DIR}"

total 12K
-rw-rw-r-- 1 aztleclan aztleclan 676 oct 26 01:53 res-opt-es-bm25.json
-rw-rw-r-- 1 aztleclan aztleclan 723 oct 26 21:54 res-opt-es-hybrid.json
-rw-rw-r-- 1 aztleclan aztleclan 724 oct 26 22:20 res-opt-es-hybrid-rrf.json


In [82]:
# Read the record back from the same path used for saving and display it.
results_opt_es_hybrid_rrf = read_document(REST_OPT_ES_HYBRID_RRF_PATH)
results_opt_es_hybrid_rrf

{'method': 'es_hybrid_rrf',
 'best_boosts': {'meals': 1.4671273591308425,
  'title': 1.47987055694058,
  'ingredients': 4.882772492268252,
  'summary': 4.882702537681318,
  'text': 4.368904374010877,
  'tips': 1.2126808445885127,
  'vector_boost': 0.10557689395629086},
 'best_mrr': 0.9083692528735641,
 'base_train_hit_rate': 0.9181034482758621,
 'base_train_mrr': 0.8894576149425297,
 'base_valid_hit_rate': 0.9310344827586207,
 'base_valid_mrr': 0.8849137931034482,
 'boost_train_hit_rate': 0.9224137931034483,
 'boost_train_mrr': 0.9083692528735641,
 'boost_valid_hit_rate': 0.9396551724137931,
 'boost_valid_mrr': 0.8985632183908048}

# Show Results

In [113]:
# List the result files that the next cell will load.
# NOTE(review): the recorded output includes the local user name.
!ls -lh "{EVAL_RETRIEVER_DATA_DIR}"

total 12K
-rw-rw-r-- 1 aztleclan aztleclan 676 oct 26 23:00 res-opt-es-bm25.json
-rw-rw-r-- 1 aztleclan aztleclan 723 oct 26 21:54 res-opt-es-hybrid.json
-rw-rw-r-- 1 aztleclan aztleclan 724 oct 26 22:20 res-opt-es-hybrid-rrf.json


In [114]:
records_opt = []
for filename in os.listdir(EVAL_RETRIEVER_DATA_DIR):
    file_path = os.path.join(EVAL_RETRIEVER_DATA_DIR, filename)
    if not filename.endswith(".json"):
        continue
    doc = read_document(file_path)
    records_opt.append(doc)

In [115]:
# One row per retrieval method, best boosted validation MRR first.
metric_df = (
    pd.DataFrame(records_opt)
    .sort_values(by='boost_valid_mrr', ascending=False)
)

In [116]:
# Show only the method name and the MRR columns (baseline vs. boosted, train vs. validation).
metric_df[["method", "base_train_mrr", "base_valid_mrr", "boost_train_mrr", "boost_valid_mrr"]]

Unnamed: 0,method,base_train_mrr,base_valid_mrr,boost_train_mrr,boost_valid_mrr
1,es_hybrid,0.894289,0.88944,0.919468,0.907759
2,es_hybrid_rrf,0.889458,0.884914,0.908369,0.898563
0,es_bm25,0.879005,0.871193,0.893337,0.879023
