#### Evaluation 
This notebook tests and assesses the performance of different retrieval methods and rerankers.

In [11]:
import pickle
import sys
import faiss
import chromadb

sys.path.append('../')
from utils import retriever_evaluation, display_results_retriever

from llama_index.core import (
    VectorStoreIndex,
    StorageContext
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.postprocessor.colbert_rerank import ColbertRerank
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
from llama_index.core.postprocessor import SentenceTransformerRerank

# Load the evaluation inputs (nodes and QA dataset) and build the embedding
# model and the LLM that later cells in this notebook refer to.
qa_dataset_path = "../data/icrc_qa_dataset_semantic2_2_2048.pkl"
nodes_path = "../data/nodes_icrc_semantic2_2_2048.pkl"
model_url = "https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"

# Keep the model identifier under its own name so that `embed_model` only ever
# refers to the embedding object.
embed_model_name = "dunzhang/stella_en_400M_v5"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name, trust_remote_code=True)

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load pickle files that come from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open(nodes_path, "rb") as nodes_file:
    nodes = pickle.load(nodes_file)
with open(qa_dataset_path, "rb") as qa_dataset_file:
    qa_dataset = pickle.load(qa_dataset_file)

# model_path is None, so the model is identified by model_url only.
llm_llama3 = LlamaCPP(
    model_url=model_url,
    model_path=None,
    temperature=0.1,
    max_new_tokens=512,
    context_window=3900,
    generate_kwargs={},
    # presumably -1 offloads every layer to the GPU — confirm against llama.cpp docs
    model_kwargs={"n_gpu_layers": -1},
    verbose=True,
)

You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=

### Simple embedding

In [12]:
# embedding model as a retriever
index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True) #compute indexes
base_retriever = index.as_retriever(similarity_top_k=3) #build retriever
base_retriever_evaluator = retriever_evaluation(base_retriever, metrics=["hit_rate","mrr","ndcg"]) #set evaluator
base_eval_results2 = await base_retriever_evaluator.aevaluate_dataset(qa_dataset) #evaluate
display_results_retriever("Base Retriever", base_eval_results2, metrics=["hit_rate","mrr","ndcg"]) #display results

Generating embeddings: 100%|██████████| 2048/2048 [01:04<00:00, 31.83it/s]
Generating embeddings: 100%|██████████| 2048/2048 [01:07<00:00, 30.47it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:40<00:00, 50.76it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:37<00:00, 54.15it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:41<00:00, 48.94it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:42<00:00, 48.24it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:44<00:00, 46.09it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:42<00:00, 48.25it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:49<00:00, 41.25it/s]
Generating embeddings: 100%|██████████| 1988/1988 [00:50<00:00, 39.20it/s]


Unnamed: 0,Retriever Name,hit_rate,mrr,ndcg
0,Base Retriever,0.658,0.551333,0.271599


### With Re-Ranker

In [13]:
#using a reranker
base_retriever = index.as_retriever(similarity_top_k=10)
bge_reranker = FlagEmbeddingReranker(
    top_n=3,
    model="BAAI/bge-reranker-large", # "Alibaba-NLP/gte-Qwen2-7B-instruct"
    use_fp16=False
)

base_bge_retriever_evaluator = retriever_evaluation(base_retriever, node_postprocessor=[bge_reranker], metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

base_bge_eval_results =  await base_bge_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("Base and bge Retriever", base_bge_eval_results, metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

Unnamed: 0,Retriever Name,hit_rate,mrr,precision,recall,ap,ndcg
0,Base and bge Retriever,0.718,0.652667,0.239333,0.718,0.652667,0.31417


In [None]:
base_retriever = index.as_retriever(similarity_top_k=10)
colbert_reranker = ColbertRerank(
    top_n=3,
    model="colbert-ir/colbertv2.0",
    tokenizer="colbert-ir/colbertv2.0",
    keep_retrieval_score=True,
)
base_colbert_retriever_evaluator = retriever_evaluation(base_retriever, node_postprocessor=[colbert_reranker], metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

base_colbert_eval_results =  await base_colbert_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("Base and colbert Retriever", base_colbert_eval_results)

In [None]:
base_retriever = index.as_retriever(similarity_top_k=10)
bge_reranker = FlagEmbeddingReranker(
    top_n=3,
    model="BAAI/bge-reranker-base", # "Alibaba-NLP/gte-Qwen2-7B-instruct"
    use_fp16=False
)

base_bge_retriever_evaluator = retriever_evaluation(base_retriever, node_postprocessor=[bge_reranker], metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

base_bge_eval_results =  await base_bge_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("Base and bge Retriever", base_bge_eval_results)

In [None]:
base_retriever = index.as_retriever(similarity_top_k=10)
postprocessor = SentenceTransformerRerank(
model="cross-encoder/ms-marco-MiniLM-L-2-v2",
 top_n=5
)

base_ce_retriever_evaluator = retriever_evaluation(base_retriever, node_postprocessor=[postprocessor], metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

base_ce_eval_results =  await base_ce_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("Base and bge Retriever", base_ce_eval_results)

### BM25

In [None]:
BM25retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)
BM25_retriever_evaluator = retriever_evaluation(BM25retriever)
BM25_eval_results =  await BM25_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("BM25 Retriever", BM25_eval_results)

In [None]:
BM25retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)
colbert_reranker = ColbertRerank(
    top_n=3,
    model="colbert-ir/colbertv2.0",
    tokenizer="colbert-ir/colbertv2.0",
    keep_retrieval_score=True,
)

BM25_colbert_retriever_evaluator = retriever_evaluation(BM25retriever, node_postprocessor=[colbert_reranker], metrics =["hit_rate", "mrr", "ndcg"])

BM25_colbert_eval_results =  await BM25_colbert_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("BM25 and colbert Retriever", BM25_colbert_eval_results, ["hit_rate", "mrr", "ndcg"])

In [None]:
BM25retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)
bge_reranker = FlagEmbeddingReranker(
    top_n=3,
    model="BAAI/bge-reranker-base",
    use_fp16=False
)

BM25_bge_retriever_evaluator = retriever_evaluation(BM25retriever, node_postprocessor=[bge_reranker], metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

BM25_bge_eval_results =  await BM25_bge_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("BM25 and bge Retriever", BM25_bge_eval_results, ["hit_rate", "mrr", "ap", "ndcg"])

### Faiss

In [6]:
d = 1024
faiss_index = faiss.IndexFlatL2(d)

vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes, storage_context=storage_context, embed_model = embed_model
)

base_retriever = index.as_retriever(similarity_top_k=3)
base_bge_retriever_evaluator = retriever_evaluation(base_retriever, metrics =["hit_rate", "mrr"])
base_bge_eval_results =  await base_bge_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("Base and bge Retriever", base_bge_eval_results)

### Chroma DB

In [4]:
# create client and a new collection
chroma_client = chromadb.EphemeralClient()
chroma_collection = chroma_client.create_collection("quickstart")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    nodes, storage_context=storage_context, embed_model=embed_model
)

from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
base_retriever = index.as_retriever(similarity_top_k=3)

base_bge_retriever_evaluator = retriever_evaluation(base_retriever, metrics =["hit_rate", "mrr"])

base_bge_eval_results =  await base_bge_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("Base and bge Retriever", base_bge_eval_results)


### Elasticsearch

In [None]:
# Build the index on top of an Elasticsearch vector store.
import os

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.elasticsearch import ElasticsearchStore
from llama_index.core import StorageContext

# Read the connection details from the environment rather than hard-coding them.
# Both default to the empty string when the variables are not set.
vector_store = ElasticsearchStore(
    es_cloud_id=os.environ.get("ES_CLOUD_ID", ""),
    es_api_key=os.environ.get("ES_API_KEY", ""),  # see Elasticsearch Vector Store for more authentication options
    index_name="test",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The embedding model is given to VectorStoreIndex, as in the Faiss and Chroma
# sections. Without it the index does not use the notebook's embed_model.
# The LLM is not needed to build or query the vector index, so it is not passed.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model,
    show_progress=True,
)

In [None]:
base_retriever = index.as_retriever(similarity_top_k=3)
base_bge_retriever_evaluator = retriever_evaluation(base_retriever, metrics =["hit_rate", "mrr"])
base_bge_eval_results =  await base_bge_retriever_evaluator.aevaluate_dataset(qa_dataset)
display_results_retriever("Base and bge Retriever", base_bge_eval_results)