### Importations

In [1]:
!pip install llama-index
!pip install datasets
!pip install llama-index-retrievers-bm25
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-llama-cpp
!pip install ragatouille
!pip install llama-index-postprocessor-colbert-rerank
!pip install FlagEmbedding
%pip install llama-index-postprocessor-flag-embedding-reranker

# %pip install llama-index-llms-openai

Collecting llama-index
  Downloading llama_index-0.10.55-py3-none-any.whl (6.8 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama-index)
  Downloading llama_index_agent_openai-0.2.8-py3-none-any.whl (13 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_cli-0.1.12-py3-none-any.whl (26 kB)
Collecting llama-index-core==0.10.55 (from llama-index)
  Downloading llama_index_core-0.10.55-py3-none-any.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama-index)
  Downloading llama_index_embeddings_openai-0.1.10-py3-none-any.whl (6.2 kB)
Collecting llama-index-indices-managed-llama-cloud>=0.2.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.2.5-py3-none-any.whl (9.3 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_le

In [2]:
!pip install farm-haystack[colab,inference,metrics]

Collecting farm-haystack[colab,inference,metrics]
  Downloading farm_haystack-1.26.2-py3-none-any.whl (763 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m763.7/763.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boilerpy3 (from farm-haystack[colab,inference,metrics])
  Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)
Collecting events (from farm-haystack[colab,inference,metrics])
  Downloading Events-0.5-py3-none-any.whl (6.8 kB)
Collecting lazy-imports==0.3.1 (from farm-haystack[colab,inference,metrics])
  Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)
Collecting posthog (from farm-haystack[colab,inference,metrics])
  Downloading posthog-3.5.0-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting prompthub-py==4.0.0 (from farm-haystack[colab,inference,metrics])
  Downloading prompthub_py-4.0.0-py3-none-any.whl (6.9 kB)
Colle

### Utils


In [3]:
import re
import numpy as np
import pandas as pd
from datasets import load_dataset
from llama_index.core import Document, VectorStoreIndex, ServiceContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core.evaluation import generate_question_context_pairs, RetrieverEvaluator, BatchEvalRunner, FaithfulnessEvaluator, RelevancyEvaluator
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.response.notebook_utils import display_response

from llama_index.postprocessor.colbert_rerank import ColbertRerank

def istitle(line):
    return len(re.findall(r'^\s* = [^=]* = $', line)) != 0

def read_file(lines):
    articles = []
    current_article = ''
    for i,line in enumerate(lines):
        current_article += line
        if i < len(lines)-2 and (lines[i+1] == ' \n' or lines[i+1]=='') and istitle(lines[i+2]):
            articles.append(current_article)
            current_article = ''
        if i >= 80000 :
            break
    articles.append(current_article)
    return np.array(articles)

def display_results_retriever(name, eval_results, metrics = ["mrr", "hit_rate"]):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    result = {}
    result["Retriever Name"] =  [name]
    for metric in metrics :
        result[metric]= full_df[metric].mean()

    return pd.DataFrame( result)

from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

def select_qa(dataset):

    new_corpus = dataset.corpus

    keys = [k for k, v in dataset.queries.items() if not (v.startswith("```"))and not (v.startswith("Quest")) and not (v.startswith("import")) and not len(v.split())<20]
    new_queries = {x: dataset.queries[x] for x in keys}
    new_docs = {x: dataset.relevant_docs[x] for x in keys}

    return new_queries, new_corpus, new_docs

def retriever_evaluation (retriever, node_postprocessor = None, metrics = ["hit_rate","mrr"]) :
  # print(node_postprocessor)
  retriever_evaluator = RetrieverEvaluator.from_metric_names(
      metric_names =metrics, retriever=retriever, node_postprocessors = node_postprocessor
  )

  return retriever_evaluator


### Load Documents and models (from computer)

In [None]:
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name = "BAAI/bge-small-en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
from google.colab import drive
import pickle
drive.mount('/content/drive')

DATA_PATH = "/content/nodes.pkl"
infile = open(DATA_PATH,'rb')
nodes = pickle.load(infile)

Mounted at /content/drive


In [None]:
index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True)

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1477 [00:00<?, ?it/s]

In [7]:
from google.colab import drive
import pickle
drive.mount('/content/drive')

DATA_PATH = "/content/wikitext_qa_dataset.pkl"
infile = open(DATA_PATH,'rb')
wikitext_qa_dataset = pickle.load(infile)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
keys = list(wikitext_qa_dataset.queries.keys())[:100]
new_queries = {x: wikitext_qa_dataset.queries[x] for x in keys}
new_docs = {x: wikitext_qa_dataset.relevant_docs[x] for x in keys}
new_corpus = wikitext_qa_dataset.corpus

wikitext_qa_dataset_short = EmbeddingQAFinetuneDataset(
        queries=new_queries, corpus=new_corpus, relevant_docs=new_docs
    )

print(len(wikitext_qa_dataset_short.queries.values()))

100


### Load Documents and models (from llama3)

In [None]:
ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")
data = read_file(ds["train"]['text'])

In [None]:
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name = "BAAI/bge-small-en")#model_name="Salesforce/SFR-Embedding-2_R")



In [None]:
# create document
docs = []
for i, row in enumerate(data):
    # print(i,row)
    docs.append(Document(text =row, doc_id=i))
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(docs)

In [None]:
index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True)

In [None]:
model_url = "https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
llm_llama3 = LlamaCPP(
    model_url=model_url,
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    generate_kwargs={},
    model_kwargs={"n_gpu_layers": -1},  # if compiled to use GPU
    verbose=True,
)

# from transformers import BitsAndBytesConfig
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model

#1st possibility :
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     # bnb_4bit_compute_dtype=torch.float16,
#     # bnb_4bit_quant_type="nf4",
#     # bnb_4bit_use_double_quant=True,
# )
# llm = HuggingFaceLLM(
# #     model_name="meta-llama/Meta-Llama-3-8B",
#     tokenizer_name="meta-llama/Meta-Llama-3-8B",
#     context_window=3900,
#     max_new_tokens=256,
#     model_kwargs={"quantization_config": quantization_config},
#     generate_kwargs={"temperature": 0.1},
#     # messages_to_prompt=messages_to_prompt,
#     # completion_to_prompt=completion_to_prompt,
#     device_map="cuda",
# )

#2nd one :
# from transformers import BitsAndBytesConfig
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     # bnb_4bit_compute_dtype=torch.float16,
#     # bnb_4bit_quant_type="nf4",
#     # bnb_4bit_use_double_quant=True,
# )
# model = AutoModelForCausal.from_pretrained("meta-llama/Meta-Llama-3-8B", quantization_config=q_config)
# peft_config = LoraConfig(
#         r=args.lora_r, # Rank
#         lora_alpha=args.lora_alpha,
#         target_modules=args.lora_target_modules,
#         lora_dropout=args.lora_dropout,
#         bias="none",
#         task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
#     )
# model = get_peft_model(model, peft_config)

# llm = HuggingFaceLLM(
#     model = model
#     tokenizer_name="meta-llama/Meta-Llama-3-8B",
#     context_window=3900,
#     max_new_tokens=256,
#     generate_kwargs={"temperature": 0.1},
#     # messages_to_prompt=messages_to_prompt,
#     # completion_to_prompt=completion_to_prompt,
#     device_map="cuda",
# )


wikitext_qa_dataset_llama = generate_question_context_pairs(
    nodes[0::10][0:5],
    llm=llm_llama3,
    num_questions_per_chunk=3,
)

queries = list(wikitext_qa_dataset_llama.queries.values())
queries

Downloading url https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf to path /tmp/llama_index/models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf
total size (MB): 4920.73


4693it [00:47, 99.64it/s]                           
llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from /tmp/llama_index/models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32 

['```',
 'What is the name of the game developed by Sega and Media.Vision?',
 '```',
 '```',
 'What was the purpose of constructing the Tower Building of the Little Rock Arsenal?',
 '```',
 '```',
 'What was Cicely Mary Barker known for?',
 '```',
 '```',
 'Here are three questions based on the given text:',
 'What was the outcome of the match between Gambia and Tunisia in the 2012 Azerbaijan World Cup?',
 '```',
 'What was the outcome of the investigation into the clock pause during the Blue Jackets game?',
 '```']

In [None]:
def select_qa(dataset):

    new_corpus = dataset.corpus

    keys = [k for k, v in dataset.queries.items() if not (v.startswith("```"))and not ("uestion" in v) and not (v.startswith("import")) and not len(v.split())<5 ]
    new_queries = {x: dataset.queries[x] for x in keys}
    new_docs = {x: dataset.relevant_docs[x] for x in keys}

    return new_queries, new_corpus, new_docs

In [None]:
new_queries, new_corpus, new_docs = select_qa(wikitext_qa_dataset_llama)

new_qa_dataset = EmbeddingQAFinetuneDataset(
        queries=new_queries, corpus=new_corpus, relevant_docs=new_docs
    )

print(new_qa_dataset.queries.values())

dict_values(['What is the name of the game developed by Sega and Media.Vision?', 'What was the purpose of constructing the Tower Building of the Little Rock Arsenal?', 'What was Cicely Mary Barker known for?', 'What was the outcome of the match between Gambia and Tunisia in the 2012 Azerbaijan World Cup?', 'What was the outcome of the investigation into the clock pause during the Blue Jackets game?'])


In [None]:
# response = llm_llama3.complete("Hello, how are you?")
# print(str(response))

### Evaluation Different RAG models with llama_index


In [None]:
base_retriever = index.as_retriever(similarity_top_k=3)
base_retriever_evaluator = retriever_evaluation(base_retriever)
base_eval_results = await base_retriever_evaluator.aevaluate_dataset(wikitext_qa_dataset_short)
display_results_retriever("Base Retriever", base_eval_results)

Unnamed: 0,mrr,hit_rate,Retriever Name
0,0.818333,0.91,Base Retriever


In [None]:
base_retriever = index.as_retriever(similarity_top_k=10)
colbert_reranker = ColbertRerank(
    top_n=3,
    model="colbert-ir/colbertv2.0",
    tokenizer="colbert-ir/colbertv2.0",
    keep_retrieval_score=True,
)
base_colbert_retriever_evaluator = retriever_evaluation(base_retriever, node_postprocessor=[colbert_reranker], metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

base_colbert_eval_results =  await base_colbert_retriever_evaluator.aevaluate_dataset(wikitext_qa_dataset_short)
display_results_retriever("Base and colbert Retriever", base_colbert_eval_results)

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Unnamed: 0,Retriever Name,mrr,hit_rate
0,BM25 and colbert Retriever,0.94,0.99


In [None]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

bge_reranker = FlagEmbeddingReranker(
    top_n=3,
    model="BAAI/bge-reranker-base",
    use_fp16=False
)

base_bge_retriever_evaluator = retriever_evaluation(base_retriever, node_postprocessor=[bge_reranker], metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

base_bge_eval_results =  await base_bge_retriever_evaluator.aevaluate_dataset(wikitext_qa_dataset_short)
display_results_retriever("Base and bge Retriever", base_bge_eval_results)

Unnamed: 0,Retriever Name,mrr,hit_rate
0,BM25 and bge Retriever,0.933333,0.99


In [None]:
BM25retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=3)
BM25_retriever_evaluator = retriever_evaluation(BM25retriever)
BM25_eval_results =  await BM25_retriever_evaluator.aevaluate_dataset(wikitext_qa_dataset_short)
display_results_retriever("BM25 Retriever", BM25_eval_results)

DEBUG:bm25s:Building index from IDs objects


Unnamed: 0,Retriever Name,mrr,hit_rate
0,BM25 Retriever,0.701667,0.78


In [None]:
BM25retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)
colbert_reranker = ColbertRerank(
    top_n=3,
    model="colbert-ir/colbertv2.0",
    tokenizer="colbert-ir/colbertv2.0",
    keep_retrieval_score=True,
)

BM25_colbert_retriever_evaluator = retriever_evaluation(BM25retriever, node_postprocessor=[colbert_reranker], metrics =["hit_rate", "mrr", "ndcg"])

BM25_colbert_eval_results =  await BM25_colbert_retriever_evaluator.aevaluate_dataset(wikitext_qa_dataset_short)
display_results_retriever("BM25 and colbert Retriever", BM25_colbert_eval_results, ["hit_rate", "mrr", "ndcg"])

DEBUG:bm25s:Building index from IDs objects


Unnamed: 0,Retriever Name,hit_rate,mrr,precision,recall,ap,ndcg
0,BM25 and colbert Retriever,0.92,0.888333,0.306667,0.92,0.888333,0.42073


In [None]:
BM25retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)
bge_reranker = FlagEmbeddingReranker(
    top_n=3,
    model="BAAI/bge-reranker-base",
    use_fp16=False
)

BM25_bge_retriever_evaluator = retriever_evaluation(BM25retriever, node_postprocessor=[bge_reranker], metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])

BM25_bge_eval_results =  await BM25_bge_retriever_evaluator.aevaluate_dataset(wikitext_qa_dataset_short)
display_results_retriever("BM25 and bge Retriever", BM25_bge_eval_results, ["hit_rate", "mrr", "ap", "ndcg"])

DEBUG:bm25s:Building index from IDs objects


Unnamed: 0,Retriever Name,hit_rate,mrr,precision,recall,ap,ndcg
0,BM25 and bge Retriever,0.91,0.875,0.303333,0.91,0.875,0.41492


In [None]:
from llama_index.core.llama_pack import download_llama_pack


# download and install dependencies
RAGatouilleRetrieverPack = download_llama_pack(
    "RAGatouilleRetrieverPack", "./ragatouille_pack"
)

ragatouille_pack = RAGatouilleRetrieverPack(
    docs,
    llm=llm_llama3,
    index_name="my_index",
    top_k=3,
)

colbert_retriever = ragatouille_pack.get_modules()["retriever"]

colbert_retriever_evaluator = retriever_evaluation(colbert_retriever)
colbert_eval_results =  await colbert_retriever_evaluator.aevaluate_dataset(wikitext_qa_dataset_short)
display_results_retriever("RAGatouille Retriever", colbert_eval_results)

### DPR

In [None]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import faiss

In [None]:
tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
inputs = tokenizer(list(data), return_tensors="pt", truncation=True, padding=True)
embeddings = model(**inputs).pooler_output.detach().numpy()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

In [None]:
def ask_question(question, data, retriever_index, model):
    # Retrieve relevant passages
    query_inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True)
    query_embeddings = model(**query_inputs).pooler_output.detach().numpy()
    _, indices = retriever_index.search(query_embeddings, k=3)

    retrieved_texts = [data[i] for i in indices[0]]
    context = " ".join(retrieved_texts)
    return context

question = wikitext_qa_dataset_short.queries.values()[0]
print(question)
key = list(wikitext_qa_dataset_short.queries.keys())[0]
doc = wikitext_qa_dataset.relevant_docs[key]
corpus = wikitext_qa_dataset.corpus[doc]

context = ask_question(question, list(data), index, model)
print(f"Answer: {context}")

if corpus in context:
    # Find index of the string
    hit_rate = 1
    index = context.index(corpus)
    MRR = 1/index

print(hit_rate, MRR)







In [None]:
print(wikitext_qa_dataset.corpus)

### DPR Haystack

In [9]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.nodes import DensePassageRetriever
from haystack.utils import fetch_archive_from_http
from haystack.document_stores import InMemoryDocumentStore

INFO:haystack.telemetry:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems in the [documentation page](https://docs.haystack.deepset.ai/docs/telemetry#how-can-i-opt-out). More information at [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry).


In [10]:
documents = []
for i, row in enumerate(nodes):
    # print(i,row)
    documents.append({"content": row.text, "meta":{"id":row.id_}})

In [17]:
from haystack.document_stores.memory import InMemoryDocumentStore
document_store = InMemoryDocumentStore()# use_bm25=True)
document_store.write_documents(documents)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.document_stores.base:Duplicate Documents: Document with id '3657e157f016fdd0a7a23fa0d9a18619' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id '409d022f2328b92cac03072096fd64a' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id '5bce8262f627d45db3443d7db16a13c8' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id 'ca3d73080658c86710b1623af9b53cab' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id '7853772d006206139a5b1ebcb0a8b76a' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id '93bfb53934041009f3ea9cbb45fa395' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id 

In [12]:
from haystack.nodes import SentenceTransformersRanker, BM25Retriever

bm25retriever = BM25Retriever(document_store=document_store)

In [18]:
query_model = "facebook/dpr-question_encoder-single-nq-base"
passage_model = "facebook/dpr-ctx_encoder-single-nq-base"

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model=query_model,
    passage_embedding_model=passage_model,
    max_seq_len_query=64,
    max_seq_len_passage=512,
)

document_store.update_embeddings(retriever)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model:Auto-detected model language: english


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.document_stores.memory:Updating embeddings for 0 docs ...
Updating Embedding:   0%|          | 0/11698 [00:00<?, ? docs/s]
Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s][A
Create embeddings:   0%|          | 16/10000 [00:01<20:41,  8.04 Docs/s][A
Create embeddings:   0%|          | 32/10000 [00:02<11:45, 14.12 Docs/s][A
Create embeddings:   0%|          | 48/10000 [00:03<08:52, 18.69 Docs/s][A
Create embeddings:   1%|          | 64/10000 [00:03<07:32, 21.95 Docs/s][A
Create embeddings:   1%|          | 80/10000 [00:04<06:47, 24.33 Docs/s][A
Create embeddings:   1%|          | 96/10000 [00:04<06:23, 25.86 Docs/s][A
Create embeddings:   1%|          | 112/10000 [00:05<06:04, 27.12 Docs/s][A
Create embeddings:   1%|▏         | 128/10000 [00:05<05:55, 27.80 Docs/s][A
Create embeddings:   1%|▏         | 144/10000 [00:06<05:46, 28.48 Docs/s][A
Create embeddings:   2%|▏  

In [19]:
def retriever_evaluation_haystack (dataset_qa, retriever, name) :

  hit_rate = []
  MRR = []
  for i,question in enumerate(list(dataset_qa.queries.values())) :
    key = list(dataset_qa.queries.keys())[i]
    doc = dataset_qa.relevant_docs[key][0]

    answers = retriever.retrieve(query=question, top_k=3)

    answers = [answers[i].meta["id"] for i in range(3)]

    if doc in answers:
      hit_rate.append(1)
      index = answers.index(doc)
      MRR.append(1/(index+1))
    else :
      hit_rate.append(0)
      MRR.append(0)

  result = {}
  result["Retriever Name"] =  [name]
  result["Hit Rate"]= np.mean(hit_rate)
  result["MRR"]=  np.mean(MRR)

  return pd.DataFrame( result)

In [21]:
retriever_evaluation_haystack (wikitext_qa_dataset_short, retriever, "DPR retriever")

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,DPR retriever,0.76,0.615


### Evaluation Faithfulness, Relevancy

In [None]:
from llama_index.core.evaluation import BatchEvalRunner, FaithfulnessEvaluator, RelevancyEvaluator
from llama_index.core import ServiceContext

# Let's pick 10 queries to do evaluation
batch_eval_queries = queries

service_context = ServiceContext.from_defaults(llm=llm , embed_model=embed_model)
faithfulness = FaithfulnessEvaluator(service_context=service_context)
relevancy = RelevancyEvaluator(service_context=service_context)

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness, "relevancy": relevancy},
    workers=8,
)

# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)

  service_context = ServiceContext.from_defaults(llm=llm , embed_model=embed_model)
Llama.generate: prefix-match hit

llama_print_timings:        load time =  203682.69 ms
llama_print_timings:      sample time =      13.19 ms /    22 runs   (    0.60 ms per token,  1667.68 tokens per second)
llama_print_timings: prompt eval time =  431695.43 ms /  1107 tokens (  389.97 ms per token,     2.56 tokens per second)
llama_print_timings:        eval time =   13815.15 ms /    21 runs   (  657.86 ms per token,     1.52 tokens per second)
llama_print_timings:       total time =  445542.56 ms /  1128 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =  203682.69 ms
llama_print_timings:      sample time =      53.21 ms /    93 runs   (    0.57 ms per token,  1747.69 tokens per second)
llama_print_timings: prompt eval time =  407188.84 ms /  1050 tokens (  387.80 ms per token,     2.58 tokens per second)
llama_print_timings:        eval time =   60855.28 ms /    92 runs

In [None]:
faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
relevancy_score = sum(result.passing for result in eval_results['relevancy']) / len(eval_results['relevancy'])
print(faithfulness_score, relevancy_score)

0.5 0.5
