# Embedding Methods

In [2]:
!pip3 install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
import utils
from tqdm import tqdm
import os
import openai
# Configure the OpenAI client with the key returned by the project's `utils` helper.
openai.api_key = utils.get_openai_api_key()

In [None]:
from llama_index import SimpleDirectoryReader

# Load the source PDF through llama_index's directory reader; `documents` holds
# whatever `load_data()` returns for this single input file.
documents = SimpleDirectoryReader(
    input_files=["../data/sample_data/AI Career/data/eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()

In [None]:
from llama_index import Document

# Merge the text of every loaded entry into one Document, with a blank line
# between consecutive entries.
document = Document(
    text="\n\n".join(part.text for part in documents),
)

In [None]:
from llama_index import VectorStoreIndex
from llama_index import ServiceContext
from llama_index.llms import OpenAI

# LLM used by the query engines for answer synthesis.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
# Shared service context: the LLM above, the BGE embedding model given by the
# "local:" model string, and a chunk size of 256.
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-base-en-v1.5", chunk_size=256
)
# Vector index over the single merged document; both retrievers below are
# derived from this index.
index = VectorStoreIndex.from_documents([document],
                                        service_context=service_context)

In [None]:
# Load the evaluation questions: one question per line of eval_questions.txt.
eval_questions = []
with open('eval_questions.txt', 'r') as file:
    for line in file:
        # Strip surrounding whitespace, including the trailing newline.
        item = line.strip()
        if not item:
            # Skip blank lines so that no empty query is sent to the engines.
            continue
        print(item)
        eval_questions.append(item)

## BM25

from llama_index.retrievers import BM25Retriever

# BM25 retriever created from the index built above, with similarity_top_k=10.
bm25_retriever = BM25Retriever.from_defaults(index, similarity_top_k=10)

In [None]:
from llama_index.query_engine import RetrieverQueryEngine

# Query engine that retrieves with BM25 and uses the shared service context
# (and therefore the LLM configured there).
bm25_query_engine = RetrieverQueryEngine.from_args(
    retriever=bm25_retriever,
    service_context=service_context,
)

In [None]:
from trulens_eval import Tru
# TruLens handle used for the leaderboard and dashboard calls below.
tru = Tru()

# Presumably wipes previously recorded evaluations so this run starts clean —
# confirm against the trulens_eval docs before running on a database you care about.
tru.reset_database()

In [None]:
from utils import get_prebuilt_trulens_recorder

# Wrap the BM25 engine in the project's prebuilt TruLens recorder; results are
# recorded under the app id "BM25 Engine".
tru_recorder = get_prebuilt_trulens_recorder(bm25_query_engine,
                                             app_id="BM25 Engine")

In [None]:
# Run every evaluation question through the BM25 engine inside the recorder
# context. `response` is overwritten on each iteration and not used afterwards.
with tru_recorder as recording:
    for question in tqdm(eval_questions, total=len(eval_questions)):
        response = bm25_query_engine.query(question)

In [None]:
# Show the TruLens leaderboard. An empty `app_ids` list is passed — presumably
# meaning "all recorded apps"; confirm against the trulens_eval docs.
tru.get_leaderboard(app_ids=[])

In [None]:
# Launch the TruLens dashboard to inspect the BM25 run.
tru.run_dashboard()

## BAAI/bge-base-en-v1.5

from llama_index.query_engine import RetrieverQueryEngine
# Embedding-based retriever from the same index, with the same top-k (10) as the BM25 retriever.
vector_retriever = index.as_retriever(similarity_top_k=10)

In [None]:
# Query engine that retrieves with the vector (BGE embedding) retriever and
# uses the shared service context.
bge_query_engine = RetrieverQueryEngine.from_args(
    retriever=vector_retriever,
    service_context=service_context,
)
# Alternative kept for reference (not executed):
# bge_query_engine = index.as_query_engine()

In [None]:
from utils import get_prebuilt_trulens_recorder
# Wrap the BGE engine in the project's prebuilt TruLens recorder (app id "BGE Engine").
# NOTE(review): this recorder is created *before* the `tru.reset_database()` call
# in the next cell, whereas the BM25 section resets first — confirm the reset
# does not discard this app's registration.
tru_recorder = get_prebuilt_trulens_recorder(bge_query_engine,
                                             app_id="BGE Engine")

In [None]:
from trulens_eval import Tru
# Fresh TruLens handle for the BGE run.
tru = Tru()

# Presumably clears the records of the previous (BM25) run — confirm against
# the trulens_eval docs.
tru.reset_database()

In [None]:
# Run every evaluation question through the BGE engine inside the recorder
# context. `response` is overwritten on each iteration and not used afterwards.
with tru_recorder as recording:
    for question in tqdm(eval_questions, total=len(eval_questions)):
        response = bge_query_engine.query(question)

In [None]:
# Show the TruLens leaderboard. An empty `app_ids` list is passed — presumably
# meaning "all recorded apps"; confirm against the trulens_eval docs.
tru.get_leaderboard(app_ids=[])

In [None]:
# Launch the TruLens dashboard to inspect the BGE run.
tru.run_dashboard()

## Hybrid

In [None]:
from llama_index.retrievers import BaseRetriever


class HybridRetriever(BaseRetriever):
    """Retriever that merges the results of a BM25 and a vector retriever.

    Both wrapped retrievers are queried with the same query. Their results are
    concatenated (BM25 results first) and de-duplicated by node id, keeping the
    first occurrence of each node.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        """Store the two retrievers to combine.

        Args:
            vector_retriever: Retriever exposing ``retrieve(query)``; queried second.
            bm25_retriever: Retriever exposing ``retrieve(query)``; queried first.
        """
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        # Let BaseRetriever initialise its own state; without this call the
        # base-class attributes are never set on releases where it defines
        # an __init__ (harmless no-op on releases where it does not).
        super().__init__()

    def _retrieve(self, query, **kwargs):
        """Return the de-duplicated union of both retrievers' results.

        Args:
            query: Query forwarded unchanged to both wrapped retrievers.
            **kwargs: Extra keyword arguments forwarded to both ``retrieve`` calls.

        Returns:
            A list of retrieved nodes: BM25 results in their original order,
            followed by vector results whose node id was not already seen.
        """
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)

        # Combine the two result lists, keeping only the first hit per node id.
        all_nodes = []
        node_ids = set()
        for n in bm25_nodes + vector_nodes:
            node_id = n.node.node_id
            if node_id not in node_ids:
                all_nodes.append(n)
                node_ids.add(node_id)
        return all_nodes
    
# Combine the vector and BM25 retrievers built in the earlier sections.
hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

In [None]:
from llama_index.query_engine import RetrieverQueryEngine

# Query engine that retrieves with the hybrid retriever and uses the shared
# service context.
query_engine = RetrieverQueryEngine.from_args(
    retriever=hybrid_retriever,
    service_context=service_context,
)


In [None]:
tru_recorder = get_prebuilt_trulens_recorder(bge_query_engine, app_id="Hybrid Engine")

In [None]:
tru = Tru()
tru.reset_database()

In [None]:
with tru_recorder as recording:
    for question in tqdm(eval_questions, total=len(eval_questions)):
        response = bge_query_engine.query(question)

In [None]:
# Show the TruLens leaderboard. An empty `app_ids` list is passed — presumably
# meaning "all recorded apps"; confirm against the trulens_eval docs.
tru.get_leaderboard(app_ids=[])

In [None]:
# Launch the TruLens dashboard to inspect the run recorded in this section.
tru.run_dashboard()

## Ensemble Search with Weaviate

In [None]:
import weaviate

In [None]:
# Cloud: connect to a hosted Weaviate instance using username/password auth.
# Replace the placeholders before running, and do not commit real credentials
# to the notebook.
resource_owner_config = weaviate.AuthClientPassword(
    username="<your_username>",
    password="<your_password>",
)
client = weaviate.Client(
    "https://llama-test-ezjahb4m.weaviate.network",
    auth_client_secret=resource_owner_config,
)

# Local alternative (not executed): connect to an instance on localhost:8080
# without the auth configuration.
# client = weaviate.Client("http://localhost:8080")

In [None]:
from llama_index.storage.storage_context import StorageContext
# Required by the WeaviateVectorStore(...) call below; without this import the
# cell fails with a NameError.
from llama_index.vector_stores import WeaviateVectorStore

# Vector store backed by the Weaviate client created above.
# NOTE(review): Weaviate class names are normally limited to letters, digits and
# underscores — confirm that the hyphen in this index name is accepted.
vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex-Weaviate"
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Rebinds `index` to a Weaviate-backed index.
# NOTE(review): unlike the first index, this one is built from the raw
# `documents` list and without `service_context`, so it presumably falls back to
# the library's default LLM/embedding/chunking settings — confirm this is intended.
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

In [None]:
# Rebinds `query_engine` (previously the hybrid engine) to an engine created
# from the Weaviate-backed index via `as_query_engine()` with no arguments.
query_engine = index.as_query_engine()

In [None]:
tru_recorder = get_prebuilt_trulens_recorder(bge_query_engine, app_id="Hybrid Engine")

In [None]:
with tru_recorder as recording:
    for question in tqdm(eval_questions, total=len(eval_questions)):
        response = bge_query_engine.query(question)

In [None]:
# Show the TruLens leaderboard. An empty `app_ids` list is passed — presumably
# meaning "all recorded apps"; confirm against the trulens_eval docs.
tru.get_leaderboard(app_ids=[])

In [None]:
# Launch the TruLens dashboard to inspect the run recorded in this section.
tru.run_dashboard()