## Hybrid Retrieval Engine - PoC
Here, we combine the two stages of the hybrid retrieval engine:
1. Stage 1 - Lexical: Sparse Retrieval using BM25
2. Stage 2 - Semantic: Dense Retrieval using Vector Embeddings (Gemini: `gemini-embedding-001` embedding model)

Let's get started.

In [None]:
from pathlib import Path
import os

# Switch the working directory to the grandparent of the current one
# (presumably the project root, so relative paths resolve -- confirm).
root = Path.cwd().parents[1]
os.chdir(root)

In [None]:
import json
import bm25s
import Stemmer
from google.genai import types
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore


class HybridRetrieval:
    """Run a BM25 lexical search and a FAISS vector search for one query.

    Stage 1 queries a BM25 index loaded from ``artifacts/bm25``; stage 2
    queries a FAISS index loaded from ``artifacts/cso_smry/faiss_index``.
    Both indexes are loaded lazily on the first call to :meth:`search`, so
    constructing the object performs no I/O.
    """

    def __init__(self, top_k_stage_1: int, top_k_stage_2: int):
        """Store the number of hits requested from each stage.

        Args:
            top_k_stage_1: Number of results requested from the BM25 stage.
            top_k_stage_2: Number of results requested from the FAISS stage.
        """
        self.top_k_stage_1 = top_k_stage_1
        self.top_k_stage_2 = top_k_stage_2

    def _instantiate_stage_1(self):
        """Load the stemmer, the BM25 index and its JSONL corpus from disk."""
        self.stemmer = Stemmer.Stemmer("english")
        self.retriever = bm25s.BM25.load("artifacts/bm25", load_corpus=True)
        # Explicit encoding avoids depending on the platform default; blank
        # lines are skipped so a trailing newline cannot break json.loads.
        with open("artifacts/bm25/corpus.jsonl", "r", encoding="utf-8") as f:
            self.corpus = [json.loads(line) for line in f if line.strip()]

    def _instantiate_stage_2(self):
        """Build the Gemini embedding client and load the FAISS index."""
        # NOTE(review): confirm that the installed langchain_google_genai
        # version accepts the `config` keyword on GoogleGenerativeAIEmbeddings.
        embeddings = GoogleGenerativeAIEmbeddings(
            model="gemini-embedding-001",
            task_type="semantic_similarity",
            config=types.EmbedContentConfig(
                    output_dimensionality=3072,
                    task_type="SEMANTIC_SIMILARITY",
                )
        )
        # SECURITY: allow_dangerous_deserialization=True unpickles the stored
        # docstore; only load index folders that come from a trusted source.
        self.vector_store = FAISS.load_local(
            folder_path="artifacts/cso_smry/faiss_index",
            embeddings=embeddings,
            allow_dangerous_deserialization=True
        )

    def _ensure_loaded(self):
        """Load whichever stage has not been instantiated yet."""
        stage_1_attrs = ("stemmer", "retriever", "corpus")
        if not all(hasattr(self, attr) for attr in stage_1_attrs):
            self._instantiate_stage_1()
        if not hasattr(self, "vector_store"):
            self._instantiate_stage_2()

    def search(self, query: str) -> dict:
        """Run both retrieval stages for ``query``.

        Args:
            query: The search text, passed unchanged to both stages.

        Returns:
            A dict with two keys: ``"stage_1"`` maps each retrieved corpus
            entry's ``"id"`` to its BM25 score as a float, and ``"stage_2"``
            holds the value returned by the vector store's
            ``similarity_search``.
        """
        # Nothing else in the class loads the indexes, so do it on demand.
        self._ensure_loaded()

        # Stage 1: Lexical Search
        query_tokens = bm25s.tokenize(query, stemmer=self.stemmer)
        docs, scores = self.retriever.retrieve(
            query_tokens, k=self.top_k_stage_1, corpus=self.corpus
        )
        stage_1_results = {
            doc["id"]: float(score) for doc, score in zip(docs[0], scores[0])
        }

        # Stage 2: Semantic Search
        # The result-count keyword is `k`; `top_k` was silently ignored.
        stage_2_results = self.vector_store.similarity_search(
            query, k=self.top_k_stage_2
        )

        # A dict and a list cannot be added together, so the two result
        # sets are returned side by side, keyed by stage.
        return {"stage_1": stage_1_results, "stage_2": stage_2_results}