## Hybrid Retrieval Engine - PoC
Here, we will combine the 2 layered hybrid retrieval engine - 
1. Stage 1 - Lexical: Sparse Retrieval using BM25
2. Stage 2 - Semantic: Dense Retrieval using Vector Embeddings (Gemini: `gemini-embedding-001` embedding model)

Let's get started.

In [1]:
from pathlib import Path
import os

root = Path().absolute().parents[1]
os.chdir(str(root))

In [35]:
import json
import bm25s
import Stemmer
import pandas as pd
from google.genai import types
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore


class HybridRetrieval:
    def __init__(self, top_k_stage_1: int, top_k_stage_2: int):
        self.top_k_stage_1 = top_k_stage_1
        self.top_k_stage_2 = top_k_stage_2
        self._instantiate_stage_1()
        self._instantiate_stage_2()

    def _instantiate_stage_1(self):
        self.stemmer = Stemmer.Stemmer("english")
        self.retriever = bm25s.BM25.load("artifacts/bm25", load_corpus=True)
        with open("artifacts/bm25/corpus.jsonl", "r") as f:
            self.corpus = [json.loads(line) for line in f]

    def _instantiate_stage_2(self):
        embeddings = GoogleGenerativeAIEmbeddings(
            model="gemini-embedding-001",
            task_type="semantic_similarity",
            config=types.EmbedContentConfig(
                    output_dimensionality=3072,
                    task_type="SEMANTIC_SIMILARITY",
                )
        )
        self.vector_store = FAISS.load_local(
            folder_path="artifacts/cso_smry/faiss_index",
            embeddings=embeddings,
            allow_dangerous_deserialization=True
        )

    def search(self, query: str):
        # Stage 1: Lexical Search
        query_tokens = bm25s.tokenize(query, stemmer=self.stemmer)
        docs, scores = self.retriever.retrieve(query_tokens, k=self.top_k_stage_1, corpus=self.corpus)
        self.stage_1_results = {
            doc["id"] : float(score) for doc, score in zip(docs[0], scores[0])
        }

        # Stage 2: Semantic Search
        very_high_integer = 10000000 # total number of docs is ~80K, 10M limit set.
        docs_with_scores = self.vector_store.similarity_search_with_score(
            query,
            fetch_k=very_high_integer,
            k=very_high_integer,
            # k=self.top_k_stage_2,
            filter=lambda x: x["id"] in self.stage_1_results
        )
        stage_2_results = {
            doc.metadata["id"]: float(score) for doc, score in docs_with_scores
        }

        # Convert to pandas dataframe
        ids = list(self.stage_1_results.keys())

        # return stage_1_results, stage_2_results, docs_with_scores
        records = {
            "id": ids,
            "stage_1_score": [self.stage_1_results[id] for id in ids],
            "stage_2_score": [stage_2_results.get(id, 0) for id in ids],
        }

        return pd.DataFrame(records)# , stage_1_results, stage_2_results

In [36]:
retriever = HybridRetrieval(top_k_stage_1=200, top_k_stage_2=20)

In [42]:
# 4) Query
question = "does the fish purr like a cat?"

question = "What's the electricity generation mix in Ireland? for renewable and non-renewable energy?"
# question = "What is the share of renewable energy in Ireland?"
# question = "What's the breakup between renewable and non-renewable energy production in Ireland?"
question = "Ireland rural vs urban population mix"
# question = "What are ireland's top exports?"
# question = "What are Prodcom sales for skincare beauty and makeup products in 2023?"
# question = "Give me a breakup of Ireland's share of transportation sector."
# question = "Sold productions - quantity (kg) for beauty, makeup and skincare preparations in Ireland in 2023?"
question = "beauty makeup & skincare production in prodcom data for ireland in 2023"
# question = "PRODCOM production quantity."
# question = "What's the mining and quarrying production in Ireland?"
# question = "nano cellulose production in ireland"
# question = "give me pharmaceuticals production in ireland"
# question = "what are the different types of pharmaceuticals products produced in ireland"


response = retriever.search(query=question)
response

                                                     

Unnamed: 0,id,stage_1_score,stage_2_score
0,CPM13,8.789187,0.460212
1,PCA23,4.585402,0.328005
2,IAIP13,7.194671,0.396397
3,CPM16,7.114656,0.444113
4,CPM18,7.114656,0.436573
...,...,...,...
71,NSA88,4.598532,0.422409
72,NSA97,4.598532,0.429908
73,MIP26,4.596580,0.407813
74,MIP10,4.577620,0.408266
