Load FAISS index, set up BM25 caching, CrossEncoder reranker, and define a custom retriever class that reranks FAISS results.

In [None]:
import sys
import os

# Make the project's "modules" package importable from this notebook by
# putting the project root at the front of sys.path.
notebooks_dir = os.getcwd()
print("Notebook cwd before:", notebooks_dir)

# Candidate 1: the notebook runs from <root>/notebooks, so the root is the
# directory one level above the current working directory.
project_root_candidate = os.path.abspath(os.path.join(notebooks_dir, os.pardir))

# Probe the candidates in priority order. The first one that contains a
# "modules" directory wins; it is inserted only if it is not already listed.
for _root, _message in (
    (project_root_candidate, "Inserted project root into sys.path:"),
    # Candidate 2 (fallback): the cwd is already the project root.
    (notebooks_dir, "Inserted notebooks_dir as project root into sys.path:"),
):
    if not os.path.isdir(os.path.join(_root, "modules")):
        continue
    if _root not in sys.path:
        sys.path.insert(0, _root)
        print(_message, _root)
    break

# Confirm sys.path
print("First entries of sys.path:", sys.path[:3])


In [None]:
import os
import pickle
from hazm import word_tokenize
from rank_bm25 import BM25Okapi
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from modules.utils import rerank_documents  # to be defined in utils.py
from typing import List, Any
from langchain.schema import BaseRetriever
from sentence_transformers import CrossEncoder
from rank_bm25 import BM25Okapi

In [None]:
class CustomRetriever(BaseRetriever):
    """Retriever that re-ranks the results of a first-stage retriever.

    The documents returned by ``base_retriever`` are passed to
    ``rerank_documents`` (imported from ``modules.utils``) together with the
    chunk corpus, the cross-encoder and the scoring parameters stored on this
    instance; the helper's return value is returned unchanged.
    """

    # 1) Declare Pydantic fields (exact names matter)
    # First-stage retriever; must expose ``get_relevant_documents(query)``.
    base_retriever: Any
    # Chunk corpus forwarded to ``rerank_documents``.
    chunks: List[Any]
    # Cross-encoder model forwarded to ``rerank_documents``.
    cross_encoder: CrossEncoder
    # NOTE(review): ``bm25`` is stored but never forwarded to
    # ``rerank_documents`` by this class -- confirm whether the helper is
    # expected to rebuild its own BM25 index from ``chunks``.
    bm25: BM25Okapi
    bm25_weight: float = 0.4
    cross_encoder_weight: float = 0.6
    batch_size: int = 8
    # Forwarded as ``min_score`` to ``rerank_documents`` (previously a
    # hard-coded 0.5 inside ``_get_relevant_documents``).
    min_score: float = 0.5

    def __init__(
        self,
        *,
        base_retriever: Any,
        chunks: List[Any],
        cross_encoder: CrossEncoder,
        bm25: BM25Okapi,
        bm25_weight: float = 0.4,
        cross_encoder_weight: float = 0.6,
        batch_size: int = 8,
        min_score: float = 0.5,
    ):
        """Store the retrieval components and scoring parameters.

        All arguments are keyword-only and map one-to-one onto the declared
        fields of the same name.
        """
        # 2) Pass all declared fields as keywords into super().__init__
        super().__init__(
            base_retriever=base_retriever,
            chunks=chunks,
            cross_encoder=cross_encoder,
            bm25=bm25,
            bm25_weight=bm25_weight,
            cross_encoder_weight=cross_encoder_weight,
            batch_size=batch_size,
            min_score=min_score,
        )

    def _get_relevant_documents(self, query: str) -> List[Any]:
        """Fetch candidates for ``query`` and return them re-ranked.

        Args:
            query: The search query.

        Returns:
            Whatever ``rerank_documents`` returns for the fetched candidates.
        """
        # 3) Use your stored attributes
        # NOTE(review): ``get_relevant_documents`` is reported as deprecated in
        # favour of ``invoke`` in recent LangChain releases -- confirm against
        # the pinned version before upgrading.
        docs = self.base_retriever.get_relevant_documents(query)
        return rerank_documents(
            query,
            docs,
            self.chunks,
            self.cross_encoder,
            bm25_weight=self.bm25_weight,
            cross_encoder_weight=self.cross_encoder_weight,
            batch_size=self.batch_size,
            min_score=self.min_score,
        )

# --------------------
# Example of instantiation:
# --------------------
# retriever_base = vectorstore.as_retriever(search_kwargs={"k": 100})
# chunks = [...]  # your list of Document objects
# cross_encoder = CrossEncoder("cross-encoder/mmarco-mMiniLMv2-L12-H384-v1", device="cpu")
# bm25 = BM25Okapi(tokenized_chunks)
#
# retriever = CustomRetriever(
#     base_retriever=retriever_base,
#     chunks=chunks,
#     cross_encoder=cross_encoder,
#     bm25=bm25
# )
# print("CustomRetriever ready.")

In [None]:
def getRetriever(
    data_dir: str = os.path.join("..", "data"),
    k: int = 100,
    device: str = "cpu",
) -> "CustomRetriever":
    """Build a ``CustomRetriever`` from the artifacts saved on disk.

    Loads the pickled chunks and the FAISS index from ``data_dir``, builds a
    BM25 index over the tokenized chunks, loads the cross-encoder and wires
    everything into a ``CustomRetriever``.

    Args:
        data_dir: Directory holding ``chunks.pkl`` and ``faiss_index.faiss``.
            The default is relative to the current working directory.
        k: Number of candidates the FAISS retriever is asked for.
        device: Device string handed to both the embedding model and the
            cross-encoder (e.g. ``"cpu"`` or ``"cuda:0"``).

    Returns:
        The configured ``CustomRetriever``.
    """
    # Load chunks from Notebook 2.
    # NOTE(security): unpickling can execute arbitrary code; only load a
    # chunks file that you produced yourself.
    with open(os.path.join(data_dir, "chunks.pkl"), "rb") as f:
        chunks = pickle.load(f)

    embeddings = HuggingFaceEmbeddings(
        model_name="HooshvareLab/bert-fa-base-uncased",
        model_kwargs={"device": device},
    )

    # Load FAISS index from Notebook 3. The embeddings object is required
    # here too: the vector store uses it to embed queries at search time.
    # NOTE(security): allow_dangerous_deserialization opts in to loading
    # pickled data -- the index must come from a trusted source.
    vectorstore = FAISS.load_local(
        os.path.join(data_dir, "faiss_index.faiss"),
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
    retriever_base = vectorstore.as_retriever(search_kwargs={"k": k})

    print(f"FAISS retriever (k={k}) ready.")

    # BM25 on tokenized chunks
    tokenized_chunks = [word_tokenize(doc.page_content) for doc in chunks]
    bm25 = BM25Okapi(tokenized_chunks)

    # CrossEncoder for reranking
    cross_encoder = CrossEncoder(
        "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1",
        device=device,
    )

    return CustomRetriever(
        base_retriever=retriever_base,
        chunks=chunks,
        cross_encoder=cross_encoder,
        bm25=bm25,
    )
# NOTE: this is a top-level statement, so it prints when the cell is executed
# (i.e. when getRetriever is defined), not when a retriever is actually built.
print("CustomRetriever ready.")