In [164]:
import numpy as np

from typing import Any, Dict, Iterable, List
from typing_extensions import Self

from langchain.schema.retriever import BaseRetriever
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.schema import Document

from langchain.retrievers import BM25Retriever

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

## Custom Mixed Retriever (Reciprocal Rank Fusion) as retriever

In [197]:
def hybrid_score(sparse_score, dense_score, alpha = 0.5):
    return (1 - alpha) * sparse_score + alpha * dense_score

class HybridRetriever(BaseRetriever):

    alpha: float=0.5
    sparse_retriever: BM25Retriever
    dense_retriever: VectorStoreRetriever

    class Config:
        """Configuration for this pydantic object."""
        arbitrary_types_allowed = True

    @classmethod
    def from_texts(cls, texts: Iterable[str], **kwargs: Any, ) -> Self:
        sparse_retriever = BM25Retriever.from_texts(texts)
        dense_retriever = FAISS.from_texts(texts, embedding=OpenAIEmbeddings()).as_retriever()
        return cls(sparse_retriever=sparse_retriever, dense_retriever=dense_retriever)
    
    @classmethod
    def from_documents(cls, documents: Iterable[Document], **kwargs: Any, ) -> Self:
        sparse_retriever = BM25Retriever.from_documents(documents)
        dense_retriever = FAISS.from_documents(documents, embedding=OpenAIEmbeddings()).as_retriever()
        return cls(sparse_retriever=sparse_retriever, dense_retriever=dense_retriever)

    def _get_relevant_documents(self,
        query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        bm25_documents = self.sparse_retriever.get_relevant_documents(query, callbacks=run_manager.get_child())
        openai_documents = self.dense_retriever.get_relevant_documents(query, callbacks=run_manager.get_child())
        
        docs,scores = [] ,[]      
        for i,d in enumerate(bm25_documents, start=1):
            if d in openai_documents:
                rank2 = 1 / (1+openai_documents.index(d))
            else: rank2=0
            docs.append(d)
            scores.append(hybrid_score(1/i, rank2, alpha=self.alpha))

        for i,d in enumerate(openai_documents, start=1):
            if d not in bm25_documents:
                docs.append(d)
                scores.append(hybrid_score(0, 1/i, alpha=self.alpha))
        
        print(self.alpha)
        ordered_docs = np.array(docs)[np.argsort(scores)[::-1]]
        return list(ordered_docs)

In [198]:
retriever = HybridRetriever.from_documents([
        Document(page_content="foo"),
        Document(page_content="bar"),
        Document(page_content="world"),
        Document(page_content="hello"),
        Document(page_content="foo bar"),
    ])

retriever.get_relevant_documents(query="fao")

0.5


[Document(page_content='foo bar'),
 Document(page_content='foo'),
 Document(page_content='hello'),
 Document(page_content='world'),
 Document(page_content='bar')]

In [178]:
retriever.sparse_retriever.get_relevant_documents("fao")

[Document(page_content='foo bar'),
 Document(page_content='hello'),
 Document(page_content='world'),
 Document(page_content='bar')]

In [179]:
retriever.dense_retriever.get_relevant_documents("fao")

[Document(page_content='foo'),
 Document(page_content='foo bar'),
 Document(page_content='hello'),
 Document(page_content='world')]

## Custom Cross-Encoder model (as retriever)

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

In [215]:
class CrossEncoderRetriever(BaseRetriever):

    model = CrossEncoder('cross-encoder/stsb-TinyBERT-L-4')
    docs = []
    
    class Config:
        """Configuration for this pydantic object."""
        arbitrary_types_allowed = True

    @classmethod
    def from_texts(cls, texts: Iterable[str], **kwargs: Any, ) -> Self:
        return cls(docs=texts)
    
    @classmethod
    def from_documents(cls, documents: Iterable[Document], **kwargs: Any, ) -> Self:
        return cls(docs=documents)

    def _get_relevant_documents(self,
        query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        searches = [[query, text.page_content] for text in self.docs]
        scores = self.model.predict(searches)
        ordered_docs = np.array(self.docs)[np.argsort(scores)[::-1]]
        return ordered_docs

In [216]:
retriever = CrossEncoderRetriever.from_documents([
        Document(page_content="foo"),
        Document(page_content="bar"),
        Document(page_content="world"),
        Document(page_content="hello"),
        Document(page_content="foo bar"),
    ])

retriever.get_relevant_documents(query="fooa")

array([Document(page_content='foo'), Document(page_content='foo bar'),
       Document(page_content='hello'), Document(page_content='world'),
       Document(page_content='bar')], dtype=object)

## Hybrid Search with CrossEncoder Re-Ranking
using langchain implementation of HybridSearch

In [12]:
import numpy as np

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.embeddings import HuggingFaceEmbeddings

from sentence_transformers.cross_encoder import CrossEncoder

### Loading doc

In [13]:
loader = TextLoader("./datasets/kjv.txt")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, 
    chunk_overlap=50, 
    separators="\n"
    )
docs = text_splitter.split_documents(documents)
print(len(docs))

10564


### Sparse retriever

In [14]:
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 20

### Dense retriever
Based on a local embedding model bge-small-en-v1.5

In [15]:
modelPath = "./bge-small-en-v1.5"
model_kwargs = {'device':'cuda'}
encode_kwargs = {'normalize_embeddings': True}

embeddings = HuggingFaceEmbeddings(
    model_name=modelPath, 
    model_kwargs=model_kwargs, 
    encode_kwargs=encode_kwargs 
)

faiss_vectorstore = FAISS.from_documents(docs, embedding=embeddings)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 20})

### Define de Hybrid Retriever

In [16]:
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], 
    weights=[0.5, 0.5]
)

### Reranking
Based on a local crossencoder

In [24]:
def search(query):
    docs_found = ensemble_retriever.invoke(query)
    model_reranker = CrossEncoder('cross-encoder/stsb-TinyBERT-L-4')

    searches = [[query, text.page_content] for text in docs_found]
    scores = model_reranker.predict(searches)
    ordered_docs = np.array(docs_found)[np.argsort(scores)[::-1]]
    scores = np.array(scores)[np.argsort(scores)[::-1]]
    return ordered_docs, scores

### Results

In [30]:
import pandas as pd

scores,ordered_docs = search("shall mine head be lifted up above mine enemies")
pd.DataFrame({"score":scores, "text":ordered_docs}).head(3)

Unnamed: 0,score,text
0,page_content='Psa27:6 And now shall mine head ...,0.556627
1,"page_content='Psa3:3 But thou, O LORD, art a s...",0.451356
2,page_content='Dan4:34 And at the end of the da...,0.356573


In [31]:
scores,ordered_docs = search("Psa29:10")
pd.DataFrame({"score":scores, "text":ordered_docs}).head(3)

Unnamed: 0,score,text
0,"page_content='Psa29:1 Give unto the LORD, O ye...",0.360064
1,page_content='Psa29:6 He maketh them also to s...,0.303222
2,page_content='Psa31:10 For my life is spent wi...,0.283385


In [32]:
faiss_retriever.get_relevant_documents("Psa29:10")[:3]

[Document(page_content="Psa44:24 Wherefore hidest thou thy face, and forgettest our affliction and our oppression?\nPsa44:25 For our soul is bowed down to the dust: our belly cleaveth unto the earth.\nPsa44:26 Arise for our help, and redeem us for thy mercies' sake.\nPsa45:1 My heart is inditing a good matter: I speak of the things which I have made touching the king: my tongue is the pen of a ready writer.", metadata={'source': './datasets/kjv.txt'}),
 Document(page_content='Psa90:10 The days of our years are threescore years and ten; and if by reason of strength they be fourscore years, yet is their strength labour and sorrow; for it is soon cut off, and we fly away.\nPsa90:11 Who knoweth the power of thine anger? even according to thy fear, so is thy wrath.\nPsa90:12 So teach us to number our days, that we may apply our hearts unto wisdom.\nPsa90:13 Return, O LORD, how long? and let it repent thee concerning thy servants.', metadata={'source': './datasets/kjv.txt'}),
 Document(page_

In [33]:
bm25_retriever.get_relevant_documents("Psa29:10")[:3]

[Document(page_content='Psa29:6 He maketh them also to skip like a calf; Lebanon and Sirion like a young unicorn.\nPsa29:7 The voice of the LORD divideth the flames of fire.\nPsa29:8 The voice of the LORD shaketh the wilderness; the LORD shaketh the wilderness of Kadesh.\nPsa29:9 The voice of the LORD maketh the hinds to calve, and discovereth the forests: and in his temple doth every one speak of his glory.\nPsa29:10 The LORD sitteth upon the flood; yea, the LORD sitteth King for ever.', metadata={'source': './datasets/kjv.txt'}),
 Document(page_content='1Ki19:5 And as he lay and slept under a juniper tree, behold, then an angel touched him, and said unto him, Arise and eat.\n1Ki19:6 And he looked, and, behold, there was a cake baken on the coals, and a cruse of water at his head. And he did eat and drink, and laid him down again.\n1Ki19:7 And the angel of the LORD came again the second time, and touched him, and said, Arise and eat; because the journey is too great for thee.', metada