# Hybrid search / Ensemble Retriever

## Understanding how TF-IDF works with sparse embeddings

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Toy corpus for the TF-IDF walkthrough (four short sentences)
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
]

# Free-text query the corpus is ranked against
query = "keyword-based search"

In [3]:
import re
def preprocess_text(text: str) -> str:
    """Normalize a string for keyword matching.

    The input is lowercased and every character that is neither a word
    character (letter, digit, underscore) nor whitespace is deleted.
    Punctuation is removed rather than replaced by a space, so a hyphenated
    term such as "keyword-based" collapses into the single token
    "keywordbased".

    Args:
        text: Raw input string.

    Returns:
        The lowercased string with punctuation stripped.
    """
    return re.sub(r'[^\w\s]', '', text.lower())


In [None]:
# Apply the same normalization to every document in the corpus
preprocess_documents = list(map(preprocess_text, documents))
preprocess_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [5]:
# Header first, then one normalized document per line
print("Preprocessed Documents:", *preprocess_documents, sep="\n")

Preprocessed Documents:
this is a list which containig sample documents
keywords are important for keywordbased search
document analysis involves extracting keywords
keywordbased search relies on sparse embeddings


In [6]:
# `query` is still the raw user string at this point (it is only normalized
# in the following cell), so it is labelled as the original query rather
# than as a preprocessed one.
print("Original Query:")
print(query)

Preprocessed Query:
keyword-based search


In [None]:
# Normalize the query with the same function used for the documents so both
# sides are lowercased and stripped of punctuation in the same way
preprocessed_query = preprocess_text(query)
preprocessed_query

'keywordbased search'

In [None]:
# Fit a TF-IDF vectorizer on the preprocessed corpus and encode the documents.
# toarray() densifies the result for display: one row per document, one
# column per vocabulary term.
vector=TfidfVectorizer()
X=vector.fit_transform(preprocess_documents)
X.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [None]:
# TF-IDF vector of the first sentence (row 0 of the document matrix)
X.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [None]:
# Encode the query with the vectorizer that was fitted on the documents
# (transform only, so the query is projected onto the document vocabulary).
query_embedding = vector.transform([preprocessed_query])

# Cosine similarity between every document row and the query vector.
# Only the last expression of a cell is displayed, so this cell ends on the
# similarity column and carries no intermediate bare expressions.
similarities = cosine_similarity(X, query_embedding)
similarities

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

In [None]:
# Document indices ordered by ascending similarity (argsort sorts low -> high),
# so the best-matching document is the last entry of the column
np.argsort(similarities,axis=0)

array([[0],
       [2],
       [3],
       [1]])

In [None]:
# argsort is ascending, so reverse it to rank from most to least similar
ascending_order = np.argsort(similarities, axis=0)
ranked_indices = ascending_order[::-1].flatten()
print(f"Ranked Indices: {ranked_indices}")

# Map the ranked indices back to the original (un-preprocessed) sentences
ranked_documents = [documents[idx] for idx in ranked_indices]
print(f"Ranked Documents: {ranked_documents}")

Ranked Indices: [1 3 2 0]
Ranked Documents: ['Keywords are important for keyword-based search.', 'Keyword-based search relies on sparse embeddings.', 'Document analysis involves extracting keywords.', 'This is a list which containig sample documents.']


In [13]:
# Print the documents best match first, numbering the ranks from 1
for i, doc in enumerate(ranked_documents):
    print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important for keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containig sample documents.


## Dense vectors - mocked here; real ones can be created using sentence transformers

In [None]:
# Same toy corpus and query as in the TF-IDF section, declared again at the
# start of the dense-vector section
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

query="keyword-based search"

In [14]:
# Hand-written 5-dimensional stand-ins for dense document embeddings.
# NOTE(review): only three vectors are defined here while `documents` lists
# four sentences -- confirm whether a fourth row is missing.
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [15]:
# Sample search query (represented as a dense vector)
# NOTE: this rebinds `query_embedding`, which earlier held the TF-IDF query
# vector, to a hand-written 1x5 dense array.
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

In [16]:
# Calculate cosine similarity between the query and each mock document
# vector; the result is a column with one score per embedding row
similarities = cosine_similarity(document_embeddings, query_embedding)
similarities

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

In [17]:
# argsort is ascending, so reverse it to get row indices from most to least
# similar, then flatten the column into a 1-D array
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()
ranked_indices

array([0, 2, 1])

In [18]:
# Print the ranking; rank and document numbers are shown 1-based
# (idx is the 0-based row of `document_embeddings`)
for i, idx in enumerate(ranked_indices):
    print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


## Creating RAG

In [21]:
# PDF that is loaded and indexed below; given as a relative path
doc_path="RAG_paper.pdf"

In [23]:
import os
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment
load_dotenv()

# os.environ only accepts strings: assigning the None that os.getenv returns
# for a missing key raises an opaque TypeError, so fail early with an
# actionable message instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in your environment or .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the PDF document
loader = PyPDFLoader(doc_path)
docs = loader.load()

# Split the loaded pages into small overlapping chunks for retrieval
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=30)
chunks = splitter.split_documents(docs)

# Preview only the first few chunks: echoing the whole list floods the
# notebook with one full Document repr per chunk
chunks[:3]


Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)


[Document(metadata={'producer': 'macOS Version 10.15.7 (Build 19H15) Quartz PDFContext', 'creator': 'LaTeX with hyperref', 'creationdate': "D:20210107190157Z00'00'", 'moddate': "D:20210107190157Z00'00'", 'source': 'RAG_paper.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}, page_content='Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡ , Ethan Perez?,\nAleksandra Piktus† , Fabio Petroni† , Vladimir Karpukhin† , Naman Goyal† , Heinrich Küttler† ,'),
 Document(metadata={'producer': 'macOS Version 10.15.7 (Build 19H15) Quartz PDFContext', 'creator': 'LaTeX with hyperref', 'creationdate': "D:20210107190157Z00'00'", 'moddate': "D:20210107190157Z00'00'", 'source': 'RAG_paper.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}, page_content='Mike Lewis† , Wen-tau Yih† , Tim Rocktäschel†‡ , Sebastian Riedel†‡ , Douwe Kiela†\n† Facebook AI Research;‡ University College London;?New York University;\nplewis@fb.com\nAbstract'),
 Document(metadata={'produc

In [None]:
from langchain_openai import OpenAIEmbeddings
# Dense embedding model used to index the chunks
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Alternative (left disabled): Hugging Face Inference API embeddings.
# NOTE: HF_TOKEN is not defined anywhere in the visible notebook.
# from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
# embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")

In [26]:
# Import from langchain_community: the `langchain.vectorstores` path is a
# deprecated shim that resolves to this same class.
from langchain_community.vectorstores import Chroma

# Dense index: embed every chunk and store the vectors in Chroma
vectorstore = Chroma.from_documents(chunks, embeddings)
# Retriever that returns the 3 nearest chunks per query
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000014A37333590>, search_kwargs={'k': 3})

# Mixing vector search and keyword search for Hybrid search

### hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score

In [28]:
# BM25Retriever is implemented in langchain_community; importing it through
# `langchain.retrievers` relies on a deprecated re-export.
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# Sparse / keyword retriever (BM25) over the same chunks, returning 3 hits
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k = 3

# Hybrid retriever: weights follow the order of `retrievers`, i.e.
# 0.3 for the dense vector store and 0.7 for the BM25 keyword retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore_retreiver, keyword_retriever],
    weights=[0.3, 0.7],
)

## Initialize LLM

In [29]:
from langchain_openai import ChatOpenAI
# Chat model that generates the answers in the QA chains below
llm = ChatOpenAI(model="gpt-4o")


# alternative loading quantized model from hugging face
# model_name = "HuggingFaceH4/zephyr-7b-beta"

# import torch
# from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
# from langchain import HuggingFacePipeline

# # function for loading 4-bit quantized model
# def load_quantized_model(model_name: str):
#     """
#     model_name: Name or path of the model to be loaded.
#     return: Loaded quantized model.
#     """
#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16,
#     )

#     model = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         torch_dtype=torch.bfloat16,
#         quantization_config=bnb_config,
#     )
#     return model

# # initializing tokenizer
# def initialize_tokenizer(model_name: str):
#     """
#     model_name: Name or path of the model for tokenizer initialization.
#     return: Initialized tokenizer.
#     """
#     tokenizer = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
#     tokenizer.bos_token_id = 1  # Set beginning of sentence token id
#     return tokenizer

# tokenizer = initialize_tokenizer(model_name)
# model = load_quantized_model(model_name)
# pipeline = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     use_cache=True,
#     device_map="auto",
#     max_length=2048,
#     do_sample=True,
#     top_k=5,
#     num_return_sequences=1,
#     eos_token_id=tokenizer.eos_token_id,
#     pad_token_id=tokenizer.pad_token_id,
# )
# llm = HuggingFacePipeline(pipeline=pipeline)

## Retrieval QA chain

In [30]:
from langchain.chains import RetrievalQA

# Both chains share the same LLM and the "stuff" strategy; they differ only
# in which retriever supplies the context.
qa_kwargs = {"llm": llm, "chain_type": "stuff"}

# Baseline: dense vector retriever only
normal_chain = RetrievalQA.from_chain_type(retriever=vectorstore_retreiver, **qa_kwargs)

# Hybrid: ensemble of the dense and BM25 retrievers
hybrid_chain = RetrievalQA.from_chain_type(retriever=ensemble_retriever, **qa_kwargs)

## Output from normal vector retriever

In [33]:
# Ask the dense-only chain and print just the generated answer text
question = "What is Abstractive Question Answering?"
response1 = normal_chain.invoke(question)
answer1 = response1.get("result")
print(answer1)

Abstractive Question Answering is a type of question-answering task where the model generates free-form, natural language responses to questions. This approach goes beyond simply extracting answers from a document by synthesizing information to form a new, coherent answer, even when the exact answer is not found in the retrieved documents. This allows the model to provide responses that better match the intent of the question and offers more flexibility compared to extractive models.


## Output from hybrid retriever

In [34]:
# Ask the hybrid (dense + BM25) chain the same question for comparison
question = "What is Abstractive Question Answering?"
response2 = hybrid_chain.invoke(question)
answer2 = response2.get("result")
print(answer2)

Abstractive Question Answering (QA) is a type of question answering task where the model generates a free-form, original response that goes beyond simply extracting information from a given text. Unlike extractive QA, which pulls exact phrases or sentences from the source material, abstractive QA allows the model to provide answers that may not explicitly appear in any retrieved document, offering a more comprehensive and synthesized response. This capability allows models to achieve a level of understanding and accuracy that includes generating summaries or explanations that might not be directly stated in the reference material.
