**Install Required Packages**


In [None]:
# Install required packages
!pip install langchain PyMuPDF
!pip install langchain_google_genai
!pip install sentence-transformers


**Import Libraries**

In [None]:
# Import necessary libraries
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Qdrant
import os
import numpy as np
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


**Load PDF Files**

In [None]:
# Root folder that is scanned for PDF files
# (presumably a mounted Google Drive path when run in Colab — confirm for your setup)
DATA_FOLDER_PATH = "/content/drive/MyDrive/data"

# Walk the folder recursively, parsing every *.pdf with PyMuPDF and showing a progress bar
loader = DirectoryLoader(
    DATA_FOLDER_PATH,
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=True,
)
documents = loader.load()

# Cut the loaded documents into chunks of up to 1000 characters, with a
# 40-character overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=40,
)
docs = text_splitter.split_documents(documents)


**Initialize SentenceTransformer Embedding Model**

In [None]:
# Embedding model handed to the Qdrant vector store below.
# Original author's note: this model was picked because it produces 768-dimensional vectors.
embeddings = SentenceTransformerEmbeddings(
    model_name="all-Mpnet-base-v2",
)


**Configure Qdrant**

In [None]:
# Qdrant Cloud configuration.
# Credentials are read from the environment (QDRANT_API_KEY / QDRANT_URL) so real secrets
# never have to be saved inside the notebook. The literal fallbacks are the original
# placeholders and must be replaced (or the variables set) before this cell can connect.
qdrant_cloud_api_key = os.environ.get("QDRANT_API_KEY", "qdrant_api")
qdrant_url = os.environ.get("QDRANT_URL", "qdrant_url")

# Embed every chunk in `docs` with `embeddings` and store the vectors in the
# "new_docs" collection, connecting over gRPC when possible.
# NOTE(review): force_recreate=True asks the store to recreate the collection, so any
# existing "new_docs" data is presumably discarded on every run — confirm this is intended.
qdrant_cloud = Qdrant.from_documents(
    docs,
    embeddings,
    url=qdrant_url,
    prefer_grpc=True,
    api_key=qdrant_cloud_api_key,
    collection_name="new_docs",
    force_recreate=True,
)


**Configure Gemini API**

In [None]:
# Read the Gemini API key from the GEMINI_API_KEY environment variable.
# (Previously a placeholder literal was assigned here, which meant the "missing key"
# check below could never trigger and a bogus key was silently configured.)
gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not gemini_api_key:
    raise ValueError(
        "Gemini API key not found. Please set the GEMINI_API_KEY environment variable."
    )

# Register the key with the google.generativeai client used by get_gemini_response
genai.configure(api_key=gemini_api_key)


**Define Functions**

In [None]:
def get_conversational_chain():
    """Build the question-answering chain that runs on the Gemini chat model.

    The chain uses the "stuff" strategy with a prompt that takes two inputs,
    ``context`` and ``question``, and instructs the model to answer only from
    the supplied context.

    Returns:
        The chain object produced by ``load_qa_chain``.
    """
    qa_template = """
    Answer the question as detailed as possible from the provided context. Check for the relationships in the content. Generate meaningful context-aware bullet points. If the answer is not in
    the provided context, just say "answer is not available in the context." Do not provide a wrong answer.\n\n
    Context:\n {context}\n
    Question:\n {question}\n

    Answer:
    """

    qa_prompt = PromptTemplate(
        template=qa_template,
        input_variables=["context", "question"],
    )
    # Low temperature (0.3) as configured by the original author
    llm = ChatGoogleGenerativeAI(
        model="gemini-pro",
        temperature=0.3,
        api_key=gemini_api_key,
    )

    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)

def get_gemini_response(input_text):
    """Send ``input_text`` to the 'gemini-pro' generative model and return its text.

    A "Summary:" banner is printed after the model call. The text is taken from
    the first part of the first candidate in the response.

    Args:
        input_text: Prompt passed to ``generate_content``.

    Returns:
        The generated text, or an ``"Error: ..."`` string when the text cannot
        be extracted from the response. Errors raised by the model call itself
        are not caught here.
    """
    gemini = genai.GenerativeModel('gemini-pro')
    reply = gemini.generate_content(input_text)

    print("\n\n Summary:\n\n")
    try:
        first_candidate = reply.candidates[0]
        first_part = first_candidate.content.parts[0]
        return first_part.text
    except AttributeError as err:
        # The response object did not have the expected attribute structure
        print(f"Attribute error: {err}")
        return "Error: Unable to retrieve response content."
    except Exception as err:
        # Anything else, e.g. an empty candidates/parts list
        print(f"An error occurred: {err}")
        return "Error: Something went wrong."

def hybrid_search(query, qdrant_store, documents, k=5):
    """Combine semantic and keyword search results with reciprocal rank fusion (RRF).

    Two ranked lists are produced:

    * semantic hits from ``qdrant_store.similarity_search(query)`` (the store's
      default result count is used);
    * keyword hits: the first ``k`` entries of ``documents`` whose
      ``page_content`` contains ``query`` as a case-insensitive substring.

    Each document scores ``1 / rank`` (rank starting at 1) in every list it
    appears in, and the scores are summed. A document found by both searches
    is therefore returned once, with a boosted score, instead of appearing
    twice in the output.

    Args:
        query: Search text.
        qdrant_store: Object exposing ``similarity_search(query)`` that returns
            a ranked list of documents.
        documents: Iterable of documents (objects with ``page_content``) to
            scan for the keyword match.
        k: Maximum number of keyword hits to take into account.

    Returns:
        List of unique documents ordered by descending fused score. Ties keep
        semantic hits ahead of keyword hits.
    """
    # Semantic leg: vector similarity search in Qdrant
    found_docs_qdrant = qdrant_store.similarity_search(query)

    # Keyword leg: lowercase the query once instead of once per document
    needle = query.lower()
    found_docs_keyword = [
        doc for doc in documents if needle in doc.page_content.lower()
    ][:k]

    # Documents are identified by their text: the objects returned by the store are
    # not the same instances as the entries of `documents`, so identity cannot be used.
    fused_scores = {}
    representative = {}
    for ranked_docs in (found_docs_qdrant, found_docs_keyword):
        counted_in_this_list = set()
        for rank, doc in enumerate(ranked_docs, start=1):
            key = doc.page_content
            if key in counted_in_this_list:
                # A repeated chunk within one list only scores at its best rank
                continue
            counted_in_this_list.add(key)
            representative.setdefault(key, doc)
            fused_scores[key] = fused_scores.get(key, 0.0) + 1.0 / rank

    # sorted() is stable, so equal scores keep insertion order (semantic hits first)
    ordered_keys = sorted(fused_scores, key=fused_scores.get, reverse=True)
    return [representative[key] for key in ordered_keys]


**Main Function to Process the Query**

In [None]:
def main(query="Describe the rivers in Maharashtra"):
    """Run a hybrid search for ``query`` and print a short report.

    The search runs against the module-level ``qdrant_cloud`` store and the
    chunked ``docs``, with ``k=5`` keyword hits. The function prints how many
    documents were retrieved and, if there are any, the text of the
    top-ranked one. No answer is generated by a language model here.

    Args:
        query: Search text. Defaults to the original demo query so existing
            ``main()`` calls behave as before.
    """
    # Perform hybrid search
    hybrid_results = hybrid_search(query, qdrant_cloud, docs, k=5)

    # Display the results of hybrid search
    print(f"\n\nNumber of relevant documents: {len(hybrid_results)}")
    if hybrid_results:
        print("Retrieved Content:")
        print(hybrid_results[0].page_content)

# Call the main function.
# The guard runs the demo query only when this code executes as the top-level
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()
