In [37]:
## Merging two documents, i.e. a PDF and an XLSX file

import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain_nomic.embeddings import NomicEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import fitz  # PyMuPDF

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page, concatenated in page
        order with nothing inserted between pages. An empty string when the
        document has no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        # Build the result with a single join instead of growing a string
        # page by page.
        return "".join(
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        )

# Extract data from Excel
def extract_data_from_excel(excel_path):
    """Load an Excel file with pandas and render it as plain text.

    Args:
        excel_path: Location of the Excel file, forwarded to
            ``pd.read_excel`` with no other arguments.

    Returns:
        str: The ``DataFrame.to_string()`` rendering of the loaded frame.
    """
    return pd.read_excel(excel_path).to_string()

# Combine PDF text and Excel data
def combine_data(pdf_text, excel_text):
    """Join the PDF text and the Excel text into a single string.

    Args:
        pdf_text: Text extracted from the PDF.
        excel_text: Text rendering of the Excel data.

    Returns:
        str: ``pdf_text`` followed by a blank line and then ``excel_text``.
    """
    separator = "\n\n"
    return f"{pdf_text}{separator}{excel_text}"

# Split text into chunks
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=100):
    """Break ``text`` into pieces with a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Input files for the merged pipeline
PDF_PATH = r"C:\Users\Pritish\Downloads\Complete (2).pdf"
EXCEL_PATH = r"C:\Users\Pritish\Downloads\Yearly_Returns.xlsx"

# Pull the raw text out of both sources
pdf_text = extract_text_from_pdf(PDF_PATH)
excel_text = extract_data_from_excel(EXCEL_PATH)

# Join the two texts, then cut the combined text into chunks
merged_data = combine_data(pdf_text, excel_text)
doc_splits = split_text_into_chunks(merged_data)

# Wrap every chunk in a Document so it can be indexed
documents = [Document(page_content=piece) for piece in doc_splits]

# Embed the documents into a single Chroma collection
embedding_model = NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local")
vectorstore = Chroma.from_documents(
    documents=documents,
    collection_name="rag-chroma",
    embedding=embedding_model,
)

# Expose the collection through the retriever interface
retriever = vectorstore.as_retriever()

# Groq API
import os
from getpass import getpass

from langchain_groq import ChatGroq
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Configure the Groq LLM.
# The key is taken from the GROQ_API_KEY environment variable, falling back to
# an interactive prompt, so that no secret is stored in the notebook itself.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It", temperature=0)

# Define the prompt template
template = """I will provide you pieces of [Context] to answer the [Question]. 
If you don't know the answer based on [Context], just say that you don't know. Don't make up an answer.
[Context]: {context}
[Question]: {question}
Helpful Answer:"""

# Create a PromptTemplate object
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Example question
question = "What is the document about"

# Retrieve relevant documents
retrieved_docs = retriever.invoke(question)

if retrieved_docs:
    # Only the first retrieved document is used as context.
    context_from_docs = retrieved_docs[0].page_content

    # Create the final prompt using the template
    final_prompt = prompt.format(context=context_from_docs, question=question)

    # Invoke the LLM directly with the formatted prompt
    response = llm.invoke(final_prompt)
else:
    # Nothing was retrieved: skip the LLM call instead of failing with an
    # IndexError on retrieved_docs[0].
    response = "No relevant documents found."

# Print the result
print(response)


content='Based on the provided context, it appears to be a table of financial data. \n\nIt includes information like:\n\n* **Fund Names:**  "1 Year T Bill (Additional Benchmark)" and "ICICI Prudential Corporate Bond Fund"\n* **Numerical Values:**  These likely represent financial metrics like yields or returns.\n* **Dates:**  "2011-04-05 00:00:00" is associated with the second fund.\n\n\nWithout more context, it\'s difficult to say for sure what the specific document is about. \n' response_metadata={'token_usage': {'completion_tokens': 123, 'prompt_tokens': 183, 'total_tokens': 306, 'completion_time': 0.223636364, 'prompt_time': 0.009945688, 'queue_time': 0.005479221000000001, 'total_time': 0.233582052}, 'model_name': 'Gemma2-9b-It', 'system_fingerprint': 'fp_10c08bf97d', 'finish_reason': 'stop', 'logprobs': None} id='run-52ccc3d9-8a7c-42bd-b181-33282fbf4abe-0' usage_metadata={'input_tokens': 183, 'output_tokens': 123, 'total_tokens': 306}


In [39]:
## Routing between two documents, i.e. a PDF and an XLSX file


import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain_nomic.embeddings import NomicEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import fitz  # PyMuPDF
from langchain_groq import ChatGroq
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page, concatenated in page
        order with nothing inserted between pages. An empty string when the
        document has no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        # Build the result with a single join instead of growing a string
        # page by page.
        return "".join(
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        )

# Extract data from Excel
def extract_data_from_excel(excel_path):
    """Read an Excel file via pandas and return its text rendering.

    Args:
        excel_path: Location of the Excel file, forwarded to
            ``pd.read_excel`` with no other arguments.

    Returns:
        str: The result of calling ``to_string()`` on the loaded DataFrame.
    """
    return pd.read_excel(excel_path).to_string()

# Split text into chunks
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=100):
    """Split ``text`` using a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The value returned by the splitter's ``split_text`` for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Source files for the two independent collections
PDF_PATH = r"C:\Users\Pritish\Downloads\Complete (2).pdf"
EXCEL_PATH = r"C:\Users\Pritish\Downloads\Yearly_Returns.xlsx"

# PDF: extract text, chunk it, wrap the chunks in Documents
pdf_text = extract_text_from_pdf(PDF_PATH)
pdf_chunks = split_text_into_chunks(pdf_text)
pdf_documents = [Document(page_content=piece) for piece in pdf_chunks]

# PDF: index the Documents in their own Chroma collection
pdf_embedding = NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local")
pdf_vectorstore = Chroma.from_documents(
    documents=pdf_documents,
    collection_name="pdf-chroma",
    embedding=pdf_embedding,
)

# Excel: extract text, chunk it, wrap the chunks in Documents
excel_text = extract_data_from_excel(EXCEL_PATH)
excel_chunks = split_text_into_chunks(excel_text)
excel_documents = [Document(page_content=piece) for piece in excel_chunks]

# Excel: index the Documents in a separate Chroma collection
excel_embedding = NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local")
excel_vectorstore = Chroma.from_documents(
    documents=excel_documents,
    collection_name="excel-chroma",
    embedding=excel_embedding,
)

# One retriever per collection, used by the routing step
pdf_retriever = pdf_vectorstore.as_retriever()
excel_retriever = excel_vectorstore.as_retriever()

# Configure the Groq LLM
import os
from getpass import getpass

# The key is taken from the GROQ_API_KEY environment variable, falling back to
# an interactive prompt, so that no secret is stored in the notebook itself.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It", temperature=0)

# Define the prompt template
template = """I will provide you pieces of [Context] to answer the [Question]. 
If you don't know the answer based on [Context], just say that you don't know. Don't make up an answer.
[Context]: {context}
[Question]: {question}
Helpful Answer:"""

# Create a PromptTemplate object
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Function to route query to the appropriate retriever
def route_query(query, source_type):
    """Answer ``query`` using the retriever selected by ``source_type``.

    Args:
        query: The question; used both for retrieval and in the prompt.
        source_type: ``'pdf'`` selects ``pdf_retriever``, ``'excel'`` selects
            ``excel_retriever``.

    Returns:
        The result of ``llm.invoke`` on the formatted prompt, or the string
        ``"No relevant documents found."`` when retrieval returns nothing.

    Raises:
        ValueError: If ``source_type`` is neither ``'pdf'`` nor ``'excel'``.
    """
    # Reject unknown sources before touching any retriever.
    if source_type not in ('pdf', 'excel'):
        raise ValueError("Unsupported source type")

    selected = pdf_retriever if source_type == 'pdf' else excel_retriever

    hits = selected.invoke(query)
    if not hits:
        return "No relevant documents found."

    # Only the first retrieved document is used as context.
    top_hit = hits[0]
    return llm.invoke(prompt.format(context=top_hit.page_content, question=query))

# Demo: send one question to the Excel-backed retriever
source_type = 'excel'  # either 'pdf' or 'excel'
query = "What is the document about?"
response = route_query(query, source_type)

print(response)


content='Based on the provided context, the document appears to be about financial instruments and their performance. \n\nIt lists various items, including:\n\n* **1 Year T Bill (Additional Benchmark):**  Likely a benchmark for short-term interest rates.\n* **ICICI Prudential Floating Interest Fund:** A type of mutual fund that invests in floating-rate debt securities.\n* **NIFTY Low Duration Debt Index A-I (Benchmark):** A benchmark index tracking low-duration debt securities.\n\nEach entry includes numerical values that could represent yields, returns, or other financial metrics.  \n\n\nLet me know if you have more context or specific questions! \n' response_metadata={'token_usage': {'completion_tokens': 136, 'prompt_tokens': 281, 'total_tokens': 417, 'completion_time': 0.247272727, 'prompt_time': 0.045158853, 'queue_time': 0.429282371, 'total_time': 0.29243158}, 'model_name': 'Gemma2-9b-It', 'system_fingerprint': 'fp_10c08bf97d', 'finish_reason': 'stop', 'logprobs': None} id='run-62