In [27]:
import nest_asyncio
nest_asyncio.apply()

import os
from dotenv import load_dotenv
from llama_parse import LlamaParse
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

load_dotenv()
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001', google_api_key=GEMINI_API_KEY)
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=30)
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", google_api_key=GEMINI_API_KEY)

In [8]:
# doc_path = "D:\prod\pipeline\sharepoint-pipeline\Business-Correspondence.pdf"
# parser = LlamaParse(result_type="markdown").load_data(doc_path)

# file_name = "weaviate_db.md"
# with open(file_name, 'w', encoding='utf-8') as output_file:
#     for item in parser:
#         output_file.write(item.text)

  doc_path = "D:\prod\pipeline\sharepoint-pipeline\Business-Correspondence.pdf"


Started parsing the file under job_id 40b4374f-0222-496c-9c61-2ccaf0ba6927
.

In [9]:
loader = UnstructuredMarkdownLoader("D:\prod\pipeline\sharepoint-pipeline\weaviate_db.md")
docs = loader.load()
chunks = splitter.split_documents(docs)

  loader = UnstructuredMarkdownLoader("D:\prod\pipeline\sharepoint-pipeline\weaviate_db.md")


In [24]:

text = """
    You are an advanced AI assistant.
    Give the answer in brief and also give the Page Number on which the data is present.
    context: {context} input: {input} answer:
"""

In [28]:
vectorstore = Chroma.from_documents(chunks, embeddings)
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})
keyword_retriever = BM25Retriever.from_documents(chunks)

ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver, keyword_retriever], weights=[0.3, 0.7])

normal_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore_retreiver)

hybrid_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever)

# response = hybrid_chain.invoke("Making an offer")
# response




prompt = PromptTemplate.from_template(text)
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(ensemble_retriever, combine_docs_chain)


response = retrieval_chain.invoke({"input": "What should I do if I don't know the gender of my contact ?"})
print(response["answer"])

Use their first name and surname. (Page Number: 23)



In [29]:
response = retrieval_chain.invoke({"input": "What phrases to use to make an offer in terms to other team's request ?"})
print(response["answer"])

**Phrases to use:**

*   "Thank you for your interest in our products/services." (Page 3)
*   "We are pleased/happy/delighted to be able to email you our terms and conditions." (Page 3)
*   "Please note that the offer is valid for 30 days." (Page 3)



In [20]:
response = hybrid_chain.invoke("Making an offer")
response

{'query': 'Making an offer',
 'result': 'The provided text gives an example of making an offer in a business context:\n\n"Thank you for your interest in our products/services.  [Further details of the offer would follow here]"\n\nIt also mentions that all price quotes must be firm and state when they expire.  Beyond this example, there is no further information on making an offer.\n'}

In [23]:
response = retrieval_chain.invoke({"query": "Making an offer"})
response["answer"]

AttributeError: 'str' object has no attribute 'page_content'