In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from docx import Document
import os

# file = file.split("/")
# if (len(file) > 1):
#     FILE_NAME = file[-1]

In [2]:
# splitting file into chunks
def split_pdf_to_chunks(file):
    loader = PyMuPDFLoader(file)
    doc = loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
    chunks = splitter.split_documents(doc)

    data = []
    for i, chunk in enumerate(chunks):
        segment_data = {"page_no" : chunk.metadata["page"], "chunk_number" : i, "chunk_content" : chunk.page_content}
        data.append(segment_data)

    return data

def split_docx_to_chunks(file):
    doc = Document(file)
    text = '\n'.join([para.text for para in doc.paragraphs])

    splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 50)
    chunks = splitter.split_text(text)

    data = []
    for i, chunk in enumerate(chunks):
        segment_data = {
            "chunk_number": i,
            "chunk_content": chunk
        }
        data.append(segment_data)
    return data

In [3]:
# optional
def save_chunks_to_file(file, file_name):
    if (file.endswith(".pdf")):
        data = split_pdf_to_chunks(file)
    else:
        data = split_docx_to_chunks(file)
    if data:
        write_file = open(f"chunk_files/{file_name}_chunks.txt", "w")
        json.dump(data, write_file, indent = 2)
        write_file.close()

In [4]:
def get_chunks(file):
    if (file.endswith(".pdf")):
        data = split_pdf_to_chunks(file)
    elif (file.endswith(".docx")):
        data = split_docx_to_chunks(file)
    else:
        print("Not a '.docx' or '.pdf' file.")
    chunks = [i["chunk_content"] for i in data]

    return chunks

In [5]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

embedding_model = HuggingFaceBgeEmbeddings(model_name="all-MiniLM-L6-v2")

vectordb = Chroma(
    collection_name="brd_collection",
    persist_directory="./my_db",
    embedding_function=embedding_model
)

  embedding_model = HuggingFaceBgeEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
  from .autonotebook import tqdm as notebook_tqdm
  vectordb = Chroma(
  vectordb = Chroma(


In [6]:
def store_to_vectordb(chunks, file_name):
    from langchain.schema import Document

    if not chunks:
        print("No chunks to store")
        return

    docs = [Document(page_content=chunk, metadata={"source": file_name, "chunk_number": i}) for i, chunk in enumerate(chunks)]

    vectordb.add_documents(docs)
    vectordb.persist()

In [7]:
# using the above code on all the files
folder_path = "./all_files"
files = os.listdir(folder_path)

for file in files:
    file_path = rf"{folder_path}/{file}"
    if file_path.endswith(".pdf") or file_path.endswith("docx"):
        chunks = get_chunks(file_path)
        save_chunks_to_file(file_path, file)
        store_to_vectordb(chunks, file)
    print(f"Finished processing and storing {file}")
    

  vectordb.persist()


Finished processing and storing API GetEmployeeByEmail.docx
Finished processing and storing API GetEmployeeById.docx
Finished processing and storing Beon Form - Copy.docx
Finished processing and storing BG Proposal cum Advance Approval Form .docx
Finished processing and storing Beon Form - Copy.docx
Finished processing and storing BG Proposal cum Advance Approval Form .docx
Finished processing and storing BRD of IRIS User ID Creation Form.docx
Finished processing and storing BRD of Request for Cancellation of Penalty Invoice.docx
Finished processing and storing BRD of IRIS User ID Creation Form.docx
Finished processing and storing BRD of Request for Cancellation of Penalty Invoice.docx
Finished processing and storing BRD-Computer Tomography Part Submission form.docx
Finished processing and storing BRD-Computer Tomography Part Submission form.docx
Finished processing and storing BRD_Consulate Issuing Form.docx
Finished processing and storing BRD_Invitation Letter Form.docx
Finished proc

In [8]:
retriever = vectordb.as_retriever(search_kwargs={"k":5})

'''
the above line creates a retriever object from the vectordb.
it returns the top5 most relevant chunks
'''

'\nthe above line creates a retriever object from the vectordb.\nit returns the top5 most relevant chunks\n'

In [14]:
# creating RAG chain using HuggingFaceHub 
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
import os

llm = Ollama(model="mistral")
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [None]:
def handle_queries(query):
    try:
        return qa_chain.invoke(query)
    except Exception as e:
        print("Error:", e)

In [16]:
result = handle_queries("what is the action for compliance cell?")
print(result["result"])
print("-------------------------------------------------------------------------------------------")

 The Action for Compliance Cell is to Select name of signature authority and it will print in final PDF with his details mention in Information document.
-------------------------------------------------------------------------------------------


In [12]:
# List of sample queries to test the RAG pipeline
queries = [
    "What is the process for IRIS User ID creation?",
    "Describe the steps for Amazon RDS to Snowflake data migration.",
    "What is the approval workflow for the Scrap Approval Form?",
    "How to request cancellation of a penalty invoice?",
    "What are the requirements for Vendor Payment Voucher?",
    "Explain the role authorization process for SAP User ID.",
    "What is the function of the Consulate Issuing Form?",
    "How to submit a Computer Tomography Part Submission form?"
]

In [17]:
# Run each query and print the results
for idx, query in enumerate(queries, 1):
    print(f"\nQuery {idx}: {query}")
    result = handle_queries(query)
    if result:
        print(f"Answer: {result["result"]}")
    else:
        print("No relevant documents found.")


Query 1: What is the process for IRIS User ID creation?
Answer:  The process for creating an IRIS (Integrated Review and Approval System) User ID appears to be a repetition of the form title. However, without additional context or details about the specific form fields, I cannot provide the exact step-by-step process. It is recommended that you refer to the instructions provided alongside the form for accurate information on how to create an IRIS User ID.

Query 2: Describe the steps for Amazon RDS to Snowflake data migration.
Answer:  The process for creating an IRIS (Integrated Review and Approval System) User ID appears to be a repetition of the form title. However, without additional context or details about the specific form fields, I cannot provide the exact step-by-step process. It is recommended that you refer to the instructions provided alongside the form for accurate information on how to create an IRIS User ID.

Query 2: Describe the steps for Amazon RDS to Snowflake data 