In [1]:
import fitz
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import json
from docx import Document as Doc
import os
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.schema import Document
import io

In [None]:
def get_and_read_file(file):
    """Extract the plain text of an uploaded ``.docx`` or ``.pdf`` file.

    Args:
        file: Binary file-like object exposing ``read()`` and a ``name``
            attribute; the extension of ``name`` (matched case-insensitively)
            selects the parser.

    Returns:
        str: Paragraph texts (docx) or page texts (pdf) joined with newlines.
        An empty string is returned for any other extension.
    """
    # Rewind when the stream supports it so a previously consumed upload is
    # still read in full, whichever parser ends up handling it.
    is_seekable = getattr(file, "seekable", None)
    if callable(is_seekable) and is_seekable():
        file.seek(0)
    content = file.read()

    name = file.name.lower()
    if name.endswith(".docx"):
        doc = Doc(io.BytesIO(content))
        return "\n".join(para.text for para in doc.paragraphs)
    if name.endswith(".pdf"):
        # The context manager closes the PyMuPDF document once the text of
        # every page has been collected.
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    # Unsupported type: hand back an empty string rather than None so callers
    # always receive a str.
    return ""

In [None]:
# Splitting a file into semantic chunks.
# Cache of SemanticChunker instances keyed by embedding model name, so the
# embedding model is constructed once instead of once per processed file.
_SEMANTIC_SPLITTERS = {}


def _get_semantic_splitter(model_name="BAAI/bge-large-en-v1.5"):
    """Return the cached ``SemanticChunker`` for ``model_name``, creating it on first use."""
    if model_name not in _SEMANTIC_SPLITTERS:
        _SEMANTIC_SPLITTERS[model_name] = SemanticChunker(
            embeddings=HuggingFaceEmbeddings(model_name=model_name)
        )
    return _SEMANTIC_SPLITTERS[model_name]


def _split_file_to_chunks(file):
    """Read ``file`` with ``get_and_read_file`` and split its text into numbered chunk records."""
    text = get_and_read_file(file)
    # Nothing to split (unsupported file or empty document): skip the splitter.
    if not text or not text.strip():
        return []
    chunks = _get_semantic_splitter().split_text(text)
    return [
        {"chunk_number": i, "chunk_content": chunk}
        for i, chunk in enumerate(chunks)
    ]


def split_pdf_to_chunks(file):
    """Split a PDF upload into semantic chunks.

    Args:
        file: File-like object accepted by ``get_and_read_file``.

    Returns:
        list[dict]: One ``{"chunk_number": int, "chunk_content": str}`` record
        per chunk, in splitter order; empty when no text could be extracted.
    """
    return _split_file_to_chunks(file)


def split_docx_to_chunks(file):
    """Split a DOCX upload into semantic chunks.

    Args:
        file: File-like object accepted by ``get_and_read_file``.

    Returns:
        list[dict]: One ``{"chunk_number": int, "chunk_content": str}`` record
        per chunk, in splitter order; empty when no text could be extracted.
    """
    return _split_file_to_chunks(file)

In [None]:
def get_chunks(file, file_name):
    """Return the chunk texts of a ``.pdf`` or ``.docx`` file.

    Args:
        file: File-like object forwarded to the matching split helper.
        file_name: Name whose extension decides which helper is used.

    Returns:
        list: The ``"chunk_content"`` value of every record produced by the
        split helper, or an empty list (after printing a notice) when the
        extension is neither ``.pdf`` nor ``.docx``.
    """
    is_pdf = file_name.endswith(".pdf")
    if not is_pdf and not file_name.endswith(".docx"):
        print("Not a '.docx' or '.pdf' file.")
        return []

    split_file = split_pdf_to_chunks if is_pdf else split_docx_to_chunks
    return [record["chunk_content"] for record in split_file(file)]

In [22]:
def remove_duplicate_chunks(chunks):
    """Strip every chunk and drop blanks and repeats, keeping first-seen order.

    Args:
        chunks: Iterable of strings.

    Returns:
        list[str]: The stripped, non-empty chunks with later duplicates removed.
    """
    stripped = (raw.strip() for raw in chunks)
    # dict keys keep insertion order, so this de-duplicates while preserving
    # the position of each chunk's first occurrence.
    return list(dict.fromkeys(text for text in stripped if text))

In [None]:
# optional
def save_chunks_to_file(file, file_name):
    """Chunk a ``.pdf``/``.docx`` upload and dump the chunk records as JSON.

    The records are written to ``chunk_files/<file_name>_chunks.txt``; the
    ``chunk_files`` directory is created when missing. Nothing is written when
    the file type is unsupported or no chunks were produced.

    Args:
        file: File-like object accepted by the split helpers (they call
            ``read()`` and use ``name`` on it).
        file_name: Name used to build the output path; also used to detect the
            file type when ``file`` has no string ``name`` attribute.

    Returns:
        None
    """
    # The extension has to be read from a name string: ``file`` itself is a
    # file object and has no ``endswith``.
    attached_name = getattr(file, "name", None)
    source_name = attached_name if isinstance(attached_name, str) else file_name

    if source_name.endswith(".pdf"):
        data = split_pdf_to_chunks(file)
    elif source_name.endswith(".docx"):
        data = split_docx_to_chunks(file)
    else:
        print("Not a '.docx' or '.pdf' file.")
        return

    if data:
        os.makedirs("chunk_files", exist_ok=True)
        with open(f"chunk_files/{file_name}_chunks.txt", "w") as write_file:
            json.dump(data, write_file, indent=2)

In [20]:
def init_db(
    collection_name="brd_collection",
    persist_directory="./my_db",
    embedding_model_name="all-MiniLM-L6-v2",
    k=5,
):
    """Open the persistent Chroma collection and build a retriever on top of it.

    All parameters default to the values that were previously hard-coded, so
    ``init_db()`` behaves as before.

    Args:
        collection_name: Name of the Chroma collection to open.
        persist_directory: Directory passed to Chroma as ``persist_directory``.
        embedding_model_name: Model name passed to ``HuggingFaceEmbeddings``.
        k: Value passed to the retriever as ``search_kwargs["k"]``.

    Returns:
        tuple: ``(vectordb, retriever)`` where ``retriever`` comes from
        ``vectordb.as_retriever``.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    vectordb = Chroma(
        collection_name=collection_name,
        persist_directory=persist_directory,
        embedding_function=embedding_model,
    )
    retriever = vectordb.as_retriever(search_kwargs={"k": k})
    return vectordb, retriever

In [21]:
def store_to_vectordb(vectordb, chunks, file_name):
    """Wrap each chunk in a ``Document`` and add the batch to the vector store.

    Every document carries ``{"source": file_name, "chunk_number": <index>}``
    as metadata. Nothing happens when ``chunks`` is empty or ``None``.

    Args:
        vectordb: Object exposing ``add_documents``.
        chunks: Sequence of chunk texts.
        file_name: Value stored under the ``"source"`` metadata key.

    Returns:
        None
    """
    if chunks:
        documents = []
        for position, text in enumerate(chunks):
            metadata = {"source": file_name, "chunk_number": position}
            documents.append(Document(page_content=text, metadata=metadata))

        vectordb.add_documents(documents)

In [23]:
# Building the RAG question-answering chain.
def create_qa_chain(retriever):
    """Create a ``RetrievalQA`` "stuff" chain over ``retriever``.

    The chain uses the Ollama ``mistral`` model and a prompt that tells the
    model to answer only from the supplied context and to reply
    'Not found in the provided documents.' otherwise.

    Args:
        retriever: Retriever handed to ``RetrievalQA.from_chain_type``.

    Returns:
        The chain object returned by ``RetrievalQA.from_chain_type``.
    """
    prompt_text = (
        "Based only on the following context, answer the user's question. "
        "If the answer is not present in the context, say 'Not found in the provided documents.'\n\n"
        "Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
    answer_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_text,
    )

    return RetrievalQA.from_chain_type(
        llm=Ollama(model="mistral"),
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": answer_prompt},
    )

In [None]:
def handle_query(query, qa_chain, file_name):
    """Run ``query`` through ``qa_chain``, optionally restricted to one source file.

    When ``file_name`` is truthy and the chain exposes a retriever with a
    ``search_kwargs`` dict, ``{"source": file_name}`` is set as the retriever's
    ``"filter"`` for the duration of the call and the previous value is
    restored afterwards. A ``"filter"`` entry placed in the chain inputs is not
    a retriever setting, so it is no longer passed that way.

    Args:
        query: Question text, sent to the chain as ``{"query": query}``.
        qa_chain: Chain object exposing ``invoke`` (and normally ``retriever``).
        file_name: Source name to restrict retrieval to; falsy for no filter.

    Returns:
        Whatever ``qa_chain.invoke`` returns, or ``None`` when it raised (the
        error is printed, matching the previous best-effort behaviour).
    """
    missing = object()  # marks "no filter was configured before this call"
    retriever = getattr(qa_chain, "retriever", None)
    search_kwargs = getattr(retriever, "search_kwargs", None)
    # If the retriever cannot be configured the query still runs, unfiltered.
    scoped = bool(file_name) and isinstance(search_kwargs, dict)
    previous_filter = search_kwargs.get("filter", missing) if scoped else missing

    try:
        if scoped:
            search_kwargs["filter"] = {"source": file_name}
        return qa_chain.invoke({"query": query})
    except Exception as e:
        print("Error:", e)
        return None
    finally:
        # Undo the temporary filter so later calls are not silently scoped.
        if scoped:
            if previous_filter is missing:
                search_kwargs.pop("filter", None)
            else:
                search_kwargs["filter"] = previous_filter

In [32]:
def pipeline(vectordb, file):
    """Chunk an uploaded file, de-duplicate the chunks and store them.

    Args:
        vectordb: Vector store passed on to ``store_to_vectordb``.
        file: File-like object with a ``name`` attribute; the name is used both
            for type detection and as the stored source.

    Returns:
        The de-duplicated chunks that were handed to ``store_to_vectordb``.
    """
    unique_chunks = remove_duplicate_chunks(get_chunks(file, file.name))
    store_to_vectordb(vectordb, unique_chunks, file.name)
    return unique_chunks

In [33]:
def flush_db(persist_directory="./my_db"):
    """Delete every collection in the persisted Chroma store.

    Args:
        persist_directory: Directory of the Chroma store to flush. Defaults to
            ``"./my_db"``, the directory ``init_db`` persists to by default.

    Returns:
        None
    """
    # Open the persisted store through the LangChain wrapper and reuse its
    # underlying chromadb client.
    # NOTE(review): ``_client`` is a private attribute of the wrapper - confirm
    # it is still present in the installed langchain_chroma version.
    store = Chroma(persist_directory=persist_directory)
    client = store._client

    for col in client.list_collections():
        # Depending on the chromadb version the listing holds collection
        # objects or plain names; accept both.
        name = col if isinstance(col, str) else col.name
        client.delete_collection(name=name)