In [1]:
import json
import os
import threading
import time
from typing import List, Optional
from uuid import uuid4

from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from PyPDF2 import PdfReader
import numpy as np
import requests
from pydantic import BaseModel, Field

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models.base import BaseChatModel
from langchain.schema import (
    Document,
    ChatMessage,
    AIMessage,
    HumanMessage,
    SystemMessage,
    ChatGeneration,
    ChatResult
)

from langchain_core.messages import (
    BaseMessage,
)
# from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.output_parsers import StrOutputParser

In [2]:
# Flag polled by the watcher loop; set it to True to make the watcher stop.
stop_watcher = False
# Folder that is scanned and watched for PDFs. Set the DOCUGENT_DOCUMENTS_DIR
# environment variable to point the notebook at another folder; the original
# location is kept as the fallback.
directory_to_watch = os.environ.get(
    "DOCUGENT_DOCUMENTS_DIR",
    "/Users/amish/Workspace/personal/ai/docugent/documents",
)
# Load a local embedding model
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# Measure the vector size by embedding a sample string (768 for this model)
# rather than hard-coding it, so a different model name keeps the index in sync.
embedding_dimension = len(embedding_model.embed_query("hello world"))
# Create a FAISS index
index = faiss.IndexFlatL2(embedding_dimension)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# In-memory cache of every processed PDF, keyed by file name. Each entry holds
# the complete extracted text of the PDF together with its metadata.
repository = dict()
# Vector store that will hold the chunk embeddings; it starts out empty.
vector_store = FAISS(
    index=index,
    embedding_function=embedding_model,
    index_to_docstore_id=dict(),
    docstore=InMemoryDocstore(),
)

In [4]:
def process_pdf(file_path):
    """Extract the text of one PDF and cache it in the in-memory ``repository``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        None. When the file exists, ``repository[<base name of file_path>]`` is
        replaced with a dict holding ``path``, ``last_modified`` and
        ``content``. A path that does not exist is ignored.
    """
    if not os.path.exists(file_path):
        return
    print(f"Processing PDF: {file_path}")
    reader = PdfReader(file_path)
    # A page without extractable text may produce None instead of a string
    # (NOTE(review): depends on the PyPDF2 version - confirm); treating it as
    # empty keeps one such page from breaking the join for the whole file.
    content = "\n".join((page.extract_text() or "") for page in reader.pages)
    # update the in-memory store
    repository[os.path.basename(file_path)] = {
        "path": file_path,
        "last_modified": os.path.getmtime(file_path),
        "content": content,
    }

In [5]:
# any PDF files already present in the directory are processed
def process_existing_files_in_directory(directory_to_watch):
    """Run ``process_pdf`` on every ``.pdf`` file directly inside a directory.

    Args:
        directory_to_watch: Directory whose immediate entries are inspected;
            sub-directories are not descended into.
    """
    for entry_name in os.listdir(directory_to_watch):
        if not entry_name.endswith(".pdf"):
            continue
        candidate = os.path.join(directory_to_watch, entry_name)
        # Skip anything that merely has a PDF-looking name but is not a file.
        if not os.path.isfile(candidate):
            continue
        print(f"Processing existing PDF: {candidate}")
        process_pdf(candidate)

In [6]:
# custom handler
class PDFHandler(FileSystemEventHandler):
    """File-system event handler that forwards PDF events to a callback.

    Args:
        process_pdf_callback: Callable invoked with the path (``event.src_path``)
            of every created or modified ``.pdf`` file.
    """

    def __init__(self, process_pdf_callback):
        super().__init__()
        self.process_pdf_callback = process_pdf_callback

    def _handle_pdf_event(self, event, label):
        """Log and forward ``event`` when it refers to a ``.pdf`` file."""
        if event.is_directory:
            return
        if event.src_path.endswith(".pdf"):
            print(f"{label}: {event.src_path}")
            self.process_pdf_callback(event.src_path)

    def on_created(self, event):
        """Called by the observer when a file or directory is created."""
        self._handle_pdf_event(event, "New PDF Added")

    # The hook was previously spelled ``on_create``, a name the observer never
    # dispatches to; the alias keeps any direct callers of the old name working.
    on_create = on_created

    def on_modified(self, event):
        """Called by the observer when a file or directory is modified."""
        self._handle_pdf_event(event, "PDF Modified")
            

In [7]:
# directory watcher function
def start_directory_watcher_with_initial_scan(directory_to_watch):
    """Process the PDFs already in a directory, then watch it until told to stop.

    Blocks the calling thread: after the initial scan it polls the module-level
    ``stop_watcher`` flag every 2 seconds and returns once the flag is true or
    a ``KeyboardInterrupt`` arrives. The observer is always stopped and joined
    on the way out.

    Args:
        directory_to_watch: Directory to scan and then observe (non-recursive).
    """
    global stop_watcher

    # Step 1: pick up whatever is already there.
    process_existing_files_in_directory(directory_to_watch)

    # Step 2: watch for new and changed files.
    pdf_events = PDFHandler(process_pdf)
    watcher = Observer()
    watcher.schedule(pdf_events, path=directory_to_watch, recursive=False)
    watcher.start()
    try:
        print(f"Watching directory: {directory_to_watch}")
        while True:
            if stop_watcher:
                break
            time.sleep(2)  # idle between checks of the stop flag
    except KeyboardInterrupt:
        print("KeyboardInterrupt detected. Stopping observer...")
    finally:
        # Shut the observer down no matter how the loop ended.
        watcher.stop()
        watcher.join()
        print(f"Stopped Watching directory: {directory_to_watch}")

In [8]:
# run observer in a separate thread
def start_watching_in_thread(directory_to_watch):
    """Launch the scan-and-watch routine on a daemon thread.

    Args:
        directory_to_watch: Directory handed to the watcher routine.

    Returns:
        The already-started ``threading.Thread``.
    """
    background = threading.Thread(
        target=start_directory_watcher_with_initial_scan,
        args=(directory_to_watch,),
        daemon=True,
    )
    background.start()
    return background

In [9]:
# start the watcher in a background thread
# NOTE(review): the initial scan of existing PDFs runs inside that thread, so
# `repository` may still be filling up when the following cells execute -
# confirm the scan has finished before reading from it.
watcher_thread = start_watching_in_thread(directory_to_watch)

Processing existing PDF: /Users/amish/Workspace/personal/ai/docugent/documents/LRA.pdf
Processing PDF: /Users/amish/Workspace/personal/ai/docugent/documents/LRA.pdf
Processing existing PDF: /Users/amish/Workspace/personal/ai/docugent/documents/Receipts_Delta_AirLines.pdf
Processing PDF: /Users/amish/Workspace/personal/ai/docugent/documents/Receipts_Delta_AirLines.pdf
Watching directory: /Users/amish/Workspace/personal/ai/docugent/documents


In [10]:
# start/stop watcher knob: the watcher loop polls this flag every 2 seconds and
# shuts the observer down once it is True
stop_watcher = False

In [11]:
# current value of the knob (shown as the cell output)
stop_watcher

False

In [12]:
def chunk_text(content, chunk_size=1000, chunk_overlap=200):
    """Split one text into overlapping chunks.

    Args:
        content: The text to split.
        chunk_size: Chunk size handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        Whatever the splitter's ``create_documents`` returns for ``[content]``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        # Split on paragraph breaks first, then line breaks, then spaces.
        "separators": ["\n\n", "\n", " "],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.create_documents([content])
    return pieces

In [13]:
# Try the splitter on one loaded PDF; needs an 'LRA.pdf' entry in `repository`.
chunk = chunk_text(repository['LRA.pdf']['content'])

In [14]:
# split text into chunks and tag every chunk with the file it came from
def chunk_and_add_metadata(file_name, content, chunk_size=1000, chunk_overlap=200):
    """Chunk ``content`` and label each chunk with its source file name.

    Args:
        file_name: Stored under the ``"file_name"`` metadata key of each chunk.
        content: Text to split.
        chunk_size: Forwarded to ``chunk_text``.
        chunk_overlap: Forwarded to ``chunk_text``.

    Returns:
        A list of ``Document`` objects, one per chunk. Metadata already present
        on a chunk is kept and takes precedence over ``file_name`` on a clash.
    """
    tagged = []
    for piece in chunk_text(content, chunk_size, chunk_overlap):
        meta = {"file_name": file_name}
        meta.update(piece.metadata)
        tagged.append(Document(page_content=piece.page_content, metadata=meta))
    return tagged

In [15]:
# Chunk one loaded PDF and tag the chunks; needs an 'LRA.pdf' entry in `repository`.
chunked_documents = chunk_and_add_metadata('LRA.pdf', repository['LRA.pdf']['content'])

In [16]:
# Inspect the first chunk to check its text and metadata.
chunked_documents[0]

Document(metadata={'file_name': 'LRA.pdf'}, page_content='I485/ GC LRAs/ 11.20.24 /HAF \n \n \n10451 Mill Run Circle, Suite 100, Owings Mills, Maryland 21117  \n \nTel: 410.356.5440 | Fax: 410.356.5669 | Web: www.murthy.com  | eM ail: law@murthy.com  \n \n \n \nPRIVILEGED AND CONFIDENTIAL  \nCLIENT COMMUNICATION \nATTORNEY WORK P RODUCT  \n  Hello :  \n \nThank you for contacting the Murthy Law Firm in connection with processing your immigration case/s. For \ny\nour convenience, we have attached the standard Legal Representation Agreement  for your review and \nsignature, so that we may officially represent you. We are honored and delighted to help you.  \n \nOur experienced and knowledgeable team will work to assist you. Our firm is able to provide you with many \nv\naluable online resources. Among these are: our award- winning website, MurthyDotCom; the M urthyChat , \nthrough which we provide answers to your questions in real time; the MurthyForum, where our attorneys')

#### NOTE: this function was written to work with SentenceTransformer for generating embeddings of the Documents
```
def generate_embeddings_for_all_documents(repository):
    all_embeddings_with_metadata = []
    for file_name, doc in repository.items():
        chunked_documents = chunk_and_add_metadata(file_name, doc['content'])
        # Generate embeddings for the chunks
        embeddings = embedding_model.encode([doc.page_content for doc in chunked_documents])
        embeddings_with_metadata = [
            {"embedding": embedding, "metadata": doc.metadata}
            for embedding, doc in zip(embeddings, chunked_documents)
        ]
        # Append to the final list
        all_embeddings_with_metadata.extend(embeddings_with_metadata)
        print(f"File: {file_name}, chunks:[{len(chunked_documents)}], embeddings:[{len(embeddings_with_metadata)}]")
    return all_embeddings_with_metadata
```

#### NOTE: this call was written to work with SentenceTransformer for generating embeddings of the Documents
```
all_embeddings = generate_embeddings_for_all_documents(repository)
```

#### NOTE: this snippet was written to work with SentenceTransformer for generating embeddings of the Documents
#### Add all_embeddings to the FAISS index
```
embedding_vectors = np.array([item['embedding'] for item in all_embeddings]).astype('float32')
index.add(embedding_vectors)
print(f"Added {index.ntotal} embeddings to the FAISS index.")
```

#### NOTE: this snippet was written to work with SentenceTransformer for generating embeddings of the Documents
#### Encode the query
```
query = "What resources does Murthy Law Firm provides"
query_embedding = embedding_model.encode([query]).astype('float32')
```

#### Search the index
```
k = 5  # Number of nearest neighbors
distances, indices = index.search(query_embedding, k)
```

#### Retrieve results with metadata
```
# Pair every hit with its own distance. FAISS pads the result with index -1
# when fewer than k neighbours exist, so those slots are skipped.
hits = [(i, dist) for i, dist in zip(indices[0], distances[0]) if i >= 0]
results = [all_embeddings[i] for i, _ in hits]
for result, (_, distance) in zip(results, hits):
    print(f"File: {result['metadata']['file_name']}, Distance: {distance}")
```

In [17]:
# add the documents/embedding to the vector store
def embed_all_PDFs_and_add_to_vector_store(repository):
    """Chunk every cached PDF and add the chunks to the global ``vector_store``.

    Args:
        repository: Mapping of file name to a dict with at least a ``"content"``
            entry holding the text to chunk.

    Each chunk is stored under a fresh random UUID, so calling this twice for
    the same repository stores every chunk twice.
    """
    for file_name, doc in repository.items():
        chunked_documents = chunk_and_add_metadata(file_name, doc['content'])
        if not chunked_documents:
            # Nothing to embed for this file; an empty batch is pointless and
            # may make the vector store's add call fail.
            print(f"Skipping {file_name}: no text chunks to embed")
            continue
        uuids = [str(uuid4()) for _ in chunked_documents]
        vector_store.add_documents(documents=chunked_documents, ids=uuids)

In [None]:
# Embed everything currently in `repository`. Re-running this cell adds the
# same chunks again under new ids.
embed_all_PDFs_and_add_to_vector_store(repository)

In [None]:
# try retrieving records: fetch the single closest chunk (k=1) for a sample
# query and print its text together with its metadata
results = vector_store.similarity_search(query="Passangers travelling from Los Angeles to Bangalore",k=1)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")


In [None]:
# implement a custom chat model
class ChatLocalOllamaMistral(BaseChatModel):
    """
    A custom chat model that interfaces with a locally installed Ollama Mistral model.

    Each call flattens the conversation into a single prompt string and POSTs
    it to ``{host}/api/generate`` with streaming disabled.
    """
    # Name of the Ollama model to run; declared with the alias ``model``.
    model_name: str = Field(default="mistral", alias="model")
    # Base URL of the Ollama server.
    host: str = "http://localhost:11434"
    # Seconds to wait for the server before giving up; None waits forever.
    request_timeout: Optional[float] = 300.0

    @property
    def _llm_type(self) -> str:
        return "local-ollama-mistral"

    def _generate(
        self,
        message: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager=None,
        **kwargs,
    ) -> ChatResult:
        """
        Process a list of messages and generate a response.

        Args:
            message: Conversation so far, oldest message first.
            stop: Optional stop sequences, forwarded to the server.
            run_manager: Accepted for compatibility with the base class; unused.
            **kwargs: Accepted for compatibility with the base class; unused.

        Returns:
            A ChatResult holding one AIMessage with the generated text.

        Raises:
            RuntimeError: If the HTTP request fails.
        """
        formatted_message = self._format_message(message)
        rsp = self._send_request(formatted_message, stop=stop)
        # form ai message for the received response
        ai_message = AIMessage(
            content=rsp.get("response", ""),
            additional_kwargs={
                "model_name": rsp.get("model", ""),
            }
        )
        # return finally as a ChatResult
        return ChatResult(
            generations=[ChatGeneration(message=ai_message)]
        )

    def _send_request(self, prompt: str, stop: Optional[List[str]] = None) -> dict:
        """
        Sends a request to the local Ollama Mistral API.

        Args:
            prompt: Prompt text for the model.
            stop: Optional stop sequences, sent under ``options.stop``.

        Returns:
            The decoded JSON body of the response.

        Raises:
            RuntimeError: If the request fails or returns an error status; the
                underlying requests exception is chained as the cause.
        """
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
        }
        if stop:
            payload["options"] = {"stop": list(stop)}
        # TODO : Add retries
        try:
            response = requests.post(
                f"{self.host}/api/generate",
                json=payload,
                # Without a timeout an unresponsive server blocks the cell forever.
                timeout=self.request_timeout,
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f"Request failed with error: {e}")
            raise RuntimeError("Failed to generate response") from e

    def _format_message(self, messages: List[BaseMessage]) -> str:
        """
        Convert LangChain messages into a single string format.

        AI messages are prefixed with ``AI:``, system messages with ``System:``
        and everything else with ``User:``; one message per line.
        """
        lines = []
        for message in messages:
            if isinstance(message, AIMessage):
                speaker = "AI"
            elif isinstance(message, SystemMessage):
                # Keep instructions distinguishable from what the user said.
                speaker = "System"
            else:
                speaker = "User"
            lines.append(f"{speaker}: {message.content}")
        return "\n".join(lines).strip()
            

llm = ChatLocalOllamaMistral(model_name="mistral")
# Smoke test: send a bare greeting to confirm the local model answers.
message = [[HumanMessage(content="Hello")]]
smoke_test_result = llm.generate(message)
print(smoke_test_result.generations)

# Sample questions about the loaded documents; `question` selects the one to ask.
question1 = "What all resources does murthy law firm offers?"
question2 = "Where is murthy law firm located?"
question3 = "can you give me a summary of terms and conditions of the Legal Representation Agreement?"
question4 = "what all is present in the addendum?"
question5 = "tell me about dispute resolution"
question = question5

# Load FAISS vector store retriever and fetch context for the question being
# asked, so that changing `question` also changes what is retrieved.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 1})
relevant_docs = retriever.invoke(question)
context = "\n\n".join(doc.page_content for doc in relevant_docs)

# Define the prompt template for RAG
template = """Use the following pieces of context to answer the question. If you don't know the answer, say so.

Context:
{context}

Question:
{question}

Answer:"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
chain = LLMChain( llm=llm, prompt=prompt)

response = chain.run(
    context=context,
    question=question
)

print("Answer:\n", response)

In [None]:
# Chat model backed by the local Ollama server.
llm = ChatLocalOllamaMistral(model_name="mistral")
# Load FAISS vector store retriever (MMR search, one document per query)
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 1})
# Define the prompt template for RAG; {context} and {question} are filled in
# when the chain runs
template = """Use the following pieces of context to answer the question. If you don't know the answer, say so.
Context:
{context}
Question:
{question}
Answer:"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

def rag_pipeline(question):
    """Answer ``question`` using documents retrieved from the vector store.

    Args:
        question: The question to look up and to put to the model.

    Returns:
        The model's answer, as returned by the chain's ``run``.
    """
    relevant_docs = retriever.invoke(question)
    print(relevant_docs)
    # Build the context from the documents retrieved for this question; using
    # leftover notebook globals would answer with context from another query.
    context = "\n\n".join(doc.page_content for doc in relevant_docs)
    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run(context=context, question=question)

In [None]:
# Ask one question through the RAG pipeline and show the model's answer.
question = "Where is murthy law firm located?"
response = rag_pipeline(question)
print("Answer:\n", response)