### This repository reads any pdf document and uses local llama3 and RAG to chat with the document and get answers.

## Installations and Imports 

In [2]:
!pip install langchain -q
!pip install pymupdf -q


In [2]:
import streamlit as st
import ollama
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

## STEP 1: READ PDF

In [3]:
loader = PyMuPDFLoader("test_doc.pdf")
data = loader.load()
data[0]

Document(page_content='LLM-based Machine Translation Model\nPipeline\nStep 1: Dataset for pipeline\nThis can include publicly available datasets or proprietary data that needs to be\ngathered. The text should ideally be in diverse fonts, sizes, and backgrounds to\nensure robust OCR performance. In case of legal documents, these can be old\ndocuments being collected from the authorities.This dataset can be in the form of\nimages or pdf.\nOfficial document (text)\n', metadata={'source': 'test_doc.pdf', 'file_path': 'test_doc.pdf', 'page': 0, 'total_pages': 8, 'format': 'PDF 1.4', 'title': 'KYROTICS | PROBLEM FOR LLM', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Skia/PDF m124 Google Docs Renderer', 'creationDate': '', 'modDate': '', 'trapped': ''})

## STEP 2: CONVERT DOC INTO CHUNKS

In [4]:
# Use the RecursiveCharacterTextSplitter class to split the documents into chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500, 
    chunk_overlap  = 100, 
    length_function = len,
)

chunks = text_splitter.split_documents(data)

# Look at the first two chunks 
chunks[0:2]

[Document(page_content='LLM-based Machine Translation Model\nPipeline\nStep 1: Dataset for pipeline\nThis can include publicly available datasets or proprietary data that needs to be\ngathered. The text should ideally be in diverse fonts, sizes, and backgrounds to\nensure robust OCR performance. In case of legal documents, these can be old\ndocuments being collected from the authorities.This dataset can be in the form of\nimages or pdf.\nOfficial document (text)', metadata={'source': 'test_doc.pdf', 'file_path': 'test_doc.pdf', 'page': 0, 'total_pages': 8, 'format': 'PDF 1.4', 'title': 'KYROTICS | PROBLEM FOR LLM', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Skia/PDF m124 Google Docs Renderer', 'creationDate': '', 'modDate': '', 'trapped': ''}),
 Document(page_content='Step 2: Text Extraction\nOCR Extraction: Use an OCR (Optical Character Recognition) tool to extract the\ntext from the image. This tool would recognize characters in the image and convert\nth

In [5]:
print(f'Number of documents: {len(data)}')
print(f'Number of chunks: {len(chunks)}')

Number of documents: 8
Number of chunks: 19


## STEP3: CONVERT CHUNKS TO EMBEDDINGS

In [6]:
embeddings = OllamaEmbeddings(model = "llama3")
vectorstore = Chroma.from_documents(documents = chunks, embedding=embeddings)

## STEP4: Put it all together

In [7]:
# call llama3 model  (RAG - Generation part)
def ollama_llm(question, context):
    formatted_prompt = f"Question: {question}\n\nContext: {context}"
    response = ollama.chat(model = "llama3", messages = [{'role': 'user', 'content': formatted_prompt}])
    return response['message']['content']

In [8]:
# RAG - Retriever part
retriever = vectorstore.as_retriever()


In [9]:
def combine_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [10]:
def rag_chain(question):
    retrieved_docs = retriever.invoke(question)
    formatted_content = combine_docs(retrieved_docs)
    result = ollama_llm(question, formatted_content)
    return {'result': result, 'source_documents': retrieved_docs}


In [11]:
def ask_question(question:str):
    response = rag_chain(question)
    print(f"Response: {response}\n")
    citations = {doc.metadata['source'] for doc in response['source_documents']}
    print(f"Citations: {citations}")

In [12]:
ask_question("What is the document about?")

Response: {'result': 'The document appears to be a proposal for a project that aims to improve the performance of English-Hindi translations by utilizing deep learning algorithms and cloud computing resources.\n\nThe proposal outlines the steps involved in achieving this goal, including:\n\n1. Developing a model using deep learning algorithms such as YOLOv5, CRAFT, or others designed specifically for text detection.\n2. Drawing bounding boxes around detected text blocks to segregate them into individual text blocks.\n3. Using Optical Character Recognition (OCR) on each text block to read the text and translate it from English to Hindi.\n4. Overlaying the translated Hindi text onto the original image, replicating the original format, and replacing the English text with Hindi within the exact bounding box.\n\nThe proposal also includes an estimated budget for the project, which consists of salaries for a team of engineers (INR 72 lakhs), cloud computing costs (INR 10-15 lakhs), local har

In [15]:
ask_question("What is the approximate cost for the whole project?")

Response: {'result': 'The approximate cost for the whole project is INR 1 Crore (90-102 lakhs), leaving some margin for contingencies within the budget. The breakdown of costs is as follows:\n\n* Salaries for Team: approximately INR 72 lakhs (6 engineers x INR 12 lakhs per engineer)\n* Cloud Computing Costs: INR 10-15 lakhs\n* Local Hardware for Hosting: INR 5-10 lakhs\n* Software and Miscellaneous: INR 3-5 lakhs\n\nTotal estimated budget: INR 90-102 lakhs (approximately INR 1 Crore)', 'source_documents': [Document(page_content='amount to INR 3-5 lakhs.\nTotal Estimated Budget: INR 90-102 lakhs, leaving some margin for contingencies\nwithin the INR 1 Crore budget.', metadata={'author': '', 'creationDate': '', 'creator': '', 'file_path': 'test_doc.pdf', 'format': 'PDF 1.4', 'keywords': '', 'modDate': '', 'page': 7, 'producer': 'Skia/PDF m124 Google Docs Renderer', 'source': 'test_doc.pdf', 'subject': '', 'title': 'KYROTICS | PROBLEM FOR LLM', 'total_pages': 8, 'trapped': ''}), Document(