In [1]:
from decouple import config
import os
import uuid

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

from langchain_ollama import ChatOllama
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

from PDFChatBot import PDFChatBot

In [2]:
# One random identifier per notebook run, used to key the chat history.
# The UUID's hyphen-separated groups are re-joined with underscores.
session_id = '_'.join(str(uuid.uuid4()).split('-'))

In [3]:
# Runtime configuration.
#
# python-decouple's `config()` looks in os.environ first and only then falls
# back to the .env / settings.ini file, so an explicit
# `os.environ[...] if ... else config(...)` check is redundant.
OLLAMA_API_BASE_URL = config('OLLAMA_API_BASE_URL')
LLM = config('LLM')
EMBEDDING_MODEL = config('EMBEDDING_MODEL')

# Export the OpenAI key for libraries that read it from the environment.
# The guard is on the key itself (previously it was tied to the unrelated
# OLLAMA_API_BASE_URL variable), and a missing key is tolerated because the
# Ollama-backed setup in this notebook does not need it.
_openai_api_key = config('OPENAI_API_KEY', default=None)
if _openai_api_key is not None:
    # setdefault: never overwrite a value that is already exported.
    os.environ.setdefault('OPENAI_API_KEY', _openai_api_key)
del _openai_api_key  # do not keep the secret in the notebook namespace

In [4]:
# Echo the configured embedding model name, then build the Hugging Face
# embeddings object that is later passed to PDFChatBot.
print(f'Using embedding model: {EMBEDDING_MODEL}')
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

Using embedding model: sentence-transformers/all-MiniLM-L6-v2


  from tqdm.autonotebook import tqdm, trange


In [5]:
# Echo the configured LLM name, then create the Ollama chat model client
# pointed at the server given by OLLAMA_API_BASE_URL.
print(f'Using LLM: {LLM}')
llm = ChatOllama(
    base_url=OLLAMA_API_BASE_URL, 
    model=LLM
)

Using LLM: llama3.1:8b


In [6]:
# Location of the PDF to chat with. It is read through python-decouple
# (environment variable or .env entry `PDF_PATH`) so the notebook is not tied
# to one machine; the default keeps the previously hard-coded path.
PDF_PATH = config(
    'PDF_PATH',
    default='/Users/stolli/IT/Designing Data-Intensive Applications.pdf',
)

# Build the chatbot from the PDF, the embedding model and the chat LLM.
chat_bot = PDFChatBot(PDF_PATH, embedding_model, llm)

incorrect startxref pointer(1)


Initializing PDF Chatbot ...
--- Loading and vectorizing PDF file ---


parsing for Object Streams


--- Initializing history aware LLM ---


In [7]:
stream_response = []
async for chunk in chat_bot.stream_response('What is partitioning?', session_id):
    stream_response.append(chunk)
    print(chunk, end="\n", flush=True)

{'input': 'What is partitioning?', 'chat_history': []}
{'context': [Document(metadata={'page': 220, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='Terminological confusion\nWhat we call a partition  here is called a shard  in MongoDB, Elas‐\nticsearch, and SolrCloud; it’s known as a region  in HBase, a tablet\nin Bigtable, a vnode  in Cassandra and Riak, and a vBucket  in\nCouchbase. However, partitioning  is the most established term, so\nwe’ll stick with that.\nNormally, partitions are defined in such a way that each piece of data (each record,\nrow, or document) belongs to exactly one partition. There are various ways of achiev‐\ning this, which we discuss in depth in this chapter. In effect, each partition is a small\ndatabase of its own, although the database may support operations that touch multi‐\nple partitions at the same time.\nThe main reason for wanting to partition data is scalability . Different partitions can\nbe placed on differe

In [8]:
# Non-streaming request in the same chat session: ask for a summary of the
# book. The question is split over two adjacent literals for readability.
response = chat_bot.get_response(
    'What is the book about? Please summarize it in around 20 sentences. '
    'Include a list of the most important topics',
    session_id=session_id,
)

--- Generating response ---


In [9]:
# print() rather than a bare expression, so the multi-line answer is rendered
# with real line breaks instead of as an escaped repr string.
print(response['answer'])

The text appears to be an excerpt from a discussion or tutorial about partitioning data, likely in a distributed database system.

Here's a summary:

Partitioning is a technique used to spread data and query load evenly across multiple machines (nodes) in a cluster. This approach helps avoid hot spots, which are nodes with disproportionately high loads. The goal of partitioning is to choose an appropriate scheme for the data and rebalance partitions when nodes are added or removed from the cluster.

There are two main approaches to partitioning:

1. **Key Range Partitioning**: Keys are sorted, and a partition owns all keys from a minimum up to a maximum value. This approach allows efficient range queries but risks hot spots if the application often accesses keys close together.
2. **Hash Partitioning**: A hash function is applied to each key, and a partition owns a range of hashes. This method destroys key ordering, making range queries less efficient.

Partitions in Key Range Partitio

In [10]:
# Inspect the raw text of the first retrieved context document. As a bare
# expression the value is shown as a repr, so line breaks appear as "\n".
response['context'][0].page_content

'The goal of partitioning is to spread the data and query load evenly across multiple\nmachines, avoiding hot spots (nodes with disproportionately high load). This\nrequires choosing a partitioning scheme that is appropriate to your data, and reba‐\nlancing the partitions when nodes are added to or removed from the cluster.\nWe discussed two main approaches to partitioning:\n•Key range partitioning , where keys are sorted, and a partition owns all the keys\nfrom some minimum up to some maximum. Sorting has the advantage that effi‐\ncient range queries are possible, but there is a risk of hot spots if the application\noften accesses keys that are close together in the sorted order.\nIn this approach, partitions are typically rebalanced dynamically by splitting the\nrange into two subranges when a partition gets too big.\n•Hash partitioning , where a hash function is applied to each key, and a partition\nowns a range of hashes. This method destroys the ordering of keys, making range'

In [11]:
# List every context document attached to the response: where it came from,
# which page, and its text, with a blank line after each entry.
for source_doc in response['context']:
    doc_meta = source_doc.metadata
    print(
        f'Source: {doc_meta["source"]}',
        f'Page: {doc_meta["page"]}',
        f'Content: {source_doc.page_content}\n',
        sep='\n',
    )


Source: /Users/stolli/IT/Designing Data-Intensive Applications.pdf
Page: 238
Content: The goal of partitioning is to spread the data and query load evenly across multiple
machines, avoiding hot spots (nodes with disproportionately high load). This
requires choosing a partitioning scheme that is appropriate to your data, and reba‐
lancing the partitions when nodes are added to or removed from the cluster.
We discussed two main approaches to partitioning:
•Key range partitioning , where keys are sorted, and a partition owns all the keys
from some minimum up to some maximum. Sorting has the advantage that effi‐
cient range queries are possible, but there is a risk of hot spots if the application
often accesses keys that are close together in the sorted order.
In this approach, partitions are typically rebalanced dynamically by splitting the
range into two subranges when a partition gets too big.
•Hash partitioning , where a hash function is applied to each key, and a partition
owns a rang

In [12]:
# chat_bot.get_response('What is partitioning?', session_id=session_id)

In [13]:
# chat_bot.get_response('Can you repeat the answer as structured list?', session_id=session_id)

In [14]:
chunks = []
async for chunk in chat_bot._rag_chain_with_history.astream(
    {"input": 'What is partitioning?'},
    config={
        "configurable": {"session_id": session_id}
    }
):
    chunks.append(chunk)
    print(chunk, end="\n", flush=True)

{'input': 'What is partitioning?', 'chat_history': [HumanMessage(content='What is partitioning?', additional_kwargs={}, response_metadata={}), AIMessage(content='Partitioning is dividing data into smaller, independent pieces (called partitions or shards) so that each piece of data belongs to exactly one partition. Each partition acts as a small database, and different partitions can be placed on different nodes in a shared-nothing cluster for improved scalability.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is the book about? Please summarize it in around 20 sentences. Include a list of the most important topics', additional_kwargs={}, response_metadata={}), AIMessage(content="The text appears to be an excerpt from a discussion or tutorial about partitioning data, likely in a distributed database system.\n\nHere's a summary:\n\nPartitioning is a technique used to spread data and query load evenly across multiple machines (nodes) in a cluster. This approach

In [None]:
# Display every chunk collected from the streaming run.
# NOTE(review): this dumps the full chat history and document contents into
# the saved output; consider showing only a slice (e.g. chunks[:3]).
chunks

[{'input': 'What is partitioning?',
  'chat_history': [HumanMessage(content='What is the book about? Please summarize it in around 20 sentences. Include a list of the most important topics', additional_kwargs={}, response_metadata={}),
   AIMessage(content='Based on the provided context, here\'s a summary of the book "Designing Data-Intensive Applications" by Martin Kleppmann:\n\n**Summary**\n\nThe book is about designing and building data-intensive applications that can scale to handle large amounts of data. The author, Martin Kleppmann, draws from his experience as a researcher in distributed systems at the University of Cambridge and a software engineer at companies like LinkedIn.\n\nKleppmann shares lessons learned from working on large-scale data infrastructure projects, highlighting common mistakes and pitfalls to avoid. He emphasizes the importance of understanding fundamental technical concepts, making them accessible to everyone, and promoting deeper understanding to develop b

In [None]:
# Reassemble the streamed answer: only some chunks carry an 'answer' piece
# (others hold 'input' / 'chat_history' / 'context'), so filter on the key.
# Membership is tested on the dict itself and a generator is passed to join().
''.join(chunk['answer'] for chunk in chunks if 'answer' in chunk)

'According to the provided context: Partitioning is necessary when you have so much data that storing and processing it on a single machine is no longer feasible.'

In [None]:
async def stream_response(chain, question, session_id):
    """Asynchronously yield the chunks produced by ``chain.astream``.

    Args:
        chain: Object exposing ``astream(input, config=...)`` that returns an
            async iterable of chunks.
        question: Text sent to the chain under the ``"input"`` key.
        session_id: Value passed as ``config["configurable"]["session_id"]``.

    Yields:
        Each chunk exactly as emitted by ``chain.astream``.
    """
    payload = {"input": question}
    run_config = {"configurable": {"session_id": session_id}}
    async for piece in chain.astream(payload, config=run_config):
        yield piece