In [1]:
from decouple import config
import os
import uuid

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

from langchain_ollama import ChatOllama
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

from PDFChatBot import PDFChatBot

In [2]:
# Fresh per-run chat session key: a random UUID4 with its hyphens swapped for underscores.
session_id = '_'.join(str(uuid.uuid4()).split('-'))

In [3]:
# Resolve runtime settings.
#
# python-decouple's config() looks in os.environ first and only then falls back
# to the .env / settings.ini file, so one call per setting covers both sources.
if 'OLLAMA_API_BASE_URL' not in os.environ:
    # Environment was not provided externally: also export the OpenAI key from
    # the config file, but only when one is actually defined. The key is
    # optional here so that an Ollama-only setup without OPENAI_API_KEY no
    # longer fails with UndefinedValueError.
    # NOTE(review): the guard tests the Ollama variable, not the OpenAI one —
    # kept as in the original; confirm this is the intended trigger.
    openai_api_key = config('OPENAI_API_KEY', default=None)
    if openai_api_key:
        os.environ["OPENAI_API_KEY"] = openai_api_key

# Required settings: config() raises if a name is found in neither source.
OLLAMA_API_BASE_URL = config('OLLAMA_API_BASE_URL')
LLM = config('LLM')
EMBEDDING_MODEL = config('EMBEDDING_MODEL')

In [4]:
# Announce the configured embedding model, then instantiate the embedder with it.
print(f'Using embedding model: {EMBEDDING_MODEL}')
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
)

Using embedding model: sentence-transformers/all-MiniLM-L6-v2


  from tqdm.autonotebook import tqdm, trange


In [5]:
# Announce the configured chat model, then create the Ollama client that
# talks to the server at OLLAMA_API_BASE_URL.
print(f'Using LLM: {LLM}')
llm = ChatOllama(model=LLM, base_url=OLLAMA_API_BASE_URL)

Using LLM: llama3.1:8b


In [6]:
# Build the chatbot from the embedding model and chat LLM created in the
# previous cells. PDFChatBot is the project-local class imported at the top.
chat_bot = PDFChatBot(embedding_model, llm)

In [7]:
# Location of the PDF to load into the chatbot. Resolved like the other
# settings (environment variable first, then the .env file); the previous
# hard-coded path is kept as the default so existing setups behave the same.
PDF_FILE_PATH = config(
    'PDF_FILE_PATH',
    default='/Users/stolli/IT/Designing Data-Intensive Applications.pdf',
)
chat_bot.add_pdf_data(pdf_file_path=PDF_FILE_PATH)

incorrect startxref pointer(1)
parsing for Object Streams


In [None]:
# Stream the answer for this session: echo every chunk as it arrives and keep
# all of them in `stream_response` for the cells below.
stream_response = []
for chunk in chat_bot.stream_response('What is partitioning?', session_id):
    print(chunk, flush=True)  # one chunk per line (print's default terminator)
    stream_response.append(chunk)

{'input': 'What is partitioning?', 'chat_history': []}
{'context': [Document(metadata={'page': 220, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='Terminological confusion\nWhat we call a partition  here is called a shard  in MongoDB, Elas‐\nticsearch, and SolrCloud; it’s known as a region  in HBase, a tablet\nin Bigtable, a vnode  in Cassandra and Riak, and a vBucket  in\nCouchbase. However, partitioning  is the most established term, so\nwe’ll stick with that.\nNormally, partitions are defined in such a way that each piece of data (each record,\nrow, or document) belongs to exactly one partition. There are various ways of achiev‐\ning this, which we discuss in depth in this chapter. In effect, each partition is a small\ndatabase of its own, although the database may support operations that touch multi‐\nple partitions at the same time.\nThe main reason for wanting to partition data is scalability . Different partitions can\nbe placed on differe

In [None]:
# Show the full list of chunks collected by the streaming loop.
stream_response

In [None]:
# Reassemble the full answer text: concatenate, in arrival order, the 'answer'
# fragment of every streamed chunk that carries one.
''.join(piece['answer'] for piece in stream_response if 'answer' in piece)

'Partitioning is splitting data into smaller pieces called partitions or shards, each containing a portion of the total dataset. Each piece of data (record, row, or document) belongs to exactly one partition. Partitions can be thought of as small databases of their own, although the main database may support operations that touch multiple partitions at the same time. The goal of partitioning is to spread the data and query load evenly across multiple machines, improving scalability and avoiding hot spots (nodes with disproportionately high load).'

In [None]:
# Ask a non-streaming question in the same chat session and keep the whole result.
response = chat_bot.get_response(
    'What is the book about? Please summarize it in around 20 sentences. Include a list of the most important topics',
    session_id=session_id,
)

In [None]:
# Show the complete result returned by get_response.
response

In [None]:
# Print only the value stored under the 'answer' key of the response.
print(response['answer'])

The book "Designing Data-Intensive Applications" is about designing and building large-scale data systems that can handle massive amounts of data and scale to meet the needs of modern applications.

The authors, Martin Kleppmann and others, draw on their experience in building distributed data systems to provide a comprehensive guide to designing data-intensive applications.

The book focuses on the architecture of data systems and how they are integrated into data-intensive applications. It doesn't cover deployment, operations, security, management, and other areas that are complex and important topics.

The authors explain that most books about data systems focus on the technical details, but this one takes a more general approach to designing data-intensive applications.

They explore the challenges of building large-scale data systems and how to overcome them. The book covers various aspects of designing data-intensive applications, including:

**Most Important Topics:**

1. **Data

In [None]:
# Show the documents stored under the response's 'context' key.
response['context']

[Document(metadata={'page': 20, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede,\nNeha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia\nPowles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann,\nMatthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam\nStokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, and\nBrett Wooldridge.\nSeveral more people have been invaluable to the writing of this book by reviewing\ndrafts and providing feedback. For these contributions I am particularly indebted to\nRaul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj,\nDavid Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek\nElkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard,\nNicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, 

In [None]:
# Order-preserving de-duplication of the context documents.
# An explicit loop replaces the former side-effect-only list comprehension,
# which built and discarded a list of None values. Membership is tested with
# `in` (equality), exactly as before, so no hashing of the documents is needed.
res = []
for document in response['context']:
    if document not in res:
        res.append(document)
res

[Document(metadata={'page': 20, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede,\nNeha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia\nPowles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann,\nMatthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam\nStokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, and\nBrett Wooldridge.\nSeveral more people have been invaluable to the writing of this book by reviewing\ndrafts and providing feedback. For these contributions I am particularly indebted to\nRaul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj,\nDavid Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek\nElkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard,\nNicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, 

In [None]:
# Show the page_content of the first document in the response's context list.
response['context'][0].page_content

'McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede,\nNeha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia\nPowles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann,\nMatthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam\nStokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, and\nBrett Wooldridge.\nSeveral more people have been invaluable to the writing of this book by reviewing\ndrafts and providing feedback. For these contributions I am particularly indebted to\nRaul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj,\nDavid Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek\nElkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard,\nNicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, Stefan Podkowinski,\nPhil Potter, Hamid Ramazani, Sam Stokes, and Ben Summers. Of course, I take all'

In [None]:
# For each context document, print its source file, page number and text,
# with a blank line after each entry.
for doc in response['context']:
    meta = doc.metadata
    print(f'Source: {meta["source"]}')
    print(f'Page: {meta["page"]}')
    print(f'Content: {doc.page_content}\n')


Source: /Users/stolli/IT/Designing Data-Intensive Applications.pdf
Page: 20
Content: McCaffrey, Josie McLellan, Christopher Meiklejohn, Ian Meyers, Neha Narkhede,
Neha Narula, Cathy O’Neil, Onora O’Neill, Ludovic Orban, Zoran Perkov, Julia
Powles, Chris Riccomini, Henry Robinson, David Rosenthal, Jennifer Rullmann,
Matthew Sackman, Martin Scholl, Amit Sela, Gwen Shapira, Greg Spurrier, Sam
Stokes, Ben Stopford, Tom Stuart, Diana Vasile, Rahul Vohra, Pete Warden, and
Brett Wooldridge.
Several more people have been invaluable to the writing of this book by reviewing
drafts and providing feedback. For these contributions I am particularly indebted to
Raul Agepati, Tyler Akidau, Mattias Andersson, Sasha Baranov, Veena Basavaraj,
David Beyer, Jim Brikman, Paul Carey, Raul Castro Fernandez, Joseph Chow, Derek
Elkins, Sam Elliott, Alexander Gallego, Mark Grover, Stu Halloway, Heidi Howard,
Nicola Kleppmann, Stefan Kruppa, Bjorn Madsen, Sander Mak, Stefan Podkowinski,
Phil Potter, Hamid Ramaza

In [None]:
# chat_bot.get_response('What is partitioning?', session_id=session_id)

In [None]:
# chat_bot.get_response('Can you repeat the answer as structured list?', session_id=session_id)