In [1]:
from decouple import config
import os

from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

from langchain.chains import RetrievalQA, ConversationalRetrievalChain

In [2]:
# Runtime settings.
# decouple's `config()` consults os.environ before the .env/settings file, so a
# manual `os.environ` pre-check is unnecessary. A missing required setting still
# raises decouple's UndefinedValueError.
OLLAMA_API_BASE_URL = config('OLLAMA_API_BASE_URL')
MODEL = config('MODEL')

# Only Ollama is used in this notebook. OPENAI_API_KEY is still exported under the
# same condition as before when the settings file provides it, but its absence no
# longer aborts the cell.
if 'OLLAMA_API_BASE_URL' not in os.environ:
    openai_api_key = config('OPENAI_API_KEY', default=None)
    if openai_api_key is not None:
        os.environ["OPENAI_API_KEY"] = openai_api_key

print(f"Using model: {MODEL}")

Using model: deepseek-coder-v2


In [3]:
# Chat model client: points ChatOllama at the configured base URL and model name.
llm = ChatOllama(model=MODEL, base_url=OLLAMA_API_BASE_URL)

In [4]:
# Load the PDF and split it into overlapping chunks for embedding.
# The file location can be overridden with a PDF_PATH setting (environment variable
# or decouple settings file); the default is the path this notebook was written with.
PDF_PATH = config(
    'PDF_PATH',
    default="/Users/stolli/IT/Designing Data-Intensive Applications.pdf",
)
CHUNK_SIZE = 1000     # upper bound on chunk length handed to the splitter
CHUNK_OVERLAP = 200   # overlap shared by neighbouring chunks

loader = PyPDFLoader(PDF_PATH)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# NOTE: despite its name, `pages` holds the split chunks returned by
# load_and_split(), not one entry per PDF page.
pages = loader.load_and_split(text_splitter)

incorrect startxref pointer(1)
parsing for Object Streams


In [5]:
# Embed the chunks and keep them in a persistent Chroma store.
# Calling Chroma.from_documents() against an already populated persist directory
# adds every chunk again, so each re-run of this cell would duplicate the index.
# Reuse the store when it already exists and only build it when it is missing.
# NOTE(review): an existing store is assumed to come from the same PDF and embedding
# model - delete the directory to force a rebuild.
PERSIST_DIR = "chroma_db"

embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

if os.path.isdir(PERSIST_DIR) and os.listdir(PERSIST_DIR):
    vectorstore = Chroma(persist_directory=PERSIST_DIR, embedding_function=embedding)
else:
    vectorstore = Chroma.from_documents(pages, embedding=embedding, persist_directory=PERSIST_DIR)

  from tqdm.autonotebook import tqdm, trange


In [6]:
# Sanity check of the index: fetch the 5 best-scoring chunks for a sample question.
vectorstore.similarity_search_with_score("What is partitioning?", k=5)

[(Document(metadata={'page': 220, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='i. Partitioning, as discussed in this chapter, is a way of intentionally breaking a large database down into\nsmaller ones. It has nothing to do with network partitions  (netsplits), a type of fault in the network between\nnodes. We will discuss such faults in Chapter 8 .\nCHAPTER 6\nPartitioning\nClearly, we must break away from the sequential and not limit the computers. We must\nstate definitions and provide for priorities and descriptions of data. We must state relation‐\nships, not procedures.\n—Grace Murray Hopper, Management and the Computer of the Future  (1962)\nIn Chapter 5  we discussed replication—that is, having multiple copies of the same\ndata on different nodes. For very large datasets, or very high query throughput, that is\nnot sufficient: we need to break the data up into partitions , also known as sharding .i\nTerminological confusion\nWhat we cal

In [7]:
# Expose the vector store as a retriever; no arguments are passed, so the
# library's default search settings apply.
retriever = vectorstore.as_retriever()

In [8]:
# Answering prompt shared by the QA chains: {context} receives the retrieved chunks,
# {question} receives the user question.
prompt_template = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    If you find the answer, write the answer in a concise way and add the list of pages that are used to derive the answer. 
    Context: {context}
    Question: {question}
    Answer:"""

prompt = PromptTemplate(
    template=prompt_template, 
    input_variables=['context', 'question']
)

# The prompt asks the model to list the pages it used, but by default only the chunk
# text is placed into {context}, so the model never sees any page numbers.
# Render every retrieved chunk together with its `page` metadata instead.
# NOTE(review): `page` is the value stored by the PDF loader and may be a 0-based
# index rather than the printed page number - confirm against the PDF.
document_prompt = PromptTemplate(
    template="[page {page}]\n{page_content}",
    input_variables=['page_content', 'page']
)

# Single-shot retrieval QA chain; the retrieved source documents are returned
# alongside the answer.
qa_interface = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever, 
    return_source_documents=True, 
    chain_type_kwargs={"prompt": prompt, "document_prompt": document_prompt}
)

In [9]:
# Ask one question through the QA chain, passing the input under its "query" key.
qa_interface.invoke({"query": "What is partitioning?"})

{'query': 'What is partitioning?',
 'result': ' Partitioning is a method of intentionally breaking down a large database into smaller ones. This approach has nothing to do with network partitions (netsplits) or similar faults in the network between nodes. The direct pages used for this answer are from the context provided above and include Chapter 6, specifically discussing partitioning and related terms such as sharding.',
 'source_documents': [Document(metadata={'page': 220, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='i. Partitioning, as discussed in this chapter, is a way of intentionally breaking a large database down into\nsmaller ones. It has nothing to do with network partitions  (netsplits), a type of fault in the network between\nnodes. We will discuss such faults in Chapter 8 .\nCHAPTER 6\nPartitioning\nClearly, we must break away from the sequential and not limit the computers. We must\nstate definitions and provide for priorities a

In [10]:
# The shared prompt asks for the pages used, but by default only the chunk text is
# placed into the context, so no page numbers reach the model. Render each
# retrieved chunk together with its `page` metadata.
# NOTE(review): `page` is the value stored by the PDF loader and may be a 0-based
# index rather than the printed page number - confirm against the PDF.
document_prompt = PromptTemplate(
    template="[page {page}]\n{page_content}",
    input_variables=['page_content', 'page']
)

# Conversational variant of the QA chain: takes a question plus the chat history
# and returns the retrieved source documents alongside the answer.
conv_interface = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": prompt, "document_prompt": document_prompt},
)

In [11]:
# First conversation turn: begin with an empty history.
query = "What is partitioning?"
chat_history = []

chain_inputs = {"question": query, "chat_history": chat_history}
result = conv_interface.invoke(chain_inputs)

# Store the (question, answer) pair so that later turns can be given it as history.
chat_history.append((query, result["answer"]))

In [12]:
# Print the answer, a separator line and the retrieved source documents,
# one item per line.
print(result["answer"], '-----', result["source_documents"], sep='\n')

 Partitioning is a method of intentionally breaking down a large database into smaller ones. It involves separating data into distinct partitions or shards, which are also known as sharding in MongoDB, Elasticsearch, and SolrCloud; regions in HBase; and tablets. This approach helps manage larger datasets more efficiently by distributing the load across multiple nodes and supports high query throughput requirements.
-----
[Document(metadata={'page': 220, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='i. Partitioning, as discussed in this chapter, is a way of intentionally breaking a large database down into\nsmaller ones. It has nothing to do with network partitions  (netsplits), a type of fault in the network between\nnodes. We will discuss such faults in Chapter 8 .\nCHAPTER 6\nPartitioning\nClearly, we must break away from the sequential and not limit the computers. We must\nstate definitions and provide for priorities and descriptions of data.

In [13]:
# Follow-up turn: the new question is sent together with the current chat history.
query = "Can you repeat your answer as structured list please?"

followup_inputs = dict(question=query, chat_history=chat_history)
result = conv_interface.invoke(followup_inputs)


In [None]:
# Show the "answer" field of `result` as the cell output.
result["answer"]

' Partitioning in a database involves intentionally breaking down a large database into smaller ones. This method has nothing to do with network partitions (netsplits), which are faults in the network between nodes. The main differences between partitioning and network partitions are as follows:\n\n- **Partitioning**: It is a technique used within databases to divide a large dataset into smaller parts, known as partitions or shards. Each partition contains a subset of the data, making it easier for the database system to manage and query these subsets efficiently. The purpose is to improve performance, scalability, and maintainability by distributing the load across multiple nodes or servers.\n- **Network Partitions**: These are faults in the network between nodes where communication between different parts of the network becomes impossible, leading to isolated segments that cannot exchange data or information. This type of fault typically results from hardware failures, configuration 