In [1]:
from decouple import config
import os

from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

from langchain.chains import RetrievalQA, ConversationalRetrievalChain

In [2]:
# Runtime settings. python-decouple's config() looks a name up in os.environ
# first and only then in the .env / settings.ini repository, so a single call
# per setting covers both sources; a missing setting raises UndefinedValueError.
OLLAMA_API_BASE_URL = config('OLLAMA_API_BASE_URL')
LLM = config('LLM')
EMBEDDING_MODEL = config('EMBEDDING_MODEL')

# The visible cells only use Ollama and HuggingFace, so an OpenAI key is
# optional: export it when one is configured instead of failing when it is
# absent. setdefault() never overwrites a key already present in the environment.
openai_api_key = config('OPENAI_API_KEY', default=None)
if openai_api_key is not None:
    os.environ.setdefault("OPENAI_API_KEY", openai_api_key)

print(f'Using LLM: {LLM}')
print(f'Using embedding model: {EMBEDDING_MODEL}')

Using LLM: deepseek-coder-v2
Using embedding model: sentence-transformers/all-MiniLM-L6-v2


In [3]:
# Chat model: the model named by LLM, reached through the Ollama endpoint
# at OLLAMA_API_BASE_URL.
llm = ChatOllama(model=LLM, base_url=OLLAMA_API_BASE_URL)

In [4]:
def load_pdf_data(file_path, use_splitter=False, chunk_size=1000, chunk_overlap=200):
    """Load a PDF through PyPDFLoader, optionally splitting it into chunks.

    Args:
        file_path: Path of the PDF, handed to PyPDFLoader unchanged.
        use_splitter: When true, the result of ``loader.load_and_split`` with a
            RecursiveCharacterTextSplitter is returned; otherwise the result of
            ``loader.load()``.
        chunk_size: ``chunk_size`` passed to the splitter (only used when
            ``use_splitter`` is true).
        chunk_overlap: ``chunk_overlap`` passed to the splitter (only used when
            ``use_splitter`` is true).

    Returns:
        Whatever the loader call returns (the documents of the PDF).
    """
    loader = PyPDFLoader(file_path)
    if not use_splitter:
        return loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return loader.load_and_split(text_splitter)


In [5]:
# Location of the PDF to index. It can be overridden with a PDF_PATH setting
# (environment variable or .env entry); the default keeps the original path.
PDF_PATH = config('PDF_PATH', default="/Users/stolli/IT/Designing Data-Intensive Applications.pdf")
pdf_data = load_pdf_data(PDF_PATH)

incorrect startxref pointer(1)
parsing for Object Streams


In [6]:
def create_vectorstore(pdf_data, embedding_model_name, persist_directory="chroma_db"):
    """Build a Chroma vector store from documents using a HuggingFace embedding model.

    Args:
        pdf_data: Documents handed to ``Chroma.from_documents``.
        embedding_model_name: Model name passed to ``HuggingFaceEmbeddings``.
        persist_directory: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The object returned by ``Chroma.from_documents``.
    """
    # NOTE(review): re-running against an existing persist_directory presumably
    # adds the same documents again -- confirm before relying on Run All.
    embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)
    store = Chroma.from_documents(
        pdf_data,
        embedding=embedder,
        persist_directory=persist_directory,
    )
    return store

In [7]:
# Embed the loaded PDF documents with EMBEDDING_MODEL and index them in Chroma.
vectorstore = create_vectorstore(
    pdf_data,
    embedding_model_name=EMBEDDING_MODEL,
)

  from tqdm.autonotebook import tqdm, trange


In [9]:
# Sanity check of the index: the five closest documents together with their scores.
vectorstore.similarity_search_with_score("What is partitioning?", k=5)

[(Document(metadata={'page': 238, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='The goal of partitioning is to spread the data and query load evenly across multiple\nmachines, avoiding hot spots (nodes with disproportionately high load). This\nrequires choosing a partitioning scheme that is appropriate to your data, and reba‐\nlancing the partitions when nodes are added to or removed from the cluster.\nWe discussed two main approaches to partitioning:\n•Key range partitioning , where keys are sorted, and a partition owns all the keys\nfrom some minimum up to some maximum. Sorting has the advantage that effi‐\ncient range queries are possible, but there is a risk of hot spots if the application\noften accesses keys that are close together in the sorted order.\nIn this approach, partitions are typically rebalanced dynamically by splitting the\nrange into two subranges when a partition gets too big.\n•Hash partitioning , where a hash function is ap

In [10]:
# Retriever view of the vector store; it is handed to the QA chains built below.
retriever = vectorstore.as_retriever()

In [11]:
prompt_template = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    If you find the answer, write the answer in a concise way and add the list of pages that are used to derive the answer. 
    Context: {context}
    Question: {question}
    Answer:"""

prompt = PromptTemplate(
    template=prompt_template, 
    input_variables=['context', 'question']
)

# The prompt asks the model to list the pages it used, so every retrieved
# chunk must carry its page number into {context}. The stuff chain's default
# document prompt renders only page_content, which leaves the model nothing
# but guesses for the page list. 'page' is the metadata key the PDF loader
# attaches to each document.
document_prompt = PromptTemplate(
    template="[page {page}]\n{page_content}",
    input_variables=['page', 'page_content']
)

# Single-shot question answering over the retriever; source documents are
# returned alongside the answer so the cited pages can be checked.
qa_interface = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever, 
    return_source_documents=True, 
    chain_type_kwargs={"prompt": prompt, "document_prompt": document_prompt}
)

In [12]:
# Ask one question through the retrieval QA chain ("query" is its input key).
qa_interface.invoke({"query": "What is partitioning?"})

In [10]:
# Conversational variant of the QA chain: it takes the chat history in
# addition to the question. The answer prompt asks for the pages used, so each
# retrieved chunk is rendered with its 'page' metadata; the default document
# prompt would pass page_content only and leave the model guessing page numbers.
conv_interface = ConversationalRetrievalChain.from_llm(
    llm, 
    retriever=retriever, 
    return_source_documents=True, 
    combine_docs_chain_kwargs={
        "prompt": prompt,
        "document_prompt": PromptTemplate(
            template="[page {page}]\n{page_content}",
            input_variables=['page', 'page_content']
        ),
    }
)

In [11]:
# First turn of the conversation: start from an empty history, ask the
# question, then record the (question, answer) pair for later turns.
chat_history = []
query = "What is partitioning?"

result = conv_interface.invoke(dict(question=query, chat_history=chat_history))
first_turn = (query, result["answer"])
chat_history.append(first_turn)

In [12]:
# Show the answer, a separator line, and the documents it was derived from.
print(result["answer"], '-----', result["source_documents"], sep='\n')

 Partitions are small subsets of a larger dataset that are created to improve performance, scalability, and manageability. Each partition contains a subset of the data from the entire dataset, which can be stored on different machines or servers within a shared-nothing cluster architecture. This allows for more efficient processing and storage, as well as easier management and maintenance of the database. The main goal is to spread the data and query load evenly across multiple machines, avoiding hot spots where disproportionately high loads occur. Different partitioning schemes are used depending on factors such as the size of the dataset, types of queries being executed, and desired level of performance. Some common methods include key range partitioning and hash partitioning.

Used pages: 6-19 to 6-23
-----
[Document(metadata={'page': 220, 'source': '/Users/stolli/IT/Designing Data-Intensive Applications.pdf'}, page_content='i. Partitioning, as discussed in this chapter, is a way of

In [13]:
# Follow-up turn that relies on the history recorded by the first turn.
# NOTE(review): unlike the first turn, this exchange is not appended to
# chat_history -- confirm whether that is intended before adding more turns.
query = "Can you repeat your answer as structured list please?"

result = conv_interface.invoke(dict(question=query, chat_history=chat_history))


In [14]:
# Display the answer text of the most recent chain call.
result["answer"]

' El particionamiento en una base de datos puede lograrse mediante varios métodos, incluyendo el particionamiento horizontal (dividir las tablas entre varias bases de datos) y el particionamiento vertical (dividir las columnas de la tabla). Los diferentes métodos utilizados para el particionamiento pueden ser:\n\n1. **Particionamiento por rango**: Las filas se asignan a particiones basadas en un rango de valores, como fechas o ID de cliente.\n2. **Particionamiento por hash**: Utiliza una función hash para distribuir las filas uniformemente entre las particiones.\n3. **Particionamiento circular**: Similar al hashing pero con una distribución cíclica.\n4. **Particionamiento manual**: El administrador de la base de datos asigna explícitamente cada fila a una partición específica.\n5. **Particionamiento por lista**: Utiliza un esquema de lista para asignar filas basado en condiciones específicas, como países o regiones.\n\nPara más detalles y ejemplos, se puede consultar la sección corresp