In [None]:
# Simple RAG Application from an uploaded pdf

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
os.environ["LANGSMITH_ENDPOINT"] = os.getenv("LANGSMITH_ENDPOINT")
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = os.getenv("LANGSMITH_PROJECT")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API_KEY")

In [3]:
os.environ["LANGSMITH_ENDPOINT"], os.environ["LANGSMITH_PROJECT"]

('https://api.smith.langchain.com', 'GenAIAppWithLangChain')

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader



In [5]:
loader = PyMuPDFLoader("./Data/PDFs/Report.pdf")
loader

<langchain_community.document_loaders.pdf.PyMuPDFLoader at 0x11339d910>

In [6]:
pdf_docs = loader.load()

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [9]:
documents = text_splitter.split_documents(pdf_docs)

In [10]:
for document in documents:
    print(document.page_content, end="\n\n")

Engineering Data Intensive Systems - 2IMD10
EDS - PROJECT REPORT
Team Number - 16
Full Name
Discord Username
Email
Divyansh Purohit
wah shampy
d.purohit@student.tue.nl
Likhit Vesalapu
likhit7.
l.vesalapu@student.tue.nl
Prathamesh Samal
viper 101
p.samal@student.tue.nl
Elena Terzieva
ellie218388
e.e.terzieva@student.tue.nl
Eindhoven, February 1, 2026

1
ABSTRACT
Accurate cardinality estimation is fundamental to query opti-
mization in graph databases, enabling the selection of efficient
execution plans for regular path queries. In this report, we
present a hybrid cardinality estimator that combines multi-
ple statistical synopses including per-label statistics, pairwise
label correlations, and characteristic sets with a weighted
and stratified sampling strategy for complex queries. Our ap-
proach balances estimation accuracy against preparation time
and memory overhead, achieving competitive performance
on both synthetic and real-world workloads.
2
INTRODUCTION
The efficiency of query p

In [11]:
print(len(documents))

58


In [12]:
from langchain_openai import OpenAIEmbeddings

In [13]:
embeddings = OpenAIEmbeddings()

In [14]:
from langchain_community.vectorstores import FAISS

In [15]:
db = FAISS.from_documents(documents=documents, embedding=embeddings)

In [16]:
db.index.d, db.index.ntotal

(1536, 58)

In [17]:
query = "With cardinality on paths, the query path is split into simpler segments"

In [None]:
retrieved_documents  = db.similarity_search(query)

In [19]:
# retrieved_documents
retrieved_documents = documents[4:8]
retrieved_documents

[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2026-02-01T15:53:53+00:00', 'source': './Data/PDFs/Report.pdf', 'file_path': './Data/PDFs/Report.pdf', 'total_pages': 10, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': 'VGTC Special Issue Paper for TVCG', 'keywords': '', 'moddate': '2026-02-01T15:53:53+00:00', 'trapped': '', 'modDate': 'D:20260201155353Z', 'creationDate': 'D:20260201155353Z', 'page': 1}, page_content='of the results produced by a query. In this project, we are\ndealing with RDF databases that differ from the typical re-\nlational schemas because of their structure, which leads to\ndifferent assumptions and decisions made when estimating\nthe cardinality of a query. In query optimisation, synopses are\nprecomputed summaries of database data (like histograms,\nsketches, samples) that database systems use to quickly es-\ntimate the cost of different query execution plans. Bonifati\net al. survey cardinality es

### Retrieval Chain and Document Chain

In [20]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document

In [21]:
from langchain_openai import ChatOpenAI

In [22]:
llm = ChatOpenAI(model="gpt-4o")

In [23]:
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following questions based only of the provided context::
    <context>
    {context}
    </context>
    """
)

In [24]:
# LangChain has three main strategies to feed documents to an LLM:

# 1 Stuff → put all documents into one prompt
# 2 Map → run the LLM on each document separately, then combine outputs
# 3 Refine → run LLM iteratively, refining previous answers

# create_stuff_documents_chain → uses the stuffing strategy (all docs at once).

In [25]:
document_chain = create_stuff_documents_chain(llm, prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following questions based only of the provided context::\n    <context>\n    {context}\n    </context>\n    '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x1197d4430>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x11ab1f400>, root_client=<openai.OpenAI object at 0x119e6caf0>, root_async_client=<openai.AsyncOpenAI object at 0x1197d4ac0>, model_name='gpt-4o', model_kwargs={}, openai_api_key=SecretStr('**********'), stream_usage=True)
| StrOutputParser(), kwargs=

In [26]:
result = document_chain.invoke(
    {
        "input": "Importantly, it supports user- or application-specified trade-\noffs between performance and predictability, contributing to\nthe design of more robust query optimisers [1].\nHistograms are one of the most widely used methods for\ncardinality estimation in relational DBMS.",
        "context": retrieved_documents
    }
)

In [27]:
result

"Based on the provided context, here are the answers to some potential questions:\n\n1. **What are RDF databases, and how do they differ from typical relational databases?**\n   RDF databases have a different structure compared to typical relational schemas. This structural difference affects assumptions and decisions in estimating query cardinality.\n\n2. **What are synopses in query optimization?**\n   Synopses are precomputed summaries of database data, such as histograms, sketches, or samples, used by database systems to quickly estimate the cost of different query execution plans.\n\n3. **What are the two cardinality estimation approaches specifically designed for graph query languages discussed by Bonifati et al.?**\n   The two approaches are: \n   - Cardinality on paths, which involves splitting query paths into simpler segments and using predefined formulas for concentration, union, and inversion to estimate results.\n   - Cardinality on patterns, which focuses on graph pattern

In [36]:
from langchain.chains import create_retrieval_chain

In [None]:
# Using the database as a retriever allows you to answer any query that falls within the context of the documents
# stored in the vector store. When a query is made, the retriever dynamically searches the database for the most 
# relevant documents, which are then passed to the document chain and processed by the LLM to generate a natural 
# language answer. This makes the system fully flexible, as it can handle any question covered by the stored content. 


# In contrast, if you use a fixed set of retrieved documents, the system can only generate answers based on those specific 
# documents. Even if a question is relevant to the overall database, the LLM will only see the limited fixed context and 
# cannot access the rest of the information. The key difference is that a dynamic retriever provides context based on the 
# query, enabling a true retrieval-augmented generation workflow, while a fixed set of documents limits the LLM to a narrow, 
# preselected scope.

In [28]:
# retriever → already knows how to query FAISS and fetch relevant documents
# document_chain → knows how to feed those documents to the LLM
# retriever_chain → combines them in one step

In [39]:
retriever = db.as_retriever()
# document_chain provides the context information
retriever_chain = create_retrieval_chain(retriever, document_chain)

In [None]:
response = retriever_chain.invoke({"input": "What is the technique invoked by Babcock and Chaudhuri?"})

In [None]:
response