In [1]:
# 1. Install dependencies (run once per environment if needed)
!{sys.executable} -m pip install chromadb sentence-transformers PyPDF2


/bin/bash: line 1: {sys.executable}: command not found


In [2]:
# 2. Imports
import os
import textwrap

import pandas as pd
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions

import torch

print("Python executable:", os.popen("which python").read().strip())
print("CUDA available:", torch.cuda.is_available())



Python executable: /home/christopher_bonillajulien_22/miniconda3/envs/torch_env/bin/python
CUDA available: True


In [3]:
# 3. Paths & file check

PDF_PATH = "postgresql-16-A4.pdf"   # <-- change if your PDF is named differently

print("Current working directory:", os.getcwd())
print("PDF exists?", os.path.exists(PDF_PATH))


Current working directory: /home/christopher_bonillajulien_22/RAG system
PDF exists? True


In [4]:
# 4. Load PDF pages into memory

def load_pdf_text(pdf_path):
    reader = PdfReader(pdf_path)
    pages = []
    for i, page in enumerate(reader.pages):
        text = page.extract_text() or ""
        pages.append({"page": i + 1, "text": text})
    return pages

pages = load_pdf_text(PDF_PATH)
print("Total pages loaded:", len(pages))
pages[:2]


Total pages loaded: 3026


[{'page': 1,
  'text': 'PostgreSQL 16.11 Documentation\nThe PostgreSQL Global Development Group'},
 {'page': 2,
  'text': 'PostgreSQL 16.11 Documentation\nThe PostgreSQL Global Development Group\nCopyright © 1996–2025 The PostgreSQL Global Development Group\nLegal Notice\nPostgreSQL Database Management System (also known as Postgres, formerly known as Postgres95)\nPortions Copyright © 1996-2025, PostgreSQL Global Development Group\nPortions Copyright © 1994, The Regents of the University of California\nPermission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written\nagreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies.\nIN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL,\nINCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE\nAND I

In [5]:
# 5. Chunk text into overlapping word windows

def chunk_text(text, chunk_size=250, overlap=50):
    """
    Simple word-based chunker.
    chunk_size: number of words per chunk
    overlap:    overlapping words between chunks
    """
    words = text.split()
    if not words:
        return []

    chunks = []
    i = 0
    while i < len(words):
        chunk = " ".join(words[i : i + chunk_size])
        chunks.append(chunk)
        i += chunk_size - overlap
    return chunks


In [6]:
# 6. Create document chunks + metadata + ids

documents = []
metadatas = []
ids = []

chunk_id = 0

for page in pages:
    chunks = chunk_text(page["text"], chunk_size=250, overlap=50)
    for c in chunks:
        if not c.strip():
            continue
        documents.append(c)
        metadatas.append({"page": page["page"]})
        ids.append(f"chunk_{chunk_id}")
        chunk_id += 1

print("Total chunks:", len(documents))
print("Sample chunk:", documents[0][:300])


Total chunks: 6865
Sample chunk: PostgreSQL 16.11 Documentation The PostgreSQL Global Development Group


In [7]:
# 7. Load MiniLM and define Chroma embedding function

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

class MiniLMEmbeddingFunction(embedding_functions.EmbeddingFunction):
    def __init__(self, model, batch_size=128):
        self.model = model
        self.batch_size = batch_size

    def __call__(self, texts):
        # Batch encode for speed
        return self.model.encode(
            texts,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            show_progress_bar=False,
        ).tolist()

embedding_fn = MiniLMEmbeddingFunction(embedder, batch_size=128)


In [8]:
# 8. Create persistent Chroma DB & collection

CHROMA_DIR = "chroma_pg16_minilm"
COLLECTION_NAME = "pg16_minilm"

client = chromadb.PersistentClient(path=CHROMA_DIR)

# Drop old collection if it exists (fresh rebuild)
try:
    client.delete_collection(COLLECTION_NAME)
    print("Deleted existing collection:", COLLECTION_NAME)
except Exception:
    pass

collection = client.create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_fn,
)

print("Created collection:", COLLECTION_NAME)
print("Chroma directory:", os.path.abspath(CHROMA_DIR))



Created collection: pg16_minilm
Chroma directory: /home/christopher_bonillajulien_22/RAG system/chroma_pg16_minilm


In [9]:
# 9. Add all documents to Chroma in batches

BATCH_SIZE = 1000  # safely below Chroma's max batch size
n_docs = len(documents)

for i in range(0, n_docs, BATCH_SIZE):
    batch_docs = documents[i : i + BATCH_SIZE]
    batch_meta = metadatas[i : i + BATCH_SIZE]
    batch_ids  = ids[i : i + BATCH_SIZE]

    collection.add(
        documents=batch_docs,
        metadatas=batch_meta,
        ids=batch_ids,
    )

    print(f"Added batch {i // BATCH_SIZE + 1} ({len(batch_docs)} docs)")

print("✅ Finished adding all documents to Chroma.")



Added batch 1 (1000 docs)
Added batch 2 (1000 docs)
Added batch 3 (1000 docs)
Added batch 4 (1000 docs)
Added batch 5 (1000 docs)
Added batch 6 (1000 docs)
Added batch 7 (865 docs)
✅ Finished adding all documents to Chroma.


In [11]:
# 10. Sanity check: get a small sample back and confirm embedding size
sample = collection.get(include=["embeddings", "metadatas", "documents"], limit=3)

embs = sample.get("embeddings")

if embs is not None and len(embs) > 0:
    print("Embedding count:", len(embs))
    print("Embedding dim:", len(embs[0]))
else:
    print("❌ No embeddings returned.")


Embedding count: 3
Embedding dim: 384


In [12]:
# 11. Test retrieval (optional sanity check)

import textwrap

test_query = "What is PostgreSQL used for?"

results = collection.query(
    query_texts=[test_query],
    n_results=3,
)

print("Query:", test_query)
print("=" * 80)

for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"[Page {meta.get('page', '?')}]")
    print(textwrap.fill(doc, width=100))
    print("-" * 80)


Query: What is PostgreSQL used for?
[Page 32]
•Part VII contains assorted information that might be of use to PostgreSQL developers. 1. What Is
PostgreSQL? PostgreSQL is an object-relational database management system (ORDBMS) based on
POSTGRES, Version 4.21, developed at the University of California at Berkeley Computer Science
Department. POSTGRES pioneered many concepts that only became available in some commercial database
sys- tems much later. PostgreSQL is an open-source descendant of this original Berkeley code. It
supports a large part of the SQL standard and offers many modern features: •complex queries •foreign
keys •triggers •updatable views •transactional integrity •multiversion concurrency control Also,
PostgreSQL can be extended by the user in many ways, for example by adding new •data types
•functions •operators •aggregate functions •index methods •procedural languages And because of the
liberal license, PostgreSQL can be used, modified, and distributed by anyone free of

In [13]:
# 12. Export chunks as CSV for later use (optional)

df_chunks = pd.DataFrame({
    "id": ids,
    "page": [m["page"] for m in metadatas],
    "text": documents,
})

df_chunks.to_csv("pg16_chunks.csv", index=False)
df_chunks.head()


Unnamed: 0,id,page,text
0,chunk_0,1,PostgreSQL 16.11 Documentation The PostgreSQL ...
1,chunk_1,2,PostgreSQL 16.11 Documentation The PostgreSQL ...
2,chunk_2,3,Table of Contents Preface........................
3,chunk_3,3,.................................................
4,chunk_4,4,PostgreSQL 16.11 Documentation 5.14. Dependenc...
