In [1]:
# =============================
# STAGE 2 — Re-Embedding Chroma
# =============================

import os
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

import torch
# Release any GPU memory still cached by PyTorch's allocator in this kernel.
torch.cuda.empty_cache()

# Report the device this notebook will run on: CUDA when available, else CPU.
print(
    "Device:",
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
)


Device: cuda


In [2]:
# 1. Load the fine-tuned embedding model produced by Stage 1.
FINETUNED_MODEL_PATH = "models/pg16-minilm-mnrl"

# Choose the device when loading instead of hard-coding "cuda", so the
# notebook also runs on CPU-only machines (the first cell already reports
# the same CUDA-or-CPU fallback).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model directly onto the selected device.
model = SentenceTransformer(FINETUNED_MODEL_PATH, device=DEVICE)

print("Loaded fine-tuned embedding model:", FINETUNED_MODEL_PATH)


Loaded fine-tuned embedding model: models/pg16-minilm-mnrl


In [3]:
# 2. Reconnect to the existing Chroma store.
# CHROMA_DIR is the on-disk path handed to the persistent client;
# COLLECTION_NAME is the collection whose embeddings get replaced below.
CHROMA_DIR = "chroma_pg16_minilm"
COLLECTION_NAME = "pg16_minilm"

# Open the persisted database, then fetch the collection by name.
client = chromadb.PersistentClient(CHROMA_DIR)
collection = client.get_collection(COLLECTION_NAME)

print("Connected to Chroma collection:", COLLECTION_NAME)


Connected to Chroma collection: pg16_minilm


In [5]:
# 3. Load all stored documents from Chroma.
# Only documents and metadata are requested; the old embeddings are not
# needed because every chunk is re-encoded below.
docs = collection.get(include=["documents", "metadatas"])

documents = docs["documents"]
metadatas = docs["metadatas"]
ids = docs["ids"]   # ids are always returned even if not listed in `include`

print("Loaded from Chroma:")
print(" • Documents:", len(documents))
# Guard the preview so an empty collection reports cleanly instead of
# raising IndexError on documents[0].
if documents:
    print(" • Example:", documents[0][:200])
else:
    print(" • Example: <collection is empty>")



Loaded from Chroma:
 • Documents: 6865
 • Example: PostgreSQL 16.11 Documentation The PostgreSQL Global Development Group


In [7]:
# 4. Define the embedding helper that uses the fine-tuned model.
def embed_batch(texts, batch_size=64):
    """Encode ``texts`` with the notebook-level ``model`` in fixed-size batches.

    Args:
        texts: Sequence of strings to embed; must support ``len`` and slicing.
        batch_size: Number of texts handed to ``model.encode`` per call. The
            same value is forwarded as the encoder's own ``batch_size``.
            Must be a positive integer.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text, in input order.
        For empty input the array has zero rows; its width is the embedding
        dimension reported by the model (0 if the model reports none).

    Raises:
        ValueError: If ``batch_size`` is not positive. Previously a negative
            value produced an empty result without any error.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Keep the result 2-D even when there is nothing to encode, so callers
    # can rely on `.shape` and row slicing.
    if len(texts) == 0:
        dim = model.get_sentence_embedding_dimension() or 0
        # NOTE(review): float32 is assumed to match the encoder's output dtype — confirm.
        return np.empty((0, dim), dtype=np.float32)

    encoded_batches = []
    for start in tqdm(range(0, len(texts), batch_size)):
        encoded_batches.append(
            model.encode(
                texts[start : start + batch_size],
                convert_to_numpy=True,
                batch_size=batch_size,
                show_progress_bar=False,  # the outer tqdm already reports progress
            )
        )

    # Stack the per-batch matrices row-wise in one step instead of collecting
    # every row in a Python list first.
    return np.concatenate(encoded_batches, axis=0)


In [8]:
# 5. Re-encode every stored chunk with the fine-tuned model.
print("Encoding all documents with fine-tuned model...")

# One embedding row is produced per entry of `documents`, in the same order.
new_embeddings = embed_batch(texts=documents, batch_size=64)

print("New embedding matrix shape:", new_embeddings.shape)


Encoding all documents with fine-tuned model...


100%|█████████████████████████████████████████| 108/108 [00:23<00:00,  4.58it/s]

New embedding matrix shape: (6865, 384)





In [9]:
# 6. Clear the old entries from Chroma before they are re-inserted.
# This step is destructive: the entries for `ids` are removed from the
# collection and are written back (documents, metadata, new embeddings)
# by the next step from the in-memory lists loaded earlier.
print("Clearing old embeddings...")

# Refuse to delete unless a replacement embedding exists for every id;
# otherwise a failed or partial encoding run would lose data.
if len(new_embeddings) != len(ids):
    raise RuntimeError(
        f"Refusing to delete: {len(ids)} ids but {len(new_embeddings)} new embeddings."
    )

# Skip the call when there is nothing to delete.
# NOTE(review): some Chroma versions reject an empty `ids` list — confirm.
if ids:
    collection.delete(ids=ids)

print("Old embeddings deleted.")


Clearing old embeddings...
Old embeddings deleted.


In [10]:
# 7. Write the chunks back with their fine-tuned embeddings.
print("Upserting new embeddings into Chroma...")

# Number of records sent to Chroma per call.
BATCH = 500

# Every id must have a matching document, metadata entry and embedding row;
# a mismatch would attach embeddings to the wrong chunks.
if not (len(ids) == len(documents) == len(metadatas) == len(new_embeddings)):
    raise RuntimeError(
        "ids, documents, metadatas and new_embeddings must all have the same length."
    )

for start in tqdm(range(0, len(documents), BATCH)):
    stop = start + BATCH

    # Use upsert (as the log message says) rather than add: it inserts ids
    # that are missing and overwrites ids that already exist, so this cell
    # is safe to re-run and works whether or not the old entries were deleted.
    collection.upsert(
        ids=ids[start:stop],
        documents=documents[start:stop],
        metadatas=metadatas[start:stop],
        embeddings=new_embeddings[start:stop].tolist(),
    )

print("Finished re-embedding Chroma!")


Upserting new embeddings into Chroma...


100%|███████████████████████████████████████████| 14/14 [00:13<00:00,  1.00it/s]

Finished re-embedding Chroma!





In [11]:
# 8. Test retrieval with the fine-tuned model.
query = "How do I configure a PostgreSQL index?"

# Maximum number of hits to request from Chroma.
N_RESULTS = 3

# Embed the query with the same model that produced the stored embeddings.
q_emb = model.encode([query], convert_to_numpy=True)[0].tolist()

# Query Chroma with the precomputed query embedding.
results = collection.query(
    query_embeddings=[q_emb],
    n_results=N_RESULTS,
    include=["documents", "metadatas", "distances"]
)

# Index [0] selects the hits for the first (and only) query. Iterate over
# what actually came back instead of assuming exactly three hits, so a
# small or empty collection does not raise IndexError.
hits = zip(
    results["documents"][0],
    results["metadatas"][0],
    results["distances"][0],
)
for rank, (hit_doc, hit_meta, hit_distance) in enumerate(hits, start=1):
    print(f"\n=== Result {rank} ===")
    print(hit_doc[:300])
    print("Metadata:", hit_meta)
    print("Distance:", hit_distance)



=== Result 1 ===
The catalog pg_index contains part of the information about indexes. The rest is mostly in pg_class . 2274
Metadata: {'page': 2312}
Distance: 0.6607102155685425

=== Result 2 ===
can be added to the index. Indexes can have up to 32 columns, including INCLUDE columns. (This limit can be altered when building PostgreSQL; see the file pg_config_manual.h .) 431
Metadata: {'page': 469}
Distance: 0.7031267881393433

=== Result 3 ===
the postgresql.conf file or on the server command line. 630
Metadata: {'page': 668}
Distance: 0.7253999710083008
