In [None]:
# Install the Chroma client library used by the cells below. `%pip` (rather
# than `!pip`) installs into the environment of the running kernel.
# NOTE(review): no version is pinned here — consider pinning one for reproducibility.
%pip install chromadb


In [1]:
# ===============================
# Graduation Project – Chroma Vector Store
# ===============================

# 1️⃣ Imports
import os
import numpy as np
import json
import chromadb


In [2]:
# 2️⃣ Paths
# Every project folder lives under a single root. The root defaults to the
# original location and can be overridden with the GRADUATION_PROJECT_ROOT
# environment variable, so the notebook also runs on machines that do not
# have this exact drive layout.
PROJECT_ROOT = os.environ.get("GRADUATION_PROJECT_ROOT", r"E:\graduation_project")

EMBEDDINGS_PATH = os.path.join(PROJECT_ROOT, "embeddings")            # *_embeddings.npy files
JSON_CHUNKS_PATH = os.path.join(PROJECT_ROOT, "json_llm_responses")   # enriched JSON chunk files
CHROMA_DB_PATH = os.path.join(PROJECT_ROOT, "vector_store", "chroma_db")  # persistent Chroma store

# Create the vector-store folder up front; exist_ok keeps re-runs harmless.
os.makedirs(CHROMA_DB_PATH, exist_ok=True)

In [5]:
# 3️⃣ Initialize Chroma client
# PersistentClient stores the database on disk under CHROMA_DB_PATH, so the
# collection survives kernel restarts.
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# Create or get collection.
# get_or_create_collection does this in one call. It also avoids inspecting
# the elements returned by client.list_collections(), whose type has differed
# between Chroma releases (collection names vs. Collection objects).
collection_name = "cs_chunks"
collection = client.get_or_create_collection(name=collection_name)

print(f"Chroma collection ready: {collection_name}")

Chroma collection ready: cs_chunks


In [6]:
# ===============================
# 4️⃣ Function to safely add chunks
# ===============================
def _metadata_text(value):
    """Convert a raw metadata value into the plain string stored in Chroma."""
    if value is None:
        # A missing value becomes an empty string rather than the text "None".
        return ""
    if isinstance(value, list):
        return ", ".join(str(item) for item in value)
    return str(value)


def add_chunks_to_chroma(embedding_file_path, json_file_path, collection,
                         pdf_name_prefix=None, batch_size=1000):
    """
    Add embeddings and their chunks to a Chroma collection.

    Embedding ``i`` in the .npy file is paired with chunk ``i`` in the JSON
    file. Records are written with ``collection.upsert`` so re-running the
    notebook against the persistent store refreshes existing IDs instead of
    depending on how ``add`` treats IDs that are already present.

    Parameters
    - embedding_file_path: path to the .npy embeddings (one row per chunk)
    - json_file_path: path to the enriched JSON chunks (a list of dicts)
    - collection: Chroma collection object (must provide ``upsert``)
    - pdf_name_prefix: optional string to prepend to IDs
    - batch_size: maximum number of records sent per ``upsert`` call; Chroma
      limits how many records one call may carry, so large files are split

    Returns
    - None. If the JSON file does not exist, a warning is printed and nothing
      is written.

    Raises
    - ValueError: if ``batch_size`` is smaller than 1, or if the number of
      embeddings differs from the number of chunks (nothing is written then)
    - KeyError: if a chunk lacks the ``id`` or ``source`` key used for its ID
    - FileNotFoundError: if the embeddings file does not exist
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Check the JSON first so a missing file does not cost an embeddings load.
    if not os.path.exists(json_file_path):
        print(f"Warning: JSON file not found: {json_file_path}")
        return

    # Load embeddings
    embeddings = np.load(embedding_file_path).tolist()

    # Load JSON chunks
    with open(json_file_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)

    # Pairing is positional, so a length mismatch means the two files are out
    # of sync; refuse to write rather than store wrong or partial data.
    if len(embeddings) != len(chunks):
        raise ValueError(
            f"Embedding/chunk count mismatch for {os.path.basename(json_file_path)}: "
            f"{len(embeddings)} embeddings vs {len(chunks)} chunks"
        )

    # Build every record before touching the collection, so a malformed chunk
    # fails the whole file up front instead of after a partial write.
    ids, metadatas, documents = [], [], []
    for chunk in chunks:
        chunk_id = f"{chunk['id']}_{chunk['source']}"
        if pdf_name_prefix:
            chunk_id = f"{pdf_name_prefix}_{chunk_id}"
        ids.append(chunk_id)

        metadatas.append({
            "source": _metadata_text(chunk.get("source")),
            "topic": _metadata_text(chunk.get("topic")),
            "subtopic": _metadata_text(chunk.get("subtopic")),
        })

        # Prefer the LLM response; fall back to the original text when the
        # response is missing or empty.
        documents.append(chunk.get("llm_response") or chunk.get("original_text") or "")

    # Add to Chroma in batches instead of one call per chunk.
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.upsert(
            ids=ids[start:stop],
            embeddings=embeddings[start:stop],
            metadatas=metadatas[start:stop],
            documents=documents[start:stop],
        )

    print(f"Added {len(embeddings)} chunks from {os.path.basename(json_file_path)}")


In [7]:
# ===============================
# 5️⃣ Process all existing embeddings
# ===============================
EMBEDDING_SUFFIX = "_embeddings.npy"

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore embedding_files[0]) the same on every run.
embedding_files = sorted(
    f for f in os.listdir(EMBEDDINGS_PATH) if f.endswith(EMBEDDING_SUFFIX)
)

for emb_file in embedding_files:
    emb_path = os.path.join(EMBEDDINGS_PATH, emb_file)

    # Strip only the trailing "_embeddings.npy". str.replace would also remove
    # the same text if it appeared earlier in the file name.
    file_stem = emb_file[: -len(EMBEDDING_SUFFIX)]

    # Corresponding JSON: "<stem>_embeddings.npy" pairs with "<stem>.json"
    json_path = os.path.join(JSON_CHUNKS_PATH, f"{file_stem}.json")

    # The stem doubles as an ID prefix so chunks from different files stay unique.
    add_chunks_to_chroma(emb_path, json_path, collection, file_stem)


Added 46 chunks from Computer Systems A Programmers Perspective by Randal E. Bryant, David R. OHallaron (z-lib.org)_chunks_chunks_enriched.json


In [8]:
# ===============================
# 6️⃣ Test retrieval (example)
# ===============================
# Smoke test: query the collection with a vector that was itself stored,
# then print the source/topic metadata and a short preview of each hit.
if embedding_files:
    # The first row of the first embedding file serves as the query vector.
    test_emb_path = os.path.join(EMBEDDINGS_PATH, embedding_files[0])
    test_embeddings = np.load(test_emb_path)
    query_vector = test_embeddings[0].tolist()

    results = collection.query(query_embeddings=[query_vector], n_results=5)

    print("Top-5 retrieved chunks:")
    # Results are nested per query; index 0 selects the hits for our single query.
    retrieved_docs = results['documents'][0]
    retrieved_metas = results['metadatas'][0]
    for doc, meta in zip(retrieved_docs, retrieved_metas):
        source = meta['source']
        topic = meta['topic']
        subtopic = meta['subtopic']
        print(f"Source: {source}, Topic: {topic}, Subtopic: {subtopic}")
        print(doc[:200])  # preview: first 200 characters only
        print("------")

print("Chroma vector store build complete.")

Top-5 retrieved chunks:
Source: Computer Systems A Programmers Perspective by Randal E. Bryant, David R. OHallaron (z-lib.org).pdf, Topic: Computer Science Textbook - Computer Systems, Subtopic: Third Edition Publication Details
THIRD EDITION 
. 
COMPUTER SYSTEMS 
BRYANT • O'HALLARON 
I

C·oni.puter Systems 
A Programmer's Perspective 
THIRD EDITION 
"Randal E. Bryant 
Carnegie Mellon University 
David R. O'Hallaron 
Carnegie
------
Source: Computer Systems A Programmers Perspective by Randal E. Bryant, David R. OHallaron (z-lib.org).pdf, Topic: , Subtopic: 
performance, or use of these programs. 
Pearson Education Ltd., London 
Pearson Education Singapore, Pte. Ltd 
Pearson Education Canada, Inc. 
Pearson Education-Japan 
Pearson Education Australia PTY,
------
Source: Computer Systems A Programmers Perspective by Randal E. Bryant, David R. OHallaron (z-lib.org).pdf, Topic: Operating System Compatibility and Programming Environment Setup, Subtopic: 
xx Preface 
l I 
II 
t 
of operatin