In [4]:
import json
import chromadb
import numpy as np
import shutil
import os

In [5]:
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "DBMS-25"
JSON_PATH = "embeddings/projects-embedded.json"   # <-- your final clean JSON


In [6]:
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

# New clean client
client = chromadb.PersistentClient(path=CHROMA_PATH)


In [7]:
collection = client.create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"}
)

print("üöÄ Rebuilding Chroma collection‚Ä¶")

üöÄ Rebuilding Chroma collection‚Ä¶


In [8]:
with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

In [9]:
for item in data:

    emb = np.array(item["embedding"], dtype=float)
    dim = emb.shape[0]
    if dim != 1024:
        raise ValueError(f"Embedding dimension mismatch: expected 1024, found {dim}")

    # FIX: ensure metadata values are only str/int/float/bool
    metadata = {
        "project_id": str(item["project_id"]),
        "title": str(item["title"]),
        "description": str(item["description"]),
        "domain": str(item["domain"]),
        "year": str(item["year"]),
        "tech_stack": str(item["tech_stack"]),
        "objective": str(item["objective"]),
        "source": str(item.get("source", "")),
    }

    collection.add(
        ids=[item["project_id"]],
        embeddings=[emb.tolist()],
        metadatas=[metadata],
        documents=[item["description"]],
    )

print("‚úÖ DONE ‚Äî Chroma rebuilt cleanly with 1024-dim embeddings!")
print(f"üìÅ Store Path: {CHROMA_PATH}")
print(f"üìä Total items: {collection.count()}")

Insert of existing embedding ID: auto001
Add of existing embedding ID: auto001
Insert of existing embedding ID: auto002
Add of existing embedding ID: auto002
Insert of existing embedding ID: auto003
Add of existing embedding ID: auto003
Insert of existing embedding ID: auto004
Add of existing embedding ID: auto004
Insert of existing embedding ID: auto005
Add of existing embedding ID: auto005
Insert of existing embedding ID: auto006
Add of existing embedding ID: auto006
Insert of existing embedding ID: auto007
Add of existing embedding ID: auto007
Insert of existing embedding ID: auto008
Add of existing embedding ID: auto008
Insert of existing embedding ID: auto009
Add of existing embedding ID: auto009
Insert of existing embedding ID: auto010
Add of existing embedding ID: auto010
Insert of existing embedding ID: auto011
Add of existing embedding ID: auto011
Insert of existing embedding ID: auto012
Add of existing embedding ID: auto012
Insert of existing embedding ID: auto013
Add of exis

‚úÖ DONE ‚Äî Chroma rebuilt cleanly with 1024-dim embeddings!
üìÅ Store Path: ./chroma_store
üìä Total items: 51


In [1]:
import chromadb
import json
import numpy as np
import shutil
import os
from sentence_transformers import SentenceTransformer

# ===============================
# CONFIG
# ===============================
OLD_CHROMA_PATH = "./chroma_store"
NEW_CHROMA_PATH = "./chroma_store_768"
COLLECTION_NAME = "DBMS-25"

JSON_FILE = "embeddings/projects-embedded.json"

# Use 768-dim embedder
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBED_MODEL)


# ===============================
# 1. Load raw JSON
# ===============================
print("üì• Loading raw project data...")
data = json.load(open(JSON_FILE, "r"))
print(f"Loaded {len(data)} projects")


# ===============================
# 2. Remove old 1024-dim store
# ===============================
if os.path.exists(NEW_CHROMA_PATH):
    shutil.rmtree(NEW_CHROMA_PATH)

client = chromadb.PersistentClient(path=NEW_CHROMA_PATH)

collection = client.create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"},
)

# ===============================
# Helper: Fix metadata values
# ===============================
def fix_meta(val):
    """Convert ANY metadata value into a valid Chroma type (string)."""
    if val is None:
        return ""

    # If already valid: return as-is
    if isinstance(val, (str, int, float, bool)):
        return val

    # If it's a list -> join elements
    if isinstance(val, list):
        return " ".join(str(x) for x in val)

    # If it's a dict -> flatten into readable string
    if isinstance(val, dict):
        flat = []
        for k, v in val.items():
            if isinstance(v, list):
                v = " ".join(str(x) for x in v)
            flat.append(f"{k}: {v}")
        return "; ".join(flat)

    # For any other weird type -> convert to string
    return str(val)



# ===============================
# 3. Rebuild Chroma
# ===============================
print("üîÑ Rebuilding Chroma with 768-dim embeddings...")

for item in data:

    # Compute 768 embedding
    new_emb = embedder.encode(item["description"])
    new_emb = np.array(new_emb, dtype=np.float32).tolist()

    # FIX metadata (no lists allowed)
    meta = {
        "title": fix_meta(item.get("title")),
        "description": fix_meta(item.get("description")),
        "domain": fix_meta(item.get("domain")),
        "year": fix_meta(item.get("year")),
        "objective": fix_meta(item.get("objective")),
        "tech_stack": fix_meta(item.get("tech_stack")),
        "source": fix_meta(item.get("source")),
    }


    collection.add(
        ids=[item["project_id"]],
        embeddings=[new_emb],
        metadatas=[meta],
        documents=[item["description"]],
    )

print("‚úÖ DONE ‚Äî Chroma rebuilt with SAFE 768-dim embeddings")
print("üìÅ New store:", NEW_CHROMA_PATH)
print("üìä Total items:", collection.count())


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


üì• Loading raw project data...
Loaded 101 projects
üîÑ Rebuilding Chroma with 768-dim embeddings...


Insert of existing embedding ID: auto001
Add of existing embedding ID: auto001
Insert of existing embedding ID: auto002
Add of existing embedding ID: auto002
Insert of existing embedding ID: auto003
Add of existing embedding ID: auto003
Insert of existing embedding ID: auto004
Add of existing embedding ID: auto004
Insert of existing embedding ID: auto005
Add of existing embedding ID: auto005
Insert of existing embedding ID: auto006
Add of existing embedding ID: auto006
Insert of existing embedding ID: auto007
Add of existing embedding ID: auto007
Insert of existing embedding ID: auto008
Add of existing embedding ID: auto008
Insert of existing embedding ID: auto009
Add of existing embedding ID: auto009
Insert of existing embedding ID: auto010
Add of existing embedding ID: auto010
Insert of existing embedding ID: auto011
Add of existing embedding ID: auto011
Insert of existing embedding ID: auto012
Add of existing embedding ID: auto012
Insert of existing embedding ID: auto013
Add of exis

‚úÖ DONE ‚Äî Chroma rebuilt with SAFE 768-dim embeddings
üìÅ New store: ./chroma_store_768
üìä Total items: 51


In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import numpy as np
import chromadb

load_dotenv()

# Backend config (.env)
MODEL_BACKEND = os.getenv("MODEL_BACKEND", "ollama")
MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2")
OLLAMA_API_BASE = os.getenv("OLLAMA_API_BASE", "http://localhost:11434")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "ollama")

# Configure OpenAI to talk to Ollama
os.environ["OPENAI_API_BASE"] = f"{OLLAMA_API_BASE}/v1"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

client_ai = OpenAI()

print(f"üîó LLM Backend: {MODEL_BACKEND} | Model: {MODEL_NAME}")


üîó LLM Backend: ollama | Model: llama3.2


In [2]:
import json
import re
import time
from openai import OpenAI

client = OpenAI()

# ------------------------------------------------------
#  Helper: find JSON region
# ------------------------------------------------------
def detect_json(text):
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1:
        return None
    return text[start:end+1]

# ------------------------------------------------------
#  Helper: repair malformed JSON
# ------------------------------------------------------
def clean_json_block(text):
    # remove code fences
    text = text.replace("```json", "").replace("```", "")

    # remove trailing commas
    text = re.sub(r",\s*}", "}", text)
    text = re.sub(r",\s*]", "]", text)

    # fix smart quotes
    text = text.replace("‚Äú", '"').replace("‚Äù", '"')

    # try parse normally
    try:
        return json.loads(text)
    except:
        # last_fix
        text = text.replace("'", '"')
        text = text.replace(",}", "}")
        text = text.replace(",]", "]")
        return json.loads(text)

# ------------------------------------------------------
#  llama3.2 request with multiple retries
# ------------------------------------------------------
def call_llama_json(prompt, retries=5):
    for attempt in range(retries):
        out = client.chat.completions.create(
            model="llama3.2",
            messages=[{"role": "user", "content": prompt}]
        )

        raw = out.choices[0].message.content

        # extract JSON
        json_blob = detect_json(raw)

        if json_blob:
            try:
                return clean_json_block(json_blob)
            except:
                pass  # continue retry

        time.sleep(1.2)

    # FINAL fallback if llama fails
    return {
        "titles": ["fallback title 1", "fallback title 2"],
        "descriptions": ["fallback description 1"]
    }

# ------------------------------------------------------
#  Normalize missing keys (avoid KeyError)
# ------------------------------------------------------
def normalize_result(a):
    if "titles" not in a or not isinstance(a["titles"], list):
        a["titles"] = ["fallback title 1"]

    if "descriptions" not in a or not isinstance(a["descriptions"], list):
        a["descriptions"] = ["fallback description 1"]

    # ensure all values are strings
    a["titles"] = [str(x) for x in a["titles"]]
    a["descriptions"] = [str(x) for x in a["descriptions"]]

    return a

# ------------------------------------------------------
#  Augment function
# ------------------------------------------------------
def augment_text(title, desc):
    prompt = f"""
Generate:
- 10 paraphrased titles
- 5 paraphrased descriptions

Return ONLY JSON:
{{
  "titles": ["t1","t2",...],
  "descriptions": ["d1","d2",...]
}}

Title: {title}
Description: {desc}
"""
    result = call_llama_json(prompt)
    return normalize_result(result)

# ------------------------------------------------------
#  MAIN PIPELINE (RUNS WITHOUT CRASHING)
# ------------------------------------------------------
orig = json.load(open("embeddings/projects-embedded.json"))
augmented = []

for item in orig:
    a = augment_text(item["title"], item["description"])

    # titles
    for t in a["titles"]:
        augmented.append({
            "project_id": item["project_id"] + "_t",
            "title": t,
            "description": item["description"],
            "domain": item["domain"]
        })

    # descriptions
    for d in a["descriptions"]:
        augmented.append({
            "project_id": item["project_id"] + "_d",
            "title": item["title"],
            "description": d,
            "domain": item["domain"]
        })

with open("augmented.json", "w", encoding="utf-8") as f:
    json.dump(augmented, f, indent=2, ensure_ascii=False)

print("‚úî Augmentation completed successfully.")


‚úî Augmentation completed successfully.


In [1]:
import chromadb
import numpy as np
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import numpy as np
import chromadb

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # 768-dim

client = chromadb.PersistentClient("./chroma_aug_store")
coll = client.create_collection("DBMS-25-AUG")

# FIX IS HERE üëá
with open("augmented.json", "r", encoding="utf-8") as f:
    aug = json.load(f)

for item in aug:
    emb = embedder.encode(item["title"]) 
    coll.add(
        ids=[item["project_id"]],
        embeddings=[emb.tolist()],
        metadatas=[item],
        documents=[item["title"]]
    )

print("AUG collection ready!")


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_t
Insert of existing embedding ID: auto001_t
Add of existing embedding ID: auto001_d
Insert of existing embedding ID: auto001_d
Add of existing embedding ID: auto001_d
Insert of exi

AUG collection ready!


In [2]:
# ===============================
# üîç FULL CHROMADB VALIDATION NOTEBOOK
# ===============================

import os
import json
import numpy as np
import chromadb

CHROMA_PATH = "./chroma_store_aug"
COLLECTION = "DBMS-25-AUG"

client = chromadb.PersistentClient(path=CHROMA_PATH)
coll = client.get_collection(COLLECTION)

print("====================================")
print("üîç Checking ChromaDB Status...")
print("====================================")

# -------------------------
# 1. Basic Info
# -------------------------
print(f"‚úî Collection: {coll.name}")
print(f"üì¶ Total Items: {coll.count()}")


# -------------------------
# 2. Fetch sample
# -------------------------
sample = coll.get(limit=1, include=["embeddings", "documents", "metadatas"])

print("\n=== SAMPLE RECORD (JSON-SAFE) ===")
safe_sample = json.loads(json.dumps(sample, default=lambda x: x.tolist() if isinstance(x, np.ndarray) else str(x), indent=2))
print(json.dumps(safe_sample, indent=2))


# -------------------------
# 3. Detect Embedding Dim (correct for ndarray, list, nested list)
# -------------------------
embs = sample.get("embeddings", None)
dim = None

if isinstance(embs, list) and len(embs) > 0:
    first = embs[0]

    if isinstance(first, np.ndarray):
        dim = first.shape[0]

    elif isinstance(first, list) and len(first) > 0 and isinstance(first[0], (float, int)):
        dim = len(first)

    elif isinstance(first, list) and len(first) > 0 and isinstance(first[0], list):
        dim = len(first[0])

print(f"\nüìê Embedding Dimension: {dim}")


# ===============================
# 4. FULL VALIDATION
# ===============================

print("\n====================================")
print("üîé FULL VALIDATION OF ALL EMBEDDINGS")
print("====================================")

records = coll.get(include=["embeddings"], limit=20000)

all_embs = records["embeddings"]
ids = records["ids"]

bad_count = 0
dims = set()

for idx, emb in enumerate(all_embs):
    id_ = ids[idx]

    # ndarray
    if isinstance(emb, np.ndarray):
        dims.add(emb.shape[0])

    # list of floats
    elif isinstance(emb, list) and len(emb) > 0 and isinstance(emb[0], (float, int)):
        dims.add(len(emb))

    # nested list
    elif isinstance(emb, list) and len(emb) > 0 and isinstance(emb[0], list):
        dims.add(len(emb[0]))

    else:
        print(f"‚ùå Bad embedding for ID: {id_}")
        bad_count += 1


print("\n=== VALIDATION REPORT ===")
print(f"‚úî Total embeddings checked: {len(all_embs)}")
print(f"üìê Unique embedding dimensions found: {dims}")
print(f"‚ùå Corrupted embeddings: {bad_count}")

if len(dims) == 1:
    print("üéâ Good news: All embeddings have a consistent dimension.")
else:
    print("‚ö†Ô∏è WARNING: Mixed embedding dimensions detected!")

print("\nüîç DB Check Complete.")


üîç Checking ChromaDB status...

‚úî Collection found: DBMS-25-AUG
üì¶ Total items: 102

=== SAMPLE RECORD (JSON-SAFE) ===
{
  "ids": [
    "auto001_t"
  ],
  "embeddings": [
    [
      -0.0740911141037941,
      0.10007232427597046,
      -0.06078357994556427,
      0.042467281222343445,
      -0.03277324140071869,
      -0.010189599357545376,
      0.06510160863399506,
      -0.05643374100327492,
      -0.03331286087632179,
      -0.037888526916503906,
      -0.018932858482003212,
      -0.004982047248631716,
      0.053329259157180786,
      0.0047050220891833305,
      -0.04416971281170845,
      0.0747058168053627,
      -0.01451728492975235,
      0.04346424341201782,
      -0.005094795022159815,
      -0.04766011983156204,
      -0.05374922975897789,
      -0.0029457523487508297,
      -0.0938207134604454,
      0.008722788654267788,
      0.017974702641367912,
      0.00849444605410099,
      0.04606930539011955,
      -0.03254694119095802,
      -0.04125959798693657,
      