In [1]:
# --- STEP 1: Imports ---
import json
import os
import time
import random
import requests
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer


  from tqdm.autonotebook import tqdm, trange


In [2]:
# === STEP 2: Load environment variables ===
load_dotenv()

MODEL_BACKEND = os.getenv("MODEL_BACKEND", "ollama")  # 'ollama' or 'openai'
MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2")
OLLAMA_API_BASE = os.getenv("OLLAMA_API_BASE", "http://localhost:11434")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


print(f"✅ Backend: {MODEL_BACKEND} | Model: {MODEL_NAME}")

✅ Backend: ollama | Model: llama3.2


In [3]:
# === STEP 3: Load dataset ===
with open("projects-embedded.json", "r", encoding="utf-8") as f:
    base_projects = json.load(f)

print(f"📚 Loaded {len(base_projects)} projects")

📚 Loaded 101 projects


In [4]:
# === STEP 4: Load embedding model ===
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# === STEP 5: Unified LLM Call Function ===
def call_model(prompt: str) -> str:
    """Call either Ollama or OpenAI depending on backend selection."""
    try:
        if MODEL_BACKEND == "ollama":
            response = requests.post(
                f"{OLLAMA_API_BASE}/api/generate",
                json={"model": MODEL_NAME, "prompt": prompt},
                timeout=120
            )
            if response.status_code == 200:
                raw = response.text.strip()
                return raw.split(":", 1)[-1].strip() if ":" in raw else raw
            else:
                print(f"⚠️ Ollama Error {response.status_code}: {response.text[:100]}")
                return None

        elif MODEL_BACKEND == "openai":
            import openai
            openai.api_key = OPENAI_API_KEY
            resp = openai.ChatCompletion.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
            )
            return resp["choices"][0]["message"]["content"].strip()

        else:
            raise ValueError("❌ Invalid MODEL_BACKEND in .env")

    except Exception as e:
        print("🚫 Model call failed:", e)
        return None

In [7]:
# === STEP 6: Paraphrasing Function ===
def paraphrase_text(text, n=3):
    """Generate n paraphrased project descriptions."""
    paras = []
    for i in range(n):
        prompt = f"""
You are an expert technical writer. Paraphrase the following project description
to make it sound unique and professionally reworded while keeping the same meaning.
Avoid lists or markdown. Output only the rephrased text.

Original:
{text}

---
Paraphrased:
"""
        out = call_model(prompt)
        if out:
            paras.append(out.strip())
        time.sleep(1)
    return paras

# === STEP 7: Expand Dataset ===
synthetic_projects = []

for idx, proj in enumerate(base_projects, start=1):
    base_entry = proj.copy()
    synthetic_projects.append(base_entry)

    print(f"✨ Expanding: {proj['title']}")
    variations = paraphrase_text(proj["description"], n=3)

    for i, v in enumerate(variations, start=1):
        new_proj = proj.copy()
        new_proj["project_id"] = f"{proj['project_id']}_syn{i}"
        new_proj["description"] = v

        text_for_embed = f"{new_proj['title']} {v} {new_proj['tech_stack']} {new_proj['objective']}"
        new_proj["embedding"] = embedder.encode(text_for_embed).tolist()

        synthetic_projects.append(new_proj)

print(f"\n✅ Total dataset size: {len(synthetic_projects)}")

# === STEP 8: Save Final Dataset ===
with open("projects-embedding-augmented.json", "w", encoding="utf-8") as f:
    json.dump(synthetic_projects, f, indent=2)

print("💾 Saved as: projects-embedding-augmented.json")

✨ Expanding: Instagram Reach Analysis
✨ Expanding: Scraping laptop data from Amazon
✨ Expanding: Video Game Sales Prediction
✨ Expanding: Heart disease detection
✨ Expanding: Food order prediction
✨ Expanding: Contact tracing system
✨ Expanding: Sarcasm detection
✨ Expanding: Medical insurance price prediction
✨ Expanding: Credit card clustering
✨ Expanding: MNIST Data
✨ Expanding: Real time sentiment analysis
✨ Expanding: News recommendation system
✨ Expanding: Calories Burnt Prediction
✨ Expanding: Online Payment Fraud Detection
✨ Expanding: Rainfall Prediction system
✨ Expanding: Health and Fitness Tracking with Gamification
✨ Expanding: Human-Robot Interaction Interface
✨ Expanding: Fake News Detection System
✨ Expanding: E-commerce platform
✨ Expanding: Smart Traffic Light System
✨ Expanding: Library Management System
✨ Expanding: Building AI Chatbots
✨ Expanding: Image Recognition
✨ Expanding: Sign Language Recognition System
✨ Expanding: Recommendation System
✨ Expanding: Sentim

In [11]:
# --- Imports ---
import json
import chromadb
import numpy as np
from chromadb.config import Settings

# --- Initialize Chroma client ---
client = chromadb.Client(Settings(
    persist_directory="./chroma_store",  # local persistence
))

# --- Create or get collection ---
collection = client.get_or_create_collection(
    name="DBMS-RAG",
    metadata={"hnsw:space": "cosine"}  # cosine similarity works well for embeddings
)

# --- Load your dataset ---
with open("projects-embedding-augmented.json", "r", encoding="utf-8") as f:
    projects = json.load(f)

print(f"✅ Loaded {len(projects)} projects")

# --- Prepare data for insertion ---
ids = []
texts = []
embeddings = []
metadatas = []

for proj in projects:
    emb = proj.get("synthetic_embedding") or proj.get("embedding")
    if not emb:
        print(f"⚠️ Skipping {proj['project_id']} (no embedding found)")
        continue

    # Validate numeric list embeddings only
    if not isinstance(emb, list) or not all(isinstance(x, (int, float)) for x in emb):
        print(f"⚠️ Skipping {proj['project_id']} (invalid embedding format)")
        continue

    ids.append(proj["project_id"])
    texts.append(f"{proj['title']} - {proj['description']} - {proj.get('objective','')}")
    embeddings.append(emb)
    metadatas.append({
        "title": proj["title"],
        "domain": proj["domain"],
        "year": proj["year"],
        "tech_stack": proj["tech_stack"],
        "source": proj["source"]
    })

# --- Check dimension consistency ---
dims = [len(e) for e in embeddings]
if len(set(dims)) > 1:
    print(f"⚠️ Inconsistent embedding dimensions found: {set(dims)}")
    mode_dim = max(set(dims), key=dims.count)
    print(f"🔧 Normalizing all to {mode_dim}D...")
    embeddings = [e[:mode_dim] if len(e) > mode_dim else e + [0]*(mode_dim-len(e)) for e in embeddings]

print(f"✅ All embeddings normalized to {len(embeddings[0])} dimensions")

# --- Add data to ChromaDB ---
batch_size = 25
for i in range(0, len(ids), batch_size):
    collection.add(
        ids=ids[i:i+batch_size],
        documents=texts[i:i+batch_size],
        embeddings=embeddings[i:i+batch_size],
        metadatas=metadatas[i:i+batch_size]
    )
    print(f"📦 Added {min(i+batch_size, len(ids))} / {len(ids)}")

# --- Persistence handled automatically ---
print("💾 All data stored successfully in ChromaDB at ./chroma_store/ (auto-persisted)")

# --- Optional: Quick retrieval test ---
query = "analyze instagram reach using machine learning"
results = collection.query(
    query_texts=[query],
    n_results=3
)

print("\n🔍 Top Matches for Query:")
for idx, meta in enumerate(results["metadatas"][0]):
    print(f"{idx+1}. {meta['title']}  |  Domain: {meta['domain']}")



✅ Loaded 404 projects
⚠️ Inconsistent embedding dimensions found: {1024, 768}
🔧 Normalizing all to 768D...
✅ All embeddings normalized to 768 dimensions
📦 Added 25 / 404
📦 Added 50 / 404
📦 Added 75 / 404
📦 Added 100 / 404
📦 Added 125 / 404
📦 Added 150 / 404
📦 Added 175 / 404
📦 Added 200 / 404
📦 Added 225 / 404
📦 Added 250 / 404
📦 Added 275 / 404
📦 Added 300 / 404
📦 Added 325 / 404
📦 Added 350 / 404
📦 Added 375 / 404
📦 Added 400 / 404
📦 Added 404 / 404
💾 All data stored successfully in ChromaDB at ./chroma_store/ (auto-persisted)


C:\Users\sushm\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|█████| 79.3M/79.3M [00:14<00:00, 5.86MiB/s]


InvalidArgumentError: Collection expecting embedding with dimension of 768, got 384

In [12]:
from sentence_transformers import SentenceTransformer
import numpy as np

# --- load the same embedding model you used for dataset ---
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# --- sample query ---
query = "analyze instagram reach using machine learning"

# --- embed query to match stored vectors ---
query_vec = embedding_model.encode([query])[0].tolist()

# --- run query ---
results = collection.query(
    query_embeddings=[query_vec],
    n_results=3
)

print("\n🔍 Top Matches for Query:")
for idx, meta in enumerate(results["metadatas"][0]):
    print(f"{idx+1}. {meta['title']}  |  Domain: {meta['domain']}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


InvalidArgumentError: Collection expecting embedding with dimension of 768, got 384