In [1]:
# --- STEP 1: Imports ---
import json
import os
import time
import random
import requests
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer


  from tqdm.autonotebook import tqdm, trange


In [2]:
# === STEP 2: Load environment variables ===
load_dotenv()

MODEL_BACKEND = os.getenv("MODEL_BACKEND", "ollama")  # 'ollama' or 'openai'
MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2")
OLLAMA_API_BASE = os.getenv("OLLAMA_API_BASE", "http://localhost:11434")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


print(f"✅ Backend: {MODEL_BACKEND} | Model: {MODEL_NAME}")

✅ Backend: ollama | Model: llama3.2


In [3]:
# === STEP 3: Load dataset ===
with open("projects-embedded.json", "r", encoding="utf-8") as f:
    base_projects = json.load(f)

print(f"📚 Loaded {len(base_projects)} projects")

📚 Loaded 101 projects


In [4]:
# === STEP 4: Load embedding model ===
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# === STEP 5: Unified LLM Call Function ===
def call_model(prompt: str) -> str:
    """Call either Ollama or OpenAI depending on backend selection."""
    try:
        if MODEL_BACKEND == "ollama":
            response = requests.post(
                f"{OLLAMA_API_BASE}/api/generate",
                json={"model": MODEL_NAME, "prompt": prompt},
                timeout=120
            )
            if response.status_code == 200:
                raw = response.text.strip()
                return raw.split(":", 1)[-1].strip() if ":" in raw else raw
            else:
                print(f"⚠️ Ollama Error {response.status_code}: {response.text[:100]}")
                return None

        elif MODEL_BACKEND == "openai":
            import openai
            openai.api_key = OPENAI_API_KEY
            resp = openai.ChatCompletion.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
            )
            return resp["choices"][0]["message"]["content"].strip()

        else:
            raise ValueError("❌ Invalid MODEL_BACKEND in .env")

    except Exception as e:
        print("🚫 Model call failed:", e)
        return None

In [7]:
# === STEP 6: Paraphrasing Function ===
def paraphrase_text(text, n=3):
    """Generate n paraphrased project descriptions."""
    paras = []
    for i in range(n):
        prompt = f"""
You are an expert technical writer. Paraphrase the following project description
to make it sound unique and professionally reworded while keeping the same meaning.
Avoid lists or markdown. Output only the rephrased text.

Original:
{text}

---
Paraphrased:
"""
        out = call_model(prompt)
        if out:
            paras.append(out.strip())
        time.sleep(1)
    return paras

# === STEP 7: Expand Dataset ===
synthetic_projects = []

for idx, proj in enumerate(base_projects, start=1):
    base_entry = proj.copy()
    synthetic_projects.append(base_entry)

    print(f"✨ Expanding: {proj['title']}")
    variations = paraphrase_text(proj["description"], n=3)

    for i, v in enumerate(variations, start=1):
        new_proj = proj.copy()
        new_proj["project_id"] = f"{proj['project_id']}_syn{i}"
        new_proj["description"] = v

        text_for_embed = f"{new_proj['title']} {v} {new_proj['tech_stack']} {new_proj['objective']}"
        new_proj["embedding"] = embedder.encode(text_for_embed).tolist()

        synthetic_projects.append(new_proj)

print(f"\n✅ Total dataset size: {len(synthetic_projects)}")

# === STEP 8: Save Final Dataset ===
with open("projects-embedding-augmented.json", "w", encoding="utf-8") as f:
    json.dump(synthetic_projects, f, indent=2)

print("💾 Saved as: projects-embedding-augmented.json")

✨ Expanding: Instagram Reach Analysis
✨ Expanding: Scraping laptop data from Amazon
✨ Expanding: Video Game Sales Prediction
✨ Expanding: Heart disease detection
✨ Expanding: Food order prediction
✨ Expanding: Contact tracing system
✨ Expanding: Sarcasm detection
✨ Expanding: Medical insurance price prediction
✨ Expanding: Credit card clustering
✨ Expanding: MNIST Data
✨ Expanding: Real time sentiment analysis
✨ Expanding: News recommendation system
✨ Expanding: Calories Burnt Prediction
✨ Expanding: Online Payment Fraud Detection
✨ Expanding: Rainfall Prediction system
✨ Expanding: Health and Fitness Tracking with Gamification
✨ Expanding: Human-Robot Interaction Interface
✨ Expanding: Fake News Detection System
✨ Expanding: E-commerce platform
✨ Expanding: Smart Traffic Light System
✨ Expanding: Library Management System
✨ Expanding: Building AI Chatbots
✨ Expanding: Image Recognition
✨ Expanding: Sign Language Recognition System
✨ Expanding: Recommendation System
✨ Expanding: Sentim

In [5]:
from sentence_transformers import SentenceTransformer
import chromadb
import json

# --- Load model (384 dimensions) ---
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# --- Load your dataset ---
with open("projects-embedding-augmented.json", "r", encoding="utf-8") as f:
    projects = json.load(f)

# --- Initialize ChromaDB ---
client = chromadb.PersistentClient(path="./chroma_store")
collection = client.get_or_create_collection(name="DBMS-25")

# --- Prepare and store ---
ids, texts, metadatas, embeddings = [], [], [], []

for i, proj in enumerate(projects):
    text = f"{proj['title']} - {proj['description']} - {proj.get('objective', '')}"
    emb = embedder.encode(text).tolist()

    ids.append(str(i))
    texts.append(text)
    embeddings.append(emb)
    metadatas.append({
        "title": proj["title"],
        "domain": proj["domain"],
        "tech_stack": proj["tech_stack"],
        "source": proj["source"]
    })

collection.add(
    ids=ids,
    documents=texts,
    embeddings=embeddings,
    metadatas=metadatas
)

print(f"✅ Stored {len(ids)} projects in ChromaDB (384-dim embeddings).")


✅ Stored 404 projects in ChromaDB (384-dim embeddings).


In [9]:
# --- Load same embedding model ---
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# --- Connect to ChromaDB ---
client = chromadb.PersistentClient(path="./chroma_store")
collection = client.get_collection("DBMS-25")

# --- Query ---
query = "Disaster-News-Auth-Validate "
query_embedding = embedder.encode(query).tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# --- Display Results ---
print("\n🔍 Top 3 Matches for Query:")
for idx, meta in enumerate(results["metadatas"][0]):
    print(f"\n[{idx+1}] {meta['title']}")
    print(f"Similarity Score: {results['distances'][0][idx]:.4f}")
    print(f"Domain: {meta['domain']}")
    print(f"Tech Stack: {meta['tech_stack']}")
    print(f"Source: {meta['source']}")



🔍 Top 3 Matches for Query:

[1] Fake News Detection System
Similarity Score: 1.2879
Domain: Artificial Intelligence / Data Science
Tech Stack: Python, TensorFlow, PyTorch, NLTK, spaCy, Keras, Pandas, NumPy
Source: ISE-dept

[2] Fake News Detection System
Similarity Score: 1.2883
Domain: Artificial Intelligence / Data Science
Tech Stack: Python, TensorFlow, PyTorch, NLTK, spaCy, Keras, Pandas, NumPy
Source: ISE-dept

[3] Fake News Detection System
Similarity Score: 1.3045
Domain: Artificial Intelligence / Data Science
Tech Stack: Python, TensorFlow, PyTorch, NLTK, spaCy, Keras, Pandas, NumPy
Source: ISE-dept
