In [2]:
# ============================
# 1. Imports & Environment
# ============================

import os
import json
import numpy as np
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

from openai import OpenAI

import google.generativeai as genai

# Load variables from a local .env file (if present) into the process
# environment so the os.getenv() lookups in the following cells can see them.
load_dotenv()


  _torch_pytree._register_pytree_node(


True

In [3]:
# ==============================================
# 2. Configure Model Backend (Ollama / OpenAI)
# ==============================================
#
# Every setting is read from the environment, with defaults that target a
# local Ollama server exposing its OpenAI-compatible API under /v1.

MODEL_BACKEND = os.getenv("MODEL_BACKEND", "ollama")   # "ollama" or "openai"
MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2")
OLLAMA_API_BASE = os.getenv("OLLAMA_API_BASE", "http://localhost:11434")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "ollama")  # dummy for ollama

if MODEL_BACKEND.lower() == "ollama":
    # Tolerate a trailing slash in OLLAMA_API_BASE so the URL never ends in "//v1".
    ollama_base_url = f"{OLLAMA_API_BASE.rstrip('/')}/v1"

    # Kept for tools that still read the legacy variable names.
    os.environ["OPENAI_API_BASE"] = ollama_base_url
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    print(f"ðŸ”— Using Ollama model: {MODEL_NAME}")

    # The v1 OpenAI client does not read OPENAI_API_BASE (it looks at
    # OPENAI_BASE_URL), so hand it the endpoint and key explicitly instead of
    # relying on the environment.
    client_ai = OpenAI(base_url=ollama_base_url, api_key=OPENAI_API_KEY)
else:
    print(f"ðŸ”— Using cloud backend: {MODEL_NAME}")

    # Cloud backend: the client picks up its credentials from the environment.
    client_ai = OpenAI()


ðŸ”— Using Ollama model: llama3.2


In [4]:
# ======================================
# 3. Load ChromaDB & Embedding Model
# ======================================

# Location of the persisted Chroma store on disk.
# NOTE(review): the "768" suffix suggests 768-dimensional vectors, while
# all-MiniLM-L6-v2 (loaded below) is documented as a 384-dimensional model.
# Confirm which model was used to build this collection.
CHROMA_PATH = "./chroma_store_768"

client_chroma = chromadb.PersistentClient(path=CHROMA_PATH)

# Show what the store contains before picking a collection.
print("ðŸ“¦ Existing Collections:")
available_collections = client_chroma.list_collections()
for existing in available_collections:
    print(" -", existing.name)

# Collection that the later cells query.
collection = client_chroma.get_collection("DBMS-25")

# Sentence-transformer used to embed the query and the per-field texts.
embedder = SentenceTransformer("all-MiniLM-L6-v2")


ðŸ“¦ Existing Collections:
 - DBMS-25


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [5]:
# ==========================
# 4. User Query
# ==========================

# Number of nearest neighbours requested from the vector store.
TOP_K = 10

# Free-text project idea to compare against the stored projects.
# NOTE(review): "aalysis" looks like a typo for "analysis" - confirm whether
# the misspelling is intentional before changing it (it affects the embedding).
query = "instagram reach aalysis"

# Encode once and convert to a plain list so it can be passed to Chroma.
query_vector = embedder.encode(query)
query_emb = query_vector.tolist()


In [6]:
# ===================================
# 5. Query ChromaDB for Similar Docs
# ===================================

# Ask for the TOP_K nearest stored items, including their raw embeddings so
# similarities can be recomputed in a later cell.
res = collection.query(
    n_results=TOP_K,
    query_embeddings=[query_emb],
    include=["metadatas", "documents", "distances", "embeddings"],
)

# A single query vector was sent, so each field holds exactly one result list;
# take element 0 of every field.
ids, docs, metas, distances, stored_embs = (
    res[field][0]
    for field in ("ids", "documents", "metadatas", "distances", "embeddings")
)

def to_percent(x):
    """Return ``x`` scaled by 100 and rounded to two decimal places.

    ``x`` is passed through ``float()`` first, so anything ``float()`` accepts
    (ints, NumPy scalars, numeric strings) is allowed.
    """
    scaled = float(x) * 100
    return round(scaled, 2)


Add of existing embedding ID: auto001
Add of existing embedding ID: auto002
Add of existing embedding ID: auto003
Add of existing embedding ID: auto004
Add of existing embedding ID: auto005
Add of existing embedding ID: auto006
Add of existing embedding ID: auto007
Add of existing embedding ID: auto008
Add of existing embedding ID: auto009
Add of existing embedding ID: auto010
Add of existing embedding ID: auto011
Add of existing embedding ID: auto012
Add of existing embedding ID: auto013
Add of existing embedding ID: auto014
Add of existing embedding ID: auto015
Add of existing embedding ID: auto001
Add of existing embedding ID: auto002
Add of existing embedding ID: auto003
Add of existing embedding ID: auto004
Add of existing embedding ID: auto005
Add of existing embedding ID: auto006
Add of existing embedding ID: auto007
Add of existing embedding ID: auto008
Add of existing embedding ID: auto009
Add of existing embedding ID: auto010
Add of existing embedding ID: auto001
Add of exist

In [7]:
# ========================================
# 6. Compute Similarities with Sentence Transformers
# ========================================

def enc(x):
    """Embed ``x`` with the notebook's embedder; return None for empty/missing text."""
    return embedder.encode(x).tolist() if x else None


def similarity_to_query(vector):
    """Cosine similarity between the query embedding and ``vector`` as a plain float.

    A missing vector (None) scores 0.0, so every similarity in ``results`` has
    the same type whether or not the underlying field was present.
    """
    if vector is None:
        return 0.0
    return float(cosine_similarity([query_emb], [vector])[0][0])


results = []

# One record per retrieved item. The helpers above are defined once instead of
# being re-created on every loop iteration.
for rid, raw_meta, raw_doc, stored_emb in zip(ids, metas, docs, stored_embs):

    meta = raw_meta or {}          # metadata may be missing for an item
    title = meta.get("title", "")
    domain = meta.get("domain", "")
    tech_stack = meta.get("tech_stack", "")
    source = meta.get("source", "")
    desc = raw_doc or ""           # document text may be missing as well
    objective = meta.get("objective", "")

    results.append({
        "id": rid,
        "title": title,
        "domain": domain,
        "tech_stack": tech_stack,
        "source": source,
        # similarity against the embedding stored in Chroma for the whole item
        "sim_whole": similarity_to_query(stored_emb),
        # per-field similarities, re-embedded here with the same model as the query
        "sim_title": similarity_to_query(enc(title)),
        "sim_description": similarity_to_query(enc(desc)),
        "sim_tech_stack": similarity_to_query(enc(tech_stack)),
        "sim_objective": similarity_to_query(enc(objective)),
        "doc_snippet": desc[:300]
    })


In [8]:
# =========================================
# 7. Deduplicate by Title -> Keep Top 5
# =========================================

# Rank every hit by whole-document similarity, best first.
ranked_hits = sorted(results, key=lambda hit: hit["sim_whole"], reverse=True)

# setdefault only stores the first hit seen for a title, which - given the
# ordering above - is the highest-scoring one.
unique = {}
for hit in ranked_hits:
    unique.setdefault(hit["title"], hit)

# Dicts preserve insertion order, so the first five values are the top five.
final_results = list(unique.values())[:5]

print("\n=== ðŸŽ¯ Final Unique Top-5 ===\n")
for rank, hit in enumerate(final_results, start=1):
    score = to_percent(hit['sim_whole'])
    print(f"{rank}. {hit['title']} ({score}%)")



=== ðŸŽ¯ Final Unique Top-5 ===

1. Rainfall Prediction system (27.09%)
2. To analyse the implications of deploying autonomous drones (25.02%)
3. Weather Forecasting Application (22.7%)
4. Predictive Analysis Tool (22.22%)
5. Medical insurance price prediction (22.06%)


In [12]:
# =============================================
# 8. Build Enhanced Prompt for Llama3.2
# =============================================

# Header: the user's query plus the number of projects actually listed below.
# Deduplication can leave fewer than five results, so the count is taken from
# final_results rather than hard-coded.
prompt_lines = [
    f"Query: {query}",
    f"The system retrieved the {len(final_results)} most similar projects.\n",
]

# One numbered entry per retrieved project: title, domain, similarity
# percentages and a short snippet of the stored description.
for idx, r in enumerate(final_results, start=1):
    prompt_lines.append(f"{idx}. {r['title']}")
    prompt_lines.append(f"   - Domain: {r['domain']}")
    prompt_lines.append(
        f"   - Similarities: Overall {to_percent(r['sim_whole'])}%, "
        f"Title {to_percent(r['sim_title'])}%, Description {to_percent(r['sim_description'])}%, Tech {to_percent(r['sim_tech_stack'])}%"
    )
    prompt_lines.append(f"   - Snippet: {r['doc_snippet']}\n")

# Task instructions for the model, appended after the project list.
prompt_lines.append(
"""
Act as a project evaluation assistant.

For **each** project:
1. Explain briefly why it matched.
2. Identify exact overlapping parts.
3. Provide 2 suggestions to make the project idea more original.

Finally:
Give 3 general originality improvement tips.
"""
)

llama_prompt = "\n".join(prompt_lines)


In [13]:
# =============================================
# 9. Generate Explanation Using Llama3.2 (via Ollama/OpenAI API)
# =============================================

print("\nðŸ§  Generating Llama3.2 similarity analysis...\n")

# Send the prompt built in the previous cell to the configured chat model.
completion = client_ai.chat.completions.create(
    model=MODEL_NAME,
    messages=[
        {"role": "system", "content": "You are an academic evaluator and writing assistant."},
        {"role": "user", "content": llama_prompt}
    ],
    temperature=0.5
)

# message.content is optional in the chat-completions response; fall back to an
# empty string so .strip() cannot fail on None.
llama_summary = (completion.choices[0].message.content or "").strip()

print(llama_summary)



ðŸ§  Generating Llama3.2 similarity analysis...

I'll evaluate each project and provide feedback.

**Project 1: Scraping laptop data from Amazon**

1. Why it matched: The project's title, description, and tech domain are similar to the query "web scrapping amazon deals", indicating that it involves extracting data from a website using web scraping techniques.
2. Exact overlapping parts: None found.
3. Suggestions for originality improvement:
	* Add a twist by focusing on a specific type of laptop data (e.g., gaming laptops, budget laptops) or exploring the use of machine learning algorithms to improve data extraction accuracy.
	* Incorporate a unique aspect, such as using natural language processing techniques to extract reviews or ratings from Amazon product pages.

**Project 2: Extracting Laptop Data from Amazon**

1. Why it matched: Similar to Project 1, this project's title and description are similar to the query "web scrapping amazon deals".
2. Exact overlapping parts: None foun

In [None]:
# =============================================
# 10. Use GEMINI for Enhanced Summary (Optional)
# =============================================

GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
# Model name can be overridden through the environment; the default is unchanged.
GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL", "gemini-2.5-pro")

# Stays None when the optional step is skipped or Gemini returns no text.
gemini_summary = None

if not GOOGLE_API_KEY:
    # This step is optional: without a key, skip it instead of failing later
    # inside the API call.
    print("GEMINI_API_KEY is not set; skipping the optional Gemini summary.")
else:
    genai.configure(api_key=GOOGLE_API_KEY)
    gmodel = genai.GenerativeModel(GEMINI_MODEL_NAME)

    print("\nðŸ§  Generating Gemini summary...\n")

    # Reuse the same prompt that was sent to the Llama model.
    response = gmodel.generate_content(llama_prompt)

    try:
        gemini_summary = response.text.strip()
    except ValueError as err:
        # response.text raises ValueError when the response carries no text
        # part (for example a blocked or empty candidate).
        print(f"Gemini returned no text: {err}")
    else:
        print(gemini_summary)
