In [1]:
import json
import os
import re
import warnings

import chromadb
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  _torch_pytree._register_pytree_node(


In [2]:
load_dotenv()

# Backend config (.env)
MODEL_BACKEND = os.getenv("MODEL_BACKEND", "ollama")
MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2")
OLLAMA_API_BASE = os.getenv("OLLAMA_API_BASE", "http://localhost:11434")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "ollama")

# Configure OpenAI to talk to Ollama.
# The v1 `OpenAI()` client reads OPENAI_BASE_URL, not the legacy
# OPENAI_API_BASE, so the endpoint is passed explicitly below. The environment
# variables are still exported for anything else that may read them.
OLLAMA_OPENAI_URL = f"{OLLAMA_API_BASE.rstrip('/')}/v1"
os.environ["OPENAI_API_BASE"] = OLLAMA_OPENAI_URL
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

client_ai = OpenAI(
    # An explicit OPENAI_BASE_URL from the environment/.env still wins,
    # matching what a bare `OpenAI()` would have honoured.
    base_url=os.getenv("OPENAI_BASE_URL") or OLLAMA_OPENAI_URL,
    api_key=OPENAI_API_KEY,
)

print(f"üîó LLM Backend: {MODEL_BACKEND} | Model: {MODEL_NAME}")


üîó LLM Backend: ollama | Model: llama3.2


In [4]:
CHROMA_PATH = "./chroma_aug_store"

# Open the on-disk Chroma store and look up the project collection by name.
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(name="DBMS-25")

# Sentence-embedding model used for every query/field embedding below.
embedder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

print("üì¶ Loaded ChromaDB & Embedding Model")


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


üì¶ Loaded ChromaDB & Embedding Model


In [5]:
def to_percent(x):
    """Scale a value by 100 and round it to two decimals.

    `x` is coerced with `float()`, so numeric strings and numpy scalars work.
    """
    percentage = 100 * float(x)
    return round(percentage, 2)


def compute_cosine(a, b):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Each vector is wrapped in a list so scikit-learn sees a single-row matrix;
    the only entry of the resulting 1x1 matrix is the similarity.
    """
    sim_matrix = cosine_similarity([a], [b])
    return float(sim_matrix[0, 0])


def bag_of_words_similarity(text1, text2):
    """Cosine similarity between the token-count vectors of two texts.

    Both texts are vectorised with one shared `CountVectorizer` vocabulary.

    Args:
        text1: First text; `None` is treated as an empty string.
        text2: Second text; `None` is treated as an empty string.

    Returns:
        A float similarity, or 0.0 when either text yields no tokens.
    """
    documents = [text1 or "", text2 or ""]
    try:
        # fit_transform builds the shared vocabulary and counts in one pass.
        counts = CountVectorizer().fit_transform(documents).toarray()
    except ValueError:
        # Raised for an empty vocabulary (no usable tokens in either text).
        return 0.0

    bow1, bow2 = counts
    norm1 = np.linalg.norm(bow1)
    norm2 = np.linalg.norm(bow2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return float(np.dot(bow1, bow2) / (norm1 * norm2))


def call_llama_similarity(query_text, db_text):
    """
    Ask the configured chat model (MODEL_NAME) for a contextual similarity score.

    Args:
        query_text: Text describing the user's project.
        db_text: Text describing the existing project to compare against.

    Returns:
        The first number found in the model's reply, clamped to 0-100.
        Returns 0.0 (with a warning) when the request fails or the reply
        contains no number.
    """
    prompt = f"""
Rate the similarity between the following two project descriptions.
Return ONLY a number between 0 and 100.

USER PROJECT:
{query_text}

EXISTING PROJECT:
{db_text}
"""

    try:
        resp = client_ai.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a similarity evaluator. Return only a number."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        # `content` can be None; normalise it to an empty string.
        reply = (resp.choices[0].message.content or "").strip()
    except Exception as exc:
        # Best-effort scoring: keep the pipeline running, but do not hide the
        # failure (and do not swallow KeyboardInterrupt/SystemExit).
        warnings.warn(f"Contextual similarity request failed: {exc}")
        return 0.0

    # Models often decorate the number ("Score: 42%"), so extract it instead
    # of requiring the whole reply to be a valid float.
    number = re.search(r"-?\d+(?:\.\d+)?", reply)
    if number is None:
        warnings.warn(f"No numeric similarity score in model reply: {reply!r}")
        return 0.0

    # Keep the score inside the 0-100 range requested by the prompt.
    return min(100.0, max(0.0, float(number.group())))

In [115]:
# Synopsis of the project under review. These five fields are embedded
# individually and as one combined text, and are also quoted in the LLM prompt.
title = "Smart Water Leakage Detection System"
description = "An IoT-based monitoring setup that detects pipe leaks using flow irregularity detection and instant notifications."
tech_stack = "ESP32, Flow Sensors, MQTT, Python"
domain = "IoT / Infrastructure"
objective = "Prevent water wastage by identifying leaks in real time."


In [116]:
# One embedding per synopsis field, each encoded on its own and stored as a
# plain list.
emb_title, emb_desc, emb_tech, emb_domain, emb_obj = (
    embedder.encode(field_text).tolist()
    for field_text in (title, description, tech_stack, domain, objective)
)

# Whole-synopsis embedding: all fields joined by single spaces.
combined_text = "{} {} {} {} {}".format(title, description, tech_stack, domain, objective)
emb_whole = embedder.encode(combined_text).tolist()

print("‚ú® Embeddings created.")



‚ú® Embeddings created.


In [117]:
TOP_K = 8

# Nearest-neighbour lookup using the whole-synopsis embedding.
res = collection.query(
    query_embeddings=[emb_whole],
    n_results=TOP_K,
    include=["metadatas", "documents", "distances", "embeddings"],
)

# Only one query embedding was sent, so take the first (only) result list
# of each field.
ids, docs, metas, stored_embs = (
    res[field][0] for field in ("ids", "documents", "metadatas", "embeddings")
)

print("üîç Retrieved matches from ChromaDB")



üîç Retrieved matches from ChromaDB


In [118]:

# Weights of the blended final score (they sum to 1.0).
W_WHOLE = 0.40    # whole-synopsis embedding similarity
W_FIELDS = 0.20   # mean of the five per-field embedding similarities
W_BOW = 0.15      # bag-of-words cosine similarity
W_CONTEXT = 0.25  # LLM contextual score (0-100, rescaled to 0-1)

results = []

for rid, meta, db_desc, stored_emb in zip(ids, metas, docs, stored_embs):
    # Metadata or document text may be missing for a record; fall back to
    # empty values instead of failing on None.
    meta = meta or {}
    db_desc = db_desc or ""
    db_title = meta.get("title") or ""
    db_domain = meta.get("domain") or ""
    db_tech = meta.get("tech_stack") or ""
    db_source = meta.get("source") or ""
    db_objective = meta.get("objective") or ""

    # Embeddings: the whole-record vector comes from the Chroma query result;
    # per-field vectors are computed here from the stored text.
    db_emb = np.array(stored_emb)
    db_title_emb = embedder.encode(db_title).tolist()
    db_desc_emb = embedder.encode(db_desc).tolist()
    db_tech_emb = embedder.encode(db_tech).tolist()
    db_domain_emb = embedder.encode(db_domain).tolist()
    db_objective_emb = embedder.encode(db_objective).tolist()

    # Similarities
    sim_title = compute_cosine(emb_title, db_title_emb)
    sim_desc = compute_cosine(emb_desc, db_desc_emb)
    sim_tech = compute_cosine(emb_tech, db_tech_emb)
    sim_domain = compute_cosine(emb_domain, db_domain_emb)
    sim_objective = compute_cosine(emb_obj, db_objective_emb)
    sim_whole = compute_cosine(emb_whole, db_emb)

    # BoW similarity
    bow_sim = bag_of_words_similarity(combined_text, db_desc)

    # LLM contextual similarity (0-100 scale)
    ctx_sim = call_llama_similarity(combined_text, db_desc)

    # Weighted final score
    final_score = (
        W_WHOLE * sim_whole +
        W_FIELDS * ((sim_title + sim_desc + sim_tech + sim_domain + sim_objective) / 5) +
        W_BOW * bow_sim +
        W_CONTEXT * (ctx_sim / 100)
    )

    results.append({
        "project_id": rid,
        "title": db_title,
        "domain": db_domain,
        "tech_stack": db_tech,
        # Previously read but never stored, so the saved report's "source"
        # field was always null.
        "source": db_source,
        "whole_similarity": to_percent(sim_whole),
        "title_similarity": to_percent(sim_title),
        "description_similarity": to_percent(sim_desc),
        "tech_similarity": to_percent(sim_tech),
        "domain_similarity": to_percent(sim_domain),
        "objective_similarity": to_percent(sim_objective),
        "bow_similarity": to_percent(bow_sim),
        # Already on a 0-100 scale, so it is stored without to_percent().
        "contextual_similarity": ctx_sim,
        "final_similarity": to_percent(final_score),
        "snippet": db_desc[:250]
    })


In [119]:

# Rank every scored match by its blended final similarity, best first.
results_sorted = list(results)
results_sorted.sort(key=lambda row: row["final_similarity"], reverse=True)

print("\n=== üéØ FINAL TOP-5 MATCHES ===\n")
for hit in results_sorted[:5]:
    # Build the four report lines, then emit them with a single print call.
    report_lines = [
        f"üìå {hit['title']}  |  Final Score: {hit['final_similarity']}%",
        f"   Whole: {hit['whole_similarity']}% | Title: {hit['title_similarity']}% | Desc: {hit['description_similarity']}%",
        f"   Domain: {hit['domain_similarity']}% | Tech: {hit['tech_similarity']}% | Obj: {hit['objective_similarity']}%",
        f"   BoW: {hit['bow_similarity']}% | Contextual: {hit['contextual_similarity']}%",
        f"   Snippet: {hit['snippet']}\n",
    ]
    print("\n".join(report_lines))



=== üéØ FINAL TOP-5 MATCHES ===

üìå Smart Farming and Agriculture  |  Final Score: 33.3%
   Whole: 33.41% | Title: 29.46% | Desc: 30.0%
   Domain: 81.44% | Tech: 34.81% | Obj: 15.74%
   BoW: 11.87% | Contextual: 42.0%
   Snippet: Another excellent idea to work for enhancing the agricultural sector. You can work on developing gadgets or technology that will help farmers have insights about the soil and climate and plan their farming methods accordingly. IoT devices like detect

üìå Rainfall Prediction system  |  Final Score: 30.53%
   Whole: 31.86% | Title: 30.72% | Desc: 24.76%
   Domain: 35.03% | Tech: 33.23% | Obj: 4.95%
   BoW: 12.58% | Contextual: 43.0%
   Snippet: This project aims to develop a model capable of predicting rainfall patterns in the future based on past meteorological data. Learners will get hands-on practice collecting and preprocessing meteorological data, using different machine learning algor

üìå Heart disease detection  |  Final Score: 29.82%
   Whole: 30

In [120]:
# Prompt preamble: evaluator role, then the student's synopsis field by field.
prompt_lines = [
    "You are an expert academic evaluator specializing in project similarity analysis and originality improvement.",
    "Your job is to compare the student's synopsis with top-5 similar projects and produce a clear, structured, point-wise analytical report.\n",
    "=== STUDENT PROJECT SYNOPSIS ===",
    f"‚Ä¢ Title: {title}",
    f"‚Ä¢ Description: {description}",
    f"‚Ä¢ Tech Stack: {tech_stack}",
    f"‚Ä¢ Domain: {domain}",
    f"‚Ä¢ Objective: {objective}\n",
    "=== TOP-5 MOST SIMILAR PROJECTS FROM DATABASE ===",
]

# One numbered section per top match, listing its similarity breakdown.
for rank, proj in enumerate(results_sorted[:5], start=1):
    prompt_lines.extend([
        f"\n{rank}. {proj['title']}",
        "   Similarity Breakdown:",
        f"   ‚Ä¢ Whole Similarity: {proj['whole_similarity']}%",
        f"   ‚Ä¢ Title Similarity: {proj['title_similarity']}%",
        f"   ‚Ä¢ Description Similarity: {proj['description_similarity']}%",
        f"   ‚Ä¢ Tech Stack Overlap: {proj['tech_similarity']}%",
        f"   ‚Ä¢ Objective Similarity: {proj['objective_similarity']}%",
        f"   ‚Ä¢ Domain Match: {proj['domain_similarity']}%",
        f"   ‚Ä¢ Contextual Match Score: {proj['contextual_similarity']}",
        f"   ‚Ä¢ Relevant Snippet: {proj['snippet']}\n",
    ])

# Fixed task instructions appended after the per-project sections. The text
# asks for a per-project analysis (why flagged, matching components,
# uniqueness suggestions) followed by general originality guidelines.
# NOTE(review): the template hard-codes "5" projects even when fewer matches
# were retrieved.
prompt_lines.append(
"""
=== TASK ===

For **each** of the 5 similar projects, generate a detailed section with the following structure:

1Ô∏è‚É£ **Why It Was Flagged as Similar (2‚Äì3 crisp bullet points)**  
   - Identify specific overlapping concepts (keywords, functional goals, methods, problem domain).  
   - Mention pattern-level or theme-level similarity (e.g., ‚Äúpredictive analytics‚Äù, ‚Äúdatabase CRUD operations‚Äù, etc).  

2Ô∏è‚É£ **Exact Matching Components (point-wise)**  
   - Title keyword overlaps  
   - Matching description phrases  
   - Shared technical stack components  
   - Similar objectives or problem statements  
   - Domain-level commonality  
   - Any contextual similarity patterns  

3Ô∏è‚É£ **Uniqueness Enhancement Suggestions (2 actionable changes)**  
   - Rewrite suggestions for making the student‚Äôs project more original  
   - Clear instructions to alter scope, architecture, features, or focus  
   - Recommend new angles, datasets, modules, or problem framing  

After covering all 5 projects, generate:

4Ô∏è‚É£ **GENERAL ORIGINALITY GUIDELINES (5 strong, practical points)**  
   - How to avoid common similarity traps  
   - How to differentiate a project in title, description, and objective  
   - Best practices for making academically original work  
   - Technical + conceptual strategies to reduce similarity  

Format the entire output cleanly:
‚Ä¢ Use headings  
‚Ä¢ Use bullet points  
‚Ä¢ Avoid long paragraphs  
‚Ä¢ Keep analysis deep but easy to scan  
"""
)

# Assemble the final prompt: one entry of `prompt_lines` per line.
llama_prompt = "\n".join(prompt_lines)

# ----- Call model -----
analysis_request = {
    "model": MODEL_NAME,
    "messages": [
        {"role": "system", "content": "You evaluate project similarities and rewrite suggestions."},
        {"role": "user", "content": llama_prompt},
    ],
    "temperature": 0.4,
}
try:
    completion = client_ai.chat.completions.create(**analysis_request)
    llama_analysis = completion.choices[0].message.content.strip()
except Exception as e:
    # Keep the failure as the analysis text so later cells still have a value.
    llama_analysis = f"LLAMA ERROR: {e}"

# ----- Print result -----
print("=== üß† LLM Similarity Breakdown & Suggestions ===\n", llama_analysis, sep="\n")


=== üß† LLM Similarity Breakdown & Suggestions ===

**Smart Water Leakage Detection System: Project Similarity Analysis and Originality Enhancement**

### 1. Smart Farming and Agriculture

#### Why It Was Flagged as Similar
‚Ä¢ **IoT devices**: Both projects involve the use of IoT devices (ESP32 in Smart Water Leakage Detection System and sensors in Smart Farming and Agriculture).
‚Ä¢ **Real-time monitoring**: Both projects focus on real-time monitoring and instant notifications.
‚Ä¢ **Data-driven insights**: Both projects aim to provide data-driven insights to improve efficiency and prevent waste.

#### Exact Matching Components
‚Ä¢ Title keyword overlap: "System" (Smart Water Leakage Detection System and Smart Farming and Agriculture)
‚Ä¢ Matching description phrases: "detect pipe leaks" and "enhance agricultural sector"
‚Ä¢ Shared technical stack components: ESP32, IoT devices
‚Ä¢ Similar objectives or problem statements: Prevent waste by identifying leaks in real time
‚Ä¢ Domain-l

In [121]:
import os
import json
from datetime import datetime

REPORT_DIR = "analysis_report"
os.makedirs(REPORT_DIR, exist_ok=True)

# ---------------------------
# Safe auto-numbering
# ---------------------------
# Reports are named json_<n>.json; the next report gets the highest existing
# <n> plus one (or 1 when the directory holds no numbered reports yet).
REPORT_PREFIX = "json_"
REPORT_SUFFIX = ".json"

nums = []
for existing_name in os.listdir(REPORT_DIR):
    if not (existing_name.startswith(REPORT_PREFIX) and existing_name.endswith(REPORT_SUFFIX)):
        continue
    stem = existing_name[len(REPORT_PREFIX):-len(REPORT_SUFFIX)]
    try:
        nums.append(int(stem))
    except ValueError:
        # Not a numbered report (e.g. "json_backup.json"); ignore it.
        continue

next_num = max(nums) + 1 if nums else 1

file_name = f"{REPORT_PREFIX}{next_num}{REPORT_SUFFIX}"
file_path = os.path.join(REPORT_DIR, file_name)

# ---------------------------
# Build the report
# ---------------------------
# The lookups use the keys actually written by the scoring loop
# ("project_id", "whole_similarity", "description_similarity"); the earlier
# names did not exist in `results`, so those report fields were always null.
analysis_report = {
    "query": title,
    "timestamp": str(datetime.now()),
    "results": [
        {
            "id": r.get("project_id"),
            "title": r.get("title"),
            "domain": r.get("domain"),
            "tech_stack": r.get("tech_stack"),
            "source": r.get("source"),
            "sim_whole": r.get("whole_similarity"),
            "sim_title": r.get("title_similarity"),
            "sim_description": r.get("description_similarity"),
            "sim_tech_stack": r.get("tech_similarity"),
            "sim_objective": r.get("objective_similarity", 0),
            # Additional scores so the saved ranking can be reproduced.
            "sim_domain": r.get("domain_similarity"),
            "sim_bow": r.get("bow_similarity"),
            "sim_contextual": r.get("contextual_similarity"),
            "sim_final": r.get("final_similarity"),
            "doc_snippet": r.get("snippet")
        }
        for r in results_sorted
    ],
    "ai_analysis": llama_analysis
}

# ---------------------------
# Save to file
# ---------------------------
with open(file_path, "w", encoding="utf-8") as report_file:
    json.dump(analysis_report, report_file, indent=4)

print(f"‚úÖ Saved ‚Üí {file_path}")


‚úÖ Saved ‚Üí analysis_report\json_14.json
