# Tier 3: Semantic Detection Evaluation

Evaluates CodeBERT embeddings for Type-4 (semantic) clone detection.
**Note**: Loading the model and generating embeddings may be slow on CPU.

In [2]:
import sys
import os
import numpy as np
import json
import importlib

# Put the parent of the current working directory on the import path so the
# `app` package imported below can be resolved.
sys.path.append(os.path.abspath(".."))
# Set the Redis URL before any `app` module is imported.
# NOTE(review): presumably the app modules read REDIS_URL at import time - confirm.
os.environ["REDIS_URL"] = "redis://localhost:6379/0"

# Import the modules, then reload them so source edits made since the kernel
# started are picked up without a kernel restart.
from app.models import embedding_model
from app.pipeline import tier3_semantic
importlib.reload(embedding_model)
importlib.reload(tier3_semantic)
# Bind the names used by the rest of the notebook only after the reloads, so
# they come from the reloaded modules rather than the stale ones.
from app.models.embedding_model import embedder
from app.pipeline.tier3_semantic import tier3

Loading microsoft/codebert-base on CPU (no GPU detected)


2026-01-02 12:29:01.404997: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not hasattr(np, "object"):


ValueError: Could not find RobertaModel neither in <module 'transformers.models.roberta' from '/home/iamdasun/Projects/SLIIT/4YRG/gradeloop-core/.venv/lib/python3.12/site-packages/transformers/models/roberta/__init__.py'> nor in <module 'transformers' from '/home/iamdasun/Projects/SLIIT/4YRG/gradeloop-core/.venv/lib/python3.12/site-packages/transformers/__init__.py'>!

## 1. Embedding Generation
Generate and inspect embeddings for sample code.

In [None]:
# Sample A: factorial, recursive implementation.
code_A = """
public int factorial(int n) {
    if (n <= 1) return 1;
    return n * factorial(n - 1);
}
"""

# Sample B: factorial, iterative implementation (Type-4 clone of sample A).
code_B = """
public int factorial(int n) {
    int res = 1;
    for (int i = 2; i <= n; i++)
        res *= i;
    return res;
}
"""

# Sample C: unrelated functionality, used as the negative example.
code_C = """
public void printHello() {
    System.out.println("Hello World");
}
"""

# Embed the three snippets in order (A, B, C) and unpack the results.
emb_A, emb_B, emb_C = [
    embedder.generate_embedding(snippet)
    for snippet in (code_A, code_B, code_C)
]

print(f"Embedding Size: {len(emb_A)}")

Embedding Size: 768


## 2. Cosine Similarity Analysis
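Compare the embeddings pairwise using cosine similarity. The recursive/iterative factorial pair (a Type-4 clone) should score clearly higher than the factorial/unrelated pair.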

In [None]:
def cosine(v1, v2):
    """Return the cosine similarity between two equal-length 1-D vectors.

    Args:
        v1: First vector (any sequence or array accepted by ``np.dot``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. If either vector has zero norm
        the similarity is undefined; ``0.0`` is returned instead of ``nan``
        so downstream comparisons and formatting stay well-behaved.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    # Guard the 0/0 case: an all-zero vector would otherwise produce nan
    # together with a NumPy RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

# A vs. B is the semantic-clone pair; A vs. C is the unrelated pair.
sim_AB, sim_AC = cosine(emb_A, emb_B), cosine(emb_A, emb_C)

print(f"Similarity (Recursive vs Iterative): {sim_AB:.4f}")
print(f"Similarity (Factorial vs Hello): {sim_AC:.4f}")

# The clone pair must beat the unrelated pair by more than 0.1 to count as a
# clear separation; otherwise report that the model may need fine-tuning.
verdict = (
    "SUCCESS: Semantic model distinguishes functionality."
    if sim_AB > sim_AC + 0.1
    else "WARNING: Model may need fine-tuning."
)
print(verdict)

Similarity (Recursive vs Iterative): 0.9966
Similarity (Factorial vs Hello): 0.9820


## 3. Tier 3 Retrieval

In [None]:
candidates = [{"submission_id": "sub_B", "similarity": 0.4, "tier": 2}] # Low synth sim

import redis
r = redis.Redis.from_url("redis://localhost:6379/0")
r.set("emb:sub_B", json.dumps(emb_B))

results = await tier3.search("sub_A", code_A, candidates)
print(results)

[{'submission_id': 'sub_B', 'similarity': 0.6983037763136996, 'tier': 3, 'semantic_similarity': 0.9966075526273991}]
