# Tier 3: Semantic Detection Evaluation

Evaluates CodeBERT embeddings for Type-4 (semantic) clone detection.
**Note**: Embedding generation may be slow on CPU.

In [None]:
import sys
import os
import numpy as np
import json

# Make the directory two levels above the current working directory
# importable so the `apps.cipas...` imports below can be resolved.
sys.path.append(os.path.abspath("../.."))
# Set before the project imports below; presumably those modules read
# REDIS_URL when they are imported -- TODO confirm.
os.environ["REDIS_URL"] = "redis://localhost:6379/0"

from apps.cipas.app.models.embedding_model import embedder
from apps.cipas.app.pipeline.tier3_semantic import tier3

## 1. Embedding Generation
Generate and inspect embeddings for sample code.

In [None]:
# Sample A: factorial, recursive formulation.
code_A = """
public int factorial(int n) {
    if (n <= 1) return 1;
    return n * factorial(n - 1);
}
"""

# Sample B: factorial, iterative formulation (a Type-4 clone of sample A).
code_B = """
public int factorial(int n) {
    int res = 1;
    for (int i = 2; i <= n; i++)
        res *= i;
    return res;
}
"""

# Sample C: functionally unrelated to A and B.
code_C = """
public void printHello() {
    System.out.println("Hello World");
}
"""

# Embed the three snippets in order (A, B, C) and unpack the results.
emb_A, emb_B, emb_C = [
    embedder.generate_embedding(snippet)
    for snippet in (code_A, code_B, code_C)
]

# Report the length of one embedding as a quick sanity check.
print(f"Embedding Size: {len(emb_A)}")

## 2. Cosine Similarity Analysis
Compare the embeddings.

In [None]:
def cosine(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1: First vector (any 1-D sequence accepted by ``np.dot``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead
        of NaN so that downstream comparisons stay meaningful.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    # Guard against division by zero: a zero-length vector has no direction,
    # and NaN would silently fail every ``>`` / ``<`` comparison.
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

# Similarity of the clone pair (A, B) and of the unrelated pair (A, C).
sim_AB, sim_AC = cosine(emb_A, emb_B), cosine(emb_A, emb_C)

print(f"Similarity (Recursive vs Iterative): {sim_AB:.4f}")
print(f"Similarity (Factorial vs Hello): {sim_AC:.4f}")

# The clone pair must beat the unrelated pair by a margin of more than 0.1
# to count as clearly separated.
print(
    "SUCCESS: Semantic model distinguishes functionality."
    if sim_AB > sim_AC + 0.1
    else "WARNING: Model may need fine-tuning."
)

## 3. Tier 3 Retrieval

In [None]:
candidates = [{"submission_id": "sub_B", "similarity": 0.4, "tier": 2}] # Low synth sim

import redis
r = redis.Redis.from_url("redis://localhost:6379/0")
r.set("emb:sub_B", json.dumps(emb_B))

results = await tier3.search("sub_A", code_A, candidates)
print(results)