# Tier 1: Lexical Detection Evaluation

Evaluates the hash-based (Type-1) and token-overlap (Type-2) detection tiers.
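**Prerequisite**: Redis must be running locally on port 6379. Note that the setup cell flushes Redis database 0 to start from a clean state, so do not run this notebook against an instance holding data you need.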

In [4]:
import sys
import os
import asyncio
import pandas as pd
from typing import List

sys.path.append(os.path.abspath(".."))
os.environ["REDIS_URL"] = "redis://localhost:6379/0"

from app.pipeline.tier1_lexical import tier1
from app.features.normalization import normalize_code_ast, tokenize_source
from app.models.submission import Submission

# Verify that Redis is reachable before running the evaluation.
# WARNING: flushdb() deletes every key in the selected database so the
# evaluation starts from an empty index; never point REDIS_URL at an
# instance that holds data you care about.
try:
    import redis
    # Reuse the URL exported via REDIS_URL above instead of repeating the
    # literal, so the database being flushed always matches the configured
    # one (presumably the one the tier1 pipeline connects to -- TODO confirm).
    r = redis.Redis.from_url(os.environ["REDIS_URL"])
    r.ping()
    print("Redis connection successful")
    r.flushdb()  # Clean start for eval
except Exception as e:
    # Best-effort check: report the problem and let the reader decide;
    # later cells that talk to Redis will fail if it is really unavailable.
    print(f"Warning: Redis not connected. {e}")

Redis connection successful


## 1. Prepare Test Data
Creating a set of synthetic Type-1 and Type-2 clones.

In [5]:
# Each entry is (submission id, source code, human-readable label).
# The label only documents the intent of the case; it is not passed
# to the Submission objects built below.
test_cases = [
    ("s1", "public void foo() { int x = 1; }", "Original"),
    ("s2", "public void foo() { int x = 1; }", "Type-1 (Exact)"),
    ("s3", "public void foo() { int y = 1; }", "Type-2 (Rename x->y)"),
    ("s4", "public void bar() { float z = 2.0; }", "Different")
]

# Wrap every case in a Submission; all cases share one student and assignment.
submissions = [
    Submission(id=case_id, student_id="student", assignment_id="a1", code=source)
    for case_id, source, _description in test_cases
]

print("Test Cases Prepared")

Test Cases Prepared


## 2. Indexing Performance
Measure time to index submissions.

In [6]:
import time

start = time.time()

async def index_all():
    for sub in submissions:
        norm = normalize_code_ast(sub.code)
        tokens = tokenize_source(norm)
        await tier1.index_submission(sub, tokens)

await index_all()
end = time.time()

print(f"Indexing Time: {end - start:.4f}s")

Indexing Time: 0.0215s


## 3. Query & Precision Evaluation
Query with a Type-2 clone and check results.

In [7]:
query_code = "public void foo() { int z = 1; }" # Type-2 of s1

norm = normalize_code_ast(query_code)
tokens = tokenize_source(norm)

results = await tier1.search(tokens)
print("Query Results:")
for r in results:
    print(r)

# Evaluation Metric: Recall
# Expecting s1, s2, s3 to be in results (since they normalize similarly)
expected_ids = {'s1', 's2', 's3'}
retrieved_ids = {r['submission_id'] for r in results}

recall = len(expected_ids.intersection(retrieved_ids)) / len(expected_ids)
print(f"Recall: {recall:.2f}")

Query Results:
{'submission_id': 's1', 'similarity': 1.0, 'tier': 1, 'type': 'Type-1'}
{'submission_id': 's2', 'similarity': 1.0, 'tier': 1, 'type': 'Type-1'}
{'submission_id': 's3', 'similarity': 1.0, 'tier': 1, 'type': 'Type-1'}
{'submission_id': 's4', 'similarity': 0.8888888888888888, 'tier': 1, 'type': 'Candidate'}
Recall: 1.00
