# Data Processing & Normalization Evaluation

This notebook evaluates the performance and quality of the text normalization and tokenization steps, which are critical for Tier 1 detection.

In [None]:
import sys
import os
import pandas as pd
import time
import matplotlib.pyplot as plt

# Add parent directory to path to import app modules
sys.path.append(os.path.abspath("../.."))

from apps.cipas.app.features.normalization import normalize_code_ast, tokenize_source

## 1. Load Dataset
Load up to 1,000 samples from a local BigCloneBench parquet file, falling back to a small mock dataset when the file is not available.

In [None]:
# Load a bounded sample from the local BigCloneBench parquet when it is present;
# otherwise fall back to a tiny in-memory mock so the notebook still runs end to end.
parquet_path = "../datasets/raw/bigclonebench_400k.parquet"

# Check for the file up front rather than waiting for read_parquet to raise
# FileNotFoundError, so the mock fallback does not depend on how (or whether)
# the parquet reader gets as far as opening the path.
if os.path.exists(parquet_path):
    df = pd.read_parquet(parquet_path).head(1000)
    print(f"Loaded {len(df)} samples from {parquet_path}")
    # Assuming column 'code' or 'func_code' exists. Adjust as needed.
    if 'code' not in df.columns and 'func_code' in df.columns:
        df['code'] = df['func_code']
else:
    print("Dataset not found. Using mock data.")
    data = [
        {"id": "1", "code": "public void test() { int a = 10; System.out.println(a); }"},
        {"id": "2", "code": "public void test2() { int b = 20; System.out.println(b); }"},
        {"id": "3", "code": "// Comment\npublic int add(int x, int y) { return x + y; }"}
    ]
    df = pd.DataFrame(data)

# Fail fast with an actionable message: every later cell reads df['code'], and
# without this check a schema mismatch only surfaces there as a bare KeyError.
if 'code' not in df.columns:
    raise KeyError(
        f"Expected a 'code' or 'func_code' column; found columns: {list(df.columns)}"
    )

print(df.head())

## 2. Normalization Performance
Measuring the time taken to normalize code. **TODO:** the reduction in vocabulary size is not yet computed in this section.

In [None]:
# Time one full AST-normalization pass over every snippet in df['code'].
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-run.
start_time = time.perf_counter()

df['normalized_code'] = df['code'].apply(normalize_code_ast)

end_time = time.perf_counter()
elapsed = end_time - start_time
print(f"Normalization Time for {len(df)} items: {elapsed:.4f} seconds")
# An empty frame has no meaningful per-item average; avoid ZeroDivisionError.
if len(df) > 0:
    print(f"Average per item: {elapsed / len(df):.4f} seconds")
else:
    print("Average per item: n/a (no items)")

### Example Normalization

In [None]:
# Print the first snippet twice: as loaded, then after normalization.
for heading, column in (("Original:", 'code'), ("\nNormalized:", 'normalized_code')):
    print(heading)
    print(df.iloc[0][column])

## 3. Tokenization Statistics

In [None]:
# Tokenize every normalized snippet and record how many tokens each one yields.
df['tokens'] = df['normalized_code'].apply(tokenize_source)
df['num_tokens'] = df['tokens'].apply(len)

# Histogram of per-snippet token counts, drawn through the explicit Axes API
# instead of the implicit pyplot "current figure" state.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['num_tokens'], bins=30, alpha=0.7)
ax.set_title('Distribution of Token Counts after Normalization')
ax.set_xlabel('Number of Tokens')
ax.set_ylabel('Frequency')
plt.show()