# 📊 LLM Post-Filter Experiment: Data Extraction

**Focus**: Extract GLITCH detections from baseline experiments for LLM evaluation.

## 🎯 Goals:
1. Extract all true-positive (TP) and false-positive (FP) GLITCH detections from the Chef and Puppet experiments
2. Prepare a clean dataset for the LLM post-filtering pipeline
3. Generate summary statistics for baseline performance

## 📁 Data Sources:
- **Baseline Results**: Chef and Puppet static analysis experiments
- **Target Smells**: Hard-coded secret, Suspicious comment, Weak cryptography


## 🔧 Setup and Imports


In [None]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path

# The llm_postfilter package is imported from <project_root>/src, where the
# project root is taken to be three directory levels above the current
# working directory.
working_dir = Path.cwd()
project_root = working_dir.parent.parent.parent
src_dir = project_root / "src"
sys.path.append(str(src_dir))

from llm_postfilter.data_extractor import GLITCHDetectionExtractor

print(f"Project root: {project_root}")
print(f"Working directory: {working_dir}")

# Build the extractor that the following cells use to pull detections.
extractor = GLITCHDetectionExtractor(project_root)
print("✅ Data extractor initialized successfully!")

## 📊 Extract Chef Detections


In [None]:
print("🔍 Extracting Chef detections...")

# chef_detections maps each smell name to a sequence of detection records;
# every record exposes an 'is_true_positive' entry.
chef_detections = extractor.extract_detections_for_llm('chef')

# Per-smell totals split into true and false positives.
print("\n📊 Chef Detection Summary:")
for smell_name, smell_hits in chef_detections.items():
    n_total = len(smell_hits)
    n_tp = sum(bool(hit['is_true_positive']) for hit in smell_hits)
    n_fp = n_total - n_tp
    print(f"  {smell_name}: {n_total} total | {n_tp} TP | {n_fp} FP")

# Dump the fields of the first record of the first smell so the record
# layout is visible in the notebook output.
if not chef_detections:
    print("\n❌ No detections extracted!")
else:
    first_smell = next(iter(chef_detections))
    first_smell_hits = chef_detections[first_smell]
    if not first_smell_hits:
        print(f"\n⚠️  No detections found for {first_smell}")
    else:
        print("\n📝 Example detection structure:")
        example = first_smell_hits[0]
        for field_name, field_value in example.items():
            print(f"  {field_name}: {field_value}")

## 📊 Extract Puppet Detections


In [None]:
print("🔍 Extracting Puppet detections...")

# puppet_detections maps each smell name to a sequence of detection records;
# every record exposes an 'is_true_positive' entry.
puppet_detections = extractor.extract_detections_for_llm('puppet')

# Per-smell totals split into true and false positives.
print("\n📊 Puppet Detection Summary:")
for smell_name, smell_hits in puppet_detections.items():
    n_total = len(smell_hits)
    n_tp = sum(bool(hit['is_true_positive']) for hit in smell_hits)
    n_fp = n_total - n_tp
    print(f"  {smell_name}: {n_total} total | {n_tp} TP | {n_fp} FP")

# Side-by-side counts for every smell seen in either tool; a smell that is
# absent from one tool is reported as zero for that tool.
print("\n🔄 Chef vs Puppet Comparison:")
all_smells = set(chef_detections) | set(puppet_detections)
for smell_name in sorted(all_smells):
    chef_count = len(chef_detections.get(smell_name, []))
    puppet_count = len(puppet_detections.get(smell_name, []))
    print(f"  {smell_name}: Chef={chef_count}, Puppet={puppet_count}")

# Grand total across both tools and all smells.
chef_total = sum(map(len, chef_detections.values()))
puppet_total = sum(map(len, puppet_detections.values()))
total_detections = chef_total + puppet_total
print(f"\n✅ Total detections extracted: {total_detections}")

## 💾 Save Detection Dataset


In [None]:
print("💾 Saving detection dataset...")

# Destination folder for the detection CSVs; created on demand.
output_dir = project_root / "experiments/llm_postfilter/data"
output_dir.mkdir(parents=True, exist_ok=True)

# Ask the extractor to save the Chef and Puppet detections into output_dir.
chef_saved = extractor.save_detections('chef', output_dir)
puppet_saved = extractor.save_detections('puppet', output_dir)

# Report every detection CSV now present in the folder, in name order,
# with its row count and on-disk size.
print("✅ Detection files saved:")
saved_csvs = sorted(output_dir.glob("*_detections.csv"))
for csv_path in saved_csvs:
    n_bytes = csv_path.stat().st_size
    n_rows = len(pd.read_csv(csv_path))
    print(f"  📄 {csv_path.name}: {n_rows} detections ({n_bytes:,} bytes)")

print("\n🎯 Dataset ready for LLM post-filtering pipeline!")
print(f"📁 Location: {output_dir}")

## 🔍 Extract Code Context for LLM Analysis

**Extract ±3 lines of code context around each GLITCH detection for LLM evaluation.**

This provides transparent, analyzable code snippets for the LLM post-filtering pipeline.


In [None]:
# Import context extractor
from llm_postfilter.context_extractor import CodeContextExtractor

# Directory that receives the context-enhanced CSVs. parents=True matches how
# output_dir itself is created, so the call does not fail when an
# intermediate folder is missing.
context_dir = output_dir / "with_context"
context_dir.mkdir(parents=True, exist_ok=True)

# Initialize context extractor
context_extractor = CodeContextExtractor(project_root)

# Collect the detection CSVs to process. Path.glob yields entries in an
# arbitrary, filesystem-dependent order, so the list is sorted to make the
# processing order (and the "first" file used as the example later on)
# reproducible. The suffix check is a defensive filter that keeps derived
# outputs out of the input list.
derived_suffixes = ("_with_context.csv", "_llm_filtered.csv")
detection_files = sorted(
    path for path in output_dir.glob("*_detections.csv")
    if not path.name.endswith(derived_suffixes)
)

# Per-file overview: total rows and the TP/FP split taken from the
# 'is_true_positive' column.
print(f"📁 Found {len(detection_files)} detection files:")
for file in detection_files:
    df = pd.read_csv(file)
    tp_count = df['is_true_positive'].sum()
    fp_count = len(df) - tp_count
    print(f"  📄 {file.name}: {len(df)} detections ({tp_count} TP, {fp_count} FP)")

print("\n🔍 Extracting code context for LLM analysis...")

In [None]:
# Process each detection file and save context-enhanced versions.
# context_enhanced_files collects the expected output paths and context_stats
# collects one summary dict per input file; both are read by later cells.
context_enhanced_files = []
context_stats = []

for position, detection_file in enumerate(detection_files, start=1):
    print(f"\n🔄 Processing {position}/{len(detection_files)}: {detection_file.name}")

    # Extract ±3 lines of context per detection and save the enhanced file
    enhanced_df = context_extractor.process_and_save_detections(
        detection_file, context_dir, context_lines=3
    )

    # NOTE(review): assumes process_and_save_detections writes its output as
    # "<stem>_with_context.csv" inside context_dir — confirm against the
    # extractor implementation.
    context_file = context_dir / f"{detection_file.stem}_with_context.csv"
    context_enhanced_files.append(context_file)

    # Calculate stats
    total_detections = len(enhanced_df)
    successful_context = enhanced_df['context_success'].sum()
    files_found = enhanced_df['file_found'].sum()
    # Compute the rate once, guarded against an empty detection file, and
    # reuse it for both the stored stats and the progress line so the two
    # can never disagree (and the print cannot divide by zero).
    success_rate = successful_context / total_detections if total_detections > 0 else 0

    context_stats.append({
        'file': detection_file.name,
        'total_detections': total_detections,
        'files_found': files_found,
        'context_extracted': successful_context,
        'success_rate': success_rate,
    })

    print(f"✅ {context_file.name}: {successful_context}/{total_detections} context extracted ({success_rate:.1%})")

print(f"\n🎯 Context extraction completed for {len(detection_files)} files!")

In [None]:
# Display context extraction summary
print("📊 Context Extraction Summary")
print("=" * 50)

# Build the frame with explicit columns: when no detection files were
# processed, context_stats is empty and a column-less frame would raise
# KeyError on the lookups below instead of reporting a 0/0 summary.
stats_columns = ['file', 'total_detections', 'files_found', 'context_extracted', 'success_rate']
stats_df = pd.DataFrame(context_stats, columns=stats_columns)
for _, row in stats_df.iterrows():
    print(f"📄 {row['file']}: {row['context_extracted']}/{row['total_detections']} ({row['success_rate']:.1%}) context extracted")

# Overall statistics across all processed files; the rate falls back to 0
# when there were no detections at all.
total_detections = stats_df['total_detections'].sum()
total_context_extracted = stats_df['context_extracted'].sum()
overall_success_rate = total_context_extracted / total_detections if total_detections > 0 else 0

print(f"\n🎯 Overall: {total_context_extracted}/{total_detections} ({overall_success_rate:.1%}) successful")
print(f"📁 Context files saved: {len(context_enhanced_files)} → {context_dir}")

In [None]:
# Print one context snippet, taken from the first context-enhanced file, as
# an illustration of what the LLM will be given.
print("🔍 Example Context Snippet for LLM")
print("=" * 40)

if not context_enhanced_files:
    print("❌ No context-enhanced files available")
else:
    example_file = context_enhanced_files[0]
    example_df = pd.read_csv(example_file)
    # Keep only rows whose context extraction is flagged as successful.
    success_mask = example_df['context_success'] == True
    successful_detections = example_df[success_mask]

    if len(successful_detections) == 0:
        print("❌ No successful context extractions found")
    else:
        example = successful_detections.iloc[0]
        divider = "-" * 30
        print(f"📁 {example_file.name}")
        print(f"🎯 {example['smell_category']} | TP: {example['is_true_positive']}")
        print("\n📄 Context Snippet:")
        print(divider)
        print(example['context_snippet'])
        print(divider)

print(f"\n💡 All context files saved → {context_dir}")

# Closing recap of where the outputs live and what to run next.
print("\n🎯 Data preparation completed!")
print(f"📁 Detection files: {output_dir}")
print(f"📁 Context files: {context_dir}")
print("\n➡️  Next: Run 02_llm_experiment.py for LLM evaluation")

