# 🔍 LLM Post-Filter Error Analysis

**Objective**: Deep dive into misclassified examples to understand why the LLM is losing true positives and keeping false positives.

## 🎯 Analysis Focus:
1. **Lost True Positives** (Type II error, i.e. false negatives): Original TP → LLM filtered out
2. **Kept False Positives** (Type I error, i.e. false positives): Original FP → LLM kept

## 🔧 Investigation Questions:
- What patterns exist in misclassified code snippets?
- Is the context window too narrow for certain smell types?
- Are prompt definitions misaligned with IaC-specific patterns?
- Which smell categories are most problematic?

---


## 🛠️ Setup & Configuration


In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets
from collections import defaultdict, Counter
import re

# Plot appearance: select matplotlib's 'default' style first, then apply the
# seaborn "husl" palette on top of it.
plt.style.use('default')
sns.set_palette("husl")

# DataFrame rendering: no cap on displayed columns, cell text up to 100 chars.
PANDAS_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_colwidth': 100,
}
for option_name, option_value in PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

print("🔧 Setup complete!")


In [None]:
# 🎛️ Interactive Configuration (Refactored)
# The project root is taken to be three directory levels above the current
# working directory; LLM post-filter results live under results/llm_postfilter.
working_dir = Path.cwd()
project_root = working_dir.parent.parent.parent
results_dir = project_root / "results" / "llm_postfilter"

# Helpers
def list_experiment_options(base_dir: Path, prefix: str = '2025'):
    """Build dropdown options for the experiment folders found under *base_dir*.

    Args:
        base_dir: Directory whose immediate sub-directories are experiment runs.
        prefix: Only sub-directories whose name starts with this string are
            listed. Defaults to ``'2025'``, the value that used to be
            hard-coded.

    Returns:
        A list of ``(label, folder_name)`` tuples, sorted by folder path in
        descending order. Each label shows the folder name and the number of
        ``*.csv`` files directly inside it. An empty list is returned when
        *base_dir* does not exist or is not a directory, so the caller can
        fall back to a placeholder instead of crashing.
    """
    # iterdir() raises FileNotFoundError on a missing directory; treat that
    # case the same as "no experiments yet".
    if not base_dir.is_dir():
        return []

    folders = sorted(
        (entry for entry in base_dir.iterdir()
         if entry.is_dir() and entry.name.startswith(prefix)),
        reverse=True,
    )
    return [
        (f"{folder.name} ({sum(1 for _ in folder.glob('*.csv'))} files)", folder.name)
        for folder in folders
    ]

class CheckboxGroup:
    """A titled vertical stack of checkboxes, one per option."""

    def __init__(self, title: str, options, selected=None):
        """Create one checkbox per entry in *options*.

        Args:
            title: Heading rendered in bold above the checkboxes.
            options: Iterable of option values; each is labelled ``str(opt)``.
            selected: Optional iterable of options that start out ticked.
        """
        preselected = set(selected) if selected else set()

        self.checkboxes = []
        for option in options:
            box = widgets.Checkbox(
                value=(option in preselected),
                description=str(option),
                style={'description_width': 'initial'},
            )
            self.checkboxes.append(box)

        heading = widgets.HTML(f"<b>{title}:</b>")
        self.widget = widgets.VBox([heading] + self.checkboxes)

    @property
    def selected(self):
        """Return the description text of every ticked checkbox, in order."""
        ticked = []
        for box in self.checkboxes:
            if box.value:
                ticked.append(box.description)
        return ticked

# Default analysis settings; the widget values are copied into this dict by
# update_config() before the data is loaded.
config = dict(
    experiment_folder=None,
    iac_tools=['chef', 'puppet'],
    smell_categories=['Hard-coded secret', 'Suspicious comment', 'Use of weak cryptography algorithms'],
    analysis_mode='both',
    max_examples=10,
)

# --- Experiment selector -----------------------------------------------------
experiment_options = list_experiment_options(results_dir)
if experiment_options:
    dropdown_options = experiment_options
    dropdown_value = experiment_options[0][1]  # first entry of the listing
else:
    # Placeholder entry so the dropdown can still be rendered.
    dropdown_options = [('No experiments found', '')]
    dropdown_value = ''

experiment_widget = widgets.Dropdown(
    options=dropdown_options,
    value=dropdown_value,
    description='Experiment:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='500px'),
)

# --- Tool / smell selection --------------------------------------------------
iac_group = CheckboxGroup("IaC Tools", ['chef', 'puppet'], selected=config['iac_tools'])
smell_group = CheckboxGroup("Security Smells", config['smell_categories'], selected=config['smell_categories'])

# --- Which error type(s) to inspect -----------------------------------------
analysis_mode_choices = [
    ('🔴 Lost True Positives (TP→FN)', 'lost_tp'),
    ('🟡 Kept False Positives (FP→FP)', 'kept_fp'),
    ('🔍 Both Error Types', 'both'),
]
mode_widget = widgets.RadioButtons(
    options=analysis_mode_choices,
    value=config['analysis_mode'],
    description='Analysis Mode:',
    style={'description_width': 'initial'},
)

# --- Number of examples to render -------------------------------------------
max_examples_widget = widgets.IntSlider(
    value=config['max_examples'],
    min=5,
    max=50,
    step=5,
    description='Max Examples:',
    style={'description_width': 'initial'},
)

def update_config():
    """Copy the current widget selections into the global ``config`` and print them."""
    config.update({
        'experiment_folder': experiment_widget.value,
        'iac_tools': iac_group.selected,
        'smell_categories': smell_group.selected,
        'analysis_mode': mode_widget.value,
        'max_examples': int(max_examples_widget.value),
    })

    # One space-separated status line; only the number of smells is shown.
    status_parts = [
        "✅ Config updated:",
        f"experiment={config['experiment_folder']}",
        f"iac_tools={config['iac_tools']}",
        f"smells={len(config['smell_categories'])}",
        f"mode={config['analysis_mode']}",
        f"max_examples={config['max_examples']}",
    ]
    print(*status_parts)

print("🎛️ Configure Error Analysis:")
# Stack all controls vertically in the order they should be filled in.
config_controls = [
    experiment_widget,
    iac_group.widget,
    smell_group.widget,
    mode_widget,
    max_examples_widget,
]
display(widgets.VBox(config_controls))

## 📊 Data Loading & Overview


In [None]:
# Pull the current widget state into `config` before touching the filesystem.
update_config()

# Directory of the selected experiment inside the results folder.
selected_experiment = config['experiment_folder']
experiment_path = results_dir / selected_experiment
print(f"📂 Loading data from: {experiment_path}")

# Function to load all detection files
def load_experiment_data(experiment_path, iac_tools, smell_categories):
    """Load and concatenate the LLM-filtered detection CSVs of one experiment.

    For every (tool, smell) pair the file
    ``{tool}_{smell}_detections_with_context_llm_filtered.csv`` is looked up
    in *experiment_path*; the smell name is lower-cased and its spaces and
    hyphens are replaced by underscores. Missing files are reported and
    skipped.

    Args:
        experiment_path: Directory containing the experiment's CSV files.
        iac_tools: Iterable of IaC tool names used as file-name prefixes.
        smell_categories: Iterable of human-readable smell names.

    Returns:
        One DataFrame holding all loaded rows plus two added columns:
        ``experiment_folder`` (the name of *experiment_path*) and
        ``analysis_id`` (``"{tool}_{smell}"``). An empty DataFrame is returned
        when no file was found.
    """
    experiment_path = Path(experiment_path)
    # Label rows from the argument itself rather than the notebook-global
    # `config`, so the function works for any directory it is handed.
    experiment_folder = experiment_path.name

    frames = []
    for iac_tool in iac_tools:
        for smell in smell_categories:
            # Clean smell name for filename
            smell_clean = smell.replace(' ', '_').replace('-', '_').lower()
            csv_file = experiment_path / f"{iac_tool}_{smell_clean}_detections_with_context_llm_filtered.csv"

            if not csv_file.exists():
                print(f"❌ Missing: {csv_file.name}")
                continue

            frame = pd.read_csv(csv_file)
            frame['experiment_folder'] = experiment_folder
            frame['analysis_id'] = f"{iac_tool}_{smell}"
            frames.append(frame)
            print(f"✅ Loaded {iac_tool} {smell}: {len(frame)} detections")

    if not frames:
        print("⚠️ No data loaded!")
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Read every CSV matching the selected tools and smells into one DataFrame.
df = load_experiment_data(experiment_path, config['iac_tools'], config['smell_categories'])

if df.empty:
    print("❌ No data to analyze!")
else:
    loaded_columns = list(df.columns)
    print(f"\n📊 Total loaded: {len(df)} detections")
    print(f"Columns: {loaded_columns}")


In [None]:
if df.empty:
    print("❌ No data available for analysis")
else:
    print("🔍 Classification Overview:")

    # Label every detection. Rows matching neither mask stay 'Correct'.
    # Lost TP: ground truth is a true positive, but the LLM dropped it.
    # Kept FP: ground truth is a false positive, but the LLM kept it.
    lost_tp_mask = (df['is_true_positive'] == True) & (df['keep_detection'] == False)
    kept_fp_mask = (df['is_true_positive'] == False) & (df['keep_detection'] == True)
    df['error_type'] = 'Correct'
    df.loc[lost_tp_mask, 'error_type'] = 'Lost TP'
    df.loc[kept_fp_mask, 'error_type'] = 'Kept FP'

    # Counts per (tool, smell) with one column per error type.
    group_keys = ['iac_tool', 'smell_category']
    summary = df.groupby(group_keys + ['error_type']).size().unstack(fill_value=0)
    display(summary)

    # Per-category error rates, formatted as percentages for display.
    print("\n📈 Error Analysis by Category:")
    error_analysis = []
    for (tool_name, smell_name), rows in df.groupby(group_keys):
        tp_rows = rows['is_true_positive'] == True
        fp_rows = rows['is_true_positive'] == False
        total_tp = tp_rows.sum()
        total_fp = fp_rows.sum()
        lost_tp = (tp_rows & (rows['keep_detection'] == False)).sum()
        kept_fp = (fp_rows & (rows['keep_detection'] == True)).sum()

        # Rates fall back to 0 when the category has no TP (or no FP) at all.
        tp_loss_rate = lost_tp / total_tp if total_tp > 0 else 0
        fp_retention_rate = kept_fp / total_fp if total_fp > 0 else 0

        error_analysis.append({
            'IaC Tool': tool_name,
            'Security Smell': smell_name,
            'Total TP': total_tp,
            'Lost TP': lost_tp,
            'TP Loss Rate': f"{tp_loss_rate:.1%}",
            'Total FP': total_fp,
            'Kept FP': kept_fp,
            'FP Retention Rate': f"{fp_retention_rate:.1%}"
        })

    error_df = pd.DataFrame(error_analysis)
    display(error_df)

    # Keep only the error rows requested by the analysis mode; any mode other
    # than 'lost_tp' / 'kept_fp' selects both error types.
    analysis_mode = config['analysis_mode']
    wanted_error_types = {
        'lost_tp': ['Lost TP'],
        'kept_fp': ['Kept FP'],
    }.get(analysis_mode, ['Lost TP', 'Kept FP'])
    analysis_df = df[df['error_type'].isin(wanted_error_types)].copy()

    if analysis_mode == 'lost_tp':
        print(f"\n🔴 Analyzing {len(analysis_df)} Lost True Positives")
    elif analysis_mode == 'kept_fp':
        print(f"\n🟡 Analyzing {len(analysis_df)} Kept False Positives")
    else:
        print(f"\n🔍 Analyzing {len(analysis_df)} total errors ({(df['error_type'] == 'Lost TP').sum()} Lost TP + {(df['error_type'] == 'Kept FP').sum()} Kept FP)")


## 🔬 Detailed Error Examples Analysis


In [None]:
def display_error_example(row, include_prompt=False):
    """Render one misclassified detection in the notebook output.

    Shows an HTML header card, the code context, the raw LLM response (when
    present) and a few quick statistics.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            detection fields read below.
        include_prompt: Accepted but not used by this function.

    Returns:
        A separator string: newline, 100 '=' characters, newline.
    """
    # Lost TPs are shown in red, everything else in amber.
    is_lost_tp = row['error_type'] == 'Lost TP'
    error_color = "🔴" if is_lost_tp else "🟡"
    border_color = '#ff4444' if is_lost_tp else '#ffaa00'
    ground_truth_label = '✅ True Positive' if row['is_true_positive'] else '❌ False Positive'
    kept_label = '✅ Yes' if row['keep_detection'] else '❌ No'

    header_html = f"""
    <div style="border: 2px solid {border_color}; 
                padding: 15px; margin: 10px 0; border-radius: 8px;">
        <h3>{error_color} {row['error_type']}: {row['smell_category']}</h3>
        <p><strong>IaC Tool:</strong> {row['iac_tool']} | 
           <strong>File:</strong> {row['file_path']} | 
           <strong>Line:</strong> {row['line_number']}</p>
        <p><strong>Ground Truth:</strong> {ground_truth_label} | 
           <strong>LLM Decision:</strong> {row['llm_decision']} | 
           <strong>Kept:</strong> {kept_label}</p>
    </div>
    """
    display(HTML(header_html))

    # Code context, framed by horizontal rules.
    rule = "─" * 80
    print("📋 Code Context:")
    print(rule)
    print(row['context_snippet'])
    print(rule)

    # Raw LLM answer, skipped when the field is missing (NaN/None).
    raw_response = row['llm_raw_response']
    if pd.notna(raw_response):
        print(f"\n🤖 LLM Raw Response: '{raw_response}'")

    # Quick stats: non-blank snippet lines that do not start with '#'.
    snippet_lines = str(row['context_snippet']).split('\n')
    context_lines = sum(
        1 for text in snippet_lines if text.strip() and not text.startswith('#')
    )
    print("\n📊 Analysis Notes:")
    print(f"   • Context Lines: {context_lines}")
    print(f"   • Target Line Length: {len(str(row['target_content']))} chars")
    print(f"   • Context Success: {row['context_success']}")
    print(f"   • Processing Time: {row.get('llm_processing_time', 'N/A')}s")

    return "\n" + "=" * 100 + "\n"

# Show a sample of the selected errors, capped at config['max_examples'].
have_examples = not df.empty and 'analysis_df' in locals() and not analysis_df.empty

if not have_examples:
    print("❌ No examples to display")
else:
    example_limit = config['max_examples']
    grouping = ['error_type', 'smell_category']

    if example_limit <= 10:
        # Small budgets favour variety: at most two rows per group.
        rows_per_group = 2
    else:
        # Larger budgets are split evenly across groups, never below two.
        rows_per_group = max(2, example_limit // len(analysis_df.groupby(grouping)))

    sample_df = analysis_df.groupby(grouping).head(rows_per_group).head(example_limit)

    print(f"🔍 Showing {len(sample_df)} out of {len(analysis_df)} total error examples:")
    print(f"   (Max configured: {config['max_examples']}, Available: {len(analysis_df)})")

    shown_total = len(sample_df)
    for position, (_, example_row) in enumerate(sample_df.iterrows(), 1):
        print(f"\n{'='*20} Example {position}/{shown_total} {'='*20}")
        display_error_example(example_row)


## 💡 Analysis Summary & Recommendations


In [None]:
if 'analysis_df' in locals() and not analysis_df.empty:
    print("📋 Error Analysis Summary:")
    print("="*50)

    # How the analysed errors split between the two error types.
    error_types = analysis_df['error_type']
    total_errors = len(analysis_df)
    lost_tp_count = (error_types == 'Lost TP').sum()
    kept_fp_count = (error_types == 'Kept FP').sum()

    print(f"📊 Total Errors Analyzed: {total_errors}")
    print(f"   • Lost True Positives: {lost_tp_count} ({lost_tp_count/total_errors:.1%})")
    print(f"   • Kept False Positives: {kept_fp_count} ({kept_fp_count/total_errors:.1%})")

    # Context size per example: non-blank snippet lines not starting with '#'.
    # The count is stored on analysis_df as a new 'context_lines' column.
    analysis_df['context_lines'] = analysis_df['context_snippet'].apply(
        lambda snippet: sum(
            1 for text in str(snippet).split('\n')
            if text.strip() and not text.startswith('#')
        )
    )
    avg_context_lines = analysis_df['context_lines'].mean()
    context_success_rate = analysis_df['context_success'].mean()

    print("\n📏 Context Analysis:")
    print(f"   • Average Context Lines: {avg_context_lines:.1f}")
    print(f"   • Context Success Rate: {context_success_rate:.1%}")

    # The three smell categories with the most errors.
    print("\n🔍 Most Problematic Smells:")
    smell_error_rates = analysis_df.groupby('smell_category').size().sort_values(ascending=False)
    top_smells = smell_error_rates.head(3)
    for smell_name, error_count in top_smells.items():
        print(f"   • {smell_name}: {error_count} errors")


## 📤 Export Analysis Results


In [None]:
def count_context_lines(snippet):
    """Count the non-blank lines of *snippet* that do not start with '#'.

    The snippet is converted with ``str()`` first, so non-string values
    (e.g. NaN) are counted as their one-line text representation.
    """
    return sum(
        1 for line in str(snippet).split('\n')
        if line.strip() and not line.startswith('#')
    )


if 'analysis_df' in locals() and not analysis_df.empty:
    # Everything is written to an "error_analysis" folder inside the experiment directory.
    export_dir = experiment_path / "error_analysis"
    export_dir.mkdir(exist_ok=True)

    # One timestamp for the whole export so file names and report headers agree.
    from datetime import datetime
    generated_at = datetime.now()
    analysis_timestamp = generated_at.strftime("%Y%m%d_%H%M%S")
    generated_label = generated_at.strftime('%Y-%m-%d %H:%M:%S')

    print(f"📁 Exporting analysis results to: {export_dir}")

    # 1. Export filtered error examples
    error_export_file = export_dir / f"error_examples_{config['analysis_mode']}_{analysis_timestamp}.csv"

    # Select key columns for export
    export_columns = [
        'detection_id', 'iac_tool', 'smell_category', 'file_path', 'line_number',
        'is_true_positive', 'llm_decision', 'keep_detection', 'error_type',
        'target_content', 'context_snippet', 'llm_raw_response', 'context_success',
        'llm_processing_time', 'experiment_folder'
    ]
    # Export only the columns that are present, instead of failing with a
    # KeyError when an optional column (e.g. llm_processing_time) is absent.
    available_columns = [col for col in export_columns if col in analysis_df.columns]
    skipped_columns = [col for col in export_columns if col not in analysis_df.columns]
    if skipped_columns:
        print(f"⚠️ Columns not found and skipped in CSV export: {skipped_columns}")

    analysis_df[available_columns].to_csv(error_export_file, index=False)
    print(f"✅ Exported {len(analysis_df)} error examples to: {error_export_file.name}")

    # 2. Export summary statistics
    summary_file = export_dir / f"error_summary_{config['analysis_mode']}_{analysis_timestamp}.json"

    # Split on real newlines; the previous '\\n' literal never matched, which
    # made every snippet count as a single line.
    analysis_df['context_lines'] = analysis_df['context_snippet'].apply(count_context_lines)

    summary_data = {
        'analysis_metadata': {
            'experiment_folder': config['experiment_folder'],
            'analysis_timestamp': analysis_timestamp,
            'analysis_config': config
        },
        'error_statistics': {
            'total_errors': len(analysis_df),
            'lost_tp_count': int((analysis_df['error_type'] == 'Lost TP').sum()),
            'kept_fp_count': int((analysis_df['error_type'] == 'Kept FP').sum()),
            'avg_context_lines': float(analysis_df['context_lines'].mean()),
            'context_success_rate': float(analysis_df['context_success'].mean()),
            'avg_processing_time': float(analysis_df['llm_processing_time'].mean()) if 'llm_processing_time' in analysis_df.columns else None
        },
        'distribution_analysis': {
            'smell_error_distribution': analysis_df['smell_category'].value_counts().to_dict(),
            'iac_tool_distribution': analysis_df['iac_tool'].value_counts().to_dict(),
            'error_type_distribution': analysis_df['error_type'].value_counts().to_dict()
        },
        'context_analysis': {
            'context_lines_stats': {
                'min': int(analysis_df['context_lines'].min()),
                'max': int(analysis_df['context_lines'].max()),
                'mean': float(analysis_df['context_lines'].mean()),
                'median': float(analysis_df['context_lines'].median())
            },
            'context_success_by_smell': analysis_df.groupby('smell_category')['context_success'].mean().to_dict()
        }
    }

    # default=str stringifies any value json cannot encode natively.
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary_data, f, indent=2, default=str)

    print(f"✅ Exported analysis summary to: {summary_file.name}")

    # 3. Export detailed examples for manual review (if not too many)
    if len(analysis_df) <= 20:  # Only for manageable amounts
        detailed_export_file = export_dir / f"detailed_examples_{config['analysis_mode']}_{analysis_timestamp}.txt"

        with open(detailed_export_file, 'w', encoding='utf-8') as f:
            f.write("DETAILED ERROR ANALYSIS REPORT\n")
            f.write(f"Generated: {generated_label}\n")
            f.write(f"Experiment: {config['experiment_folder']}\n")
            f.write(f"Analysis Mode: {config['analysis_mode']}\n")
            f.write(f"Total Errors: {len(analysis_df)}\n")
            f.write("=" * 80 + "\n\n")

            for i, (idx, row) in enumerate(analysis_df.iterrows(), 1):
                f.write(f"EXAMPLE {i}/{len(analysis_df)}\n")
                f.write(f"Error Type: {row['error_type']}\n")
                f.write(f"Smell: {row['smell_category']}\n")
                f.write(f"IaC Tool: {row['iac_tool']}\n")
                f.write(f"File: {row['file_path']}:{row['line_number']}\n")
                f.write(f"Ground Truth: {'TP' if row['is_true_positive'] else 'FP'}\n")
                f.write(f"LLM Decision: {row['llm_decision']}\n")
                f.write(f"Kept: {'Yes' if row['keep_detection'] else 'No'}\n")
                f.write("\nCode Context:\n")
                f.write("-" * 40 + "\n")
                f.write(str(row['context_snippet']) + "\n")
                f.write("-" * 40 + "\n")
                f.write(f"\nLLM Response: {row['llm_raw_response']}\n")
                f.write("\n" + "=" * 80 + "\n\n")

        print(f"✅ Exported detailed examples to: {detailed_export_file.name}")

    # 4. Create a README for the analysis
    readme_file = export_dir / "README.md"

    # Bullet lists are built up front to keep the README template readable.
    smell_bullets = "\n".join(
        f"- **{smell}:** {count} errors"
        for smell, count in analysis_df['smell_category'].value_counts().items()
    )
    tool_bullets = "\n".join(
        f"- **{tool}:** {count} errors"
        for tool, count in analysis_df['iac_tool'].value_counts().items()
    )

    readme_content = f'''# Error Analysis Results

**Experiment:** {config['experiment_folder']}  
**Analysis Date:** {generated_label}  
**Analysis Mode:** {config['analysis_mode']}

## Summary

- **Total Errors Analyzed:** {len(analysis_df)}
- **Lost True Positives:** {(analysis_df['error_type'] == 'Lost TP').sum()}
- **Kept False Positives:** {(analysis_df['error_type'] == 'Kept FP').sum()}
- **Average Context Lines:** {analysis_df['context_lines'].mean():.1f}
- **Context Success Rate:** {analysis_df['context_success'].mean():.1%}

## Files Generated

1. `error_examples_{config['analysis_mode']}_{analysis_timestamp}.csv` - Detailed error data for further analysis
2. `error_summary_{config['analysis_mode']}_{analysis_timestamp}.json` - Statistical summary and metadata
3. `detailed_examples_{config['analysis_mode']}_{analysis_timestamp}.txt` - Human-readable detailed examples (if ≤20 errors)
4. `README.md` - This file

## Error Distribution

### By Security Smell
{smell_bullets}

### By IaC Tool  
{tool_bullets}

## Next Steps

1. Review detailed examples to identify patterns
2. Consider adjusting context window size (current avg: {analysis_df['context_lines'].mean():.1f} lines)
3. Refine prompt templates based on error patterns
4. Test confidence-based filtering approaches

---
*Generated by 03_error_analysis.ipynb*
'''

    # The README contains non-ASCII text ('≤'), so the encoding is set
    # explicitly instead of relying on the platform default.
    with open(readme_file, 'w', encoding='utf-8') as f:
        f.write(readme_content)

    print(f"✅ Created analysis README: {readme_file.name}")

    print("\n🎉 Analysis export completed!")
    print(f"📂 All files saved to: {export_dir}")
    print("\n📋 Summary:")
    print(f"   • Error examples CSV: {len(analysis_df)} rows")
    print("   • Summary JSON with detailed statistics")
    print(f"   • {'Detailed examples TXT (≤20 errors)' if len(analysis_df) <= 20 else 'Detailed examples skipped (>20 errors)'}")
    print("   • README with human-readable summary")

else:
    print("❌ No analysis data to export")
