# 🤖 LLM Post-Filter Experiment: GLITCH+LLM Pipeline

**Focus**: Evaluate **GLITCH + LLM** hybrid approach vs **GLITCH-only** baseline using **interactive configuration**.

## 🔬 Experiment Pipeline:

1. **Interactive Setup**: Configure LLM provider, model, and experiment parameters
2. **Data Preparation**: Load the GLITCH detections together with the code context extracted by *01_data_extraction.py*
3. **LLM Filtering**: Apply the selected LLM post-filtering  
4. **Performance Evaluation**: Calculate precision/recall improvements

## 🎯 Expected Outcomes:
- **Precision**: 50-300% improvement
- **Recall**: >90% retention  
- **FP Reduction**: Significant decrease in false alarms


## 🎛️ Interactive Experiment Configuration

Configure your LLM provider, model, and experiment parameters with the interactive widgets below; no manual cell editing is required.


In [None]:
# 🎛️ Interactive Configuration
import os
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# Configuration storage: default experiment settings. The widgets below are
# seeded from these values, and update_config() writes selections back here.
config = dict(
    provider='openai',
    model='gpt-4o-mini',
    base_url='',
    prompt_template='definition_based_conservative',
    context_lines=3,
    iac_tools=['chef', 'puppet'],
)


def _label_style():
    """Return a fresh copy of the style dict used by every widget below."""
    return {'description_width': 'initial'}


# Provider selection: (label shown in the dropdown, value stored in config).
_provider_options = [
    ('🔥 OpenAI (GPT models)', 'openai'),
    ('🧠 Anthropic (Claude models)', 'anthropic'),
    ('🦙 Ollama (Local models)', 'ollama'),
]
provider_widget = widgets.Dropdown(
    options=_provider_options,
    value=config['provider'],
    description='Provider:',
    style=_label_style(),
)

# Model input: free text, so any model identifier can be entered.
model_widget = widgets.Text(
    value=config['model'],
    placeholder='e.g., gpt-4o, claude-3-5-sonnet-latest, codellama:7b',
    description='Model:',
    style=_label_style(),
)

# Base URL input (for ollama); left empty by default.
base_url_widget = widgets.Text(
    value=config['base_url'],
    placeholder='e.g., http://localhost:11434',
    description='Base URL:',
    style=_label_style(),
)

# Prompt template selection. The choices are hard-coded in this cell.
# NOTE(review): presumably they should mirror
# SecuritySmellPrompts.get_available_templates() - confirm.
_template_options = [
    ('Definition Based Conservative', 'definition_based_conservative'),
    ('Definition Based Current', 'definition_based_current'),
    ('Static Analysis Rules Conservative', 'static_analysis_rules_conservative'),
    ('Static Analysis Rules Current', 'static_analysis_rules_current'),
]
prompt_template_widget = widgets.Dropdown(
    options=_template_options,
    value=config['prompt_template'],
    description='Prompt Template:',
    style=_label_style(),
)

# Context lines slider: integer from 0 to 3 in steps of 1.
context_lines_widget = widgets.IntSlider(
    value=config['context_lines'],
    min=0,
    max=3,
    step=1,
    description='Context Lines:',
    style=_label_style(),
)

# IaC tools selection; 'both' is expanded to ['chef', 'puppet'] by update_config().
_iac_options = [
    ('Chef', 'chef'),
    ('Puppet', 'puppet'),
    ('Both', 'both'),
]
iac_tools_widget = widgets.ToggleButtons(
    options=_iac_options,
    value='both',
    description='IaC Tools:',
    style=_label_style(),
)

# Status output: panel that update_config() clears and prints into.
status_output = widgets.Output()

def update_config(*args):
    """Sync `config` and the LLM_* environment variables with the widgets.

    Works both as a direct call (no arguments) and as an ipywidgets observer
    callback; any positional arguments (the change notification) are ignored.

    Side effects:
        * overwrites the provider, model, base_url, prompt_template,
          context_lines and iac_tools entries of the module-level `config`;
        * sets LLM_PROVIDER and LLM_MODEL in os.environ, and sets or removes
          LLM_BASE_URL depending on whether a base URL was entered;
        * clears `status_output` and prints a configuration summary into it.
    """
    # Toggle-button value -> tools to process; unknown values select nothing.
    tools_by_choice = {
        'both': ('chef', 'puppet'),
        'chef': ('chef',),
        'puppet': ('puppet',),
    }
    # Providers that need an API key -> environment variable holding it.
    key_env_by_provider = {
        'openai': 'OPENAI_API_KEY',
        'anthropic': 'ANTHROPIC_API_KEY',
    }
    provider_emoji = {
        'openai': '🔥', 'anthropic': '🧠',
        'ollama': '🦙'
    }

    with status_output:
        clear_output()

        # Update config
        config['provider'] = provider_widget.value
        # Strip like base_url below: whitespace from typing or pasting would
        # otherwise end up in config['model'] and LLM_MODEL.
        config['model'] = model_widget.value.strip()
        config['base_url'] = base_url_widget.value.strip()
        config['prompt_template'] = prompt_template_widget.value
        config['context_lines'] = context_lines_widget.value
        # list(...) hands out a fresh list on every call so that callers
        # mutating config['iac_tools'] cannot affect the lookup table.
        config['iac_tools'] = list(tools_by_choice.get(iac_tools_widget.value, ()))

        # Update environment variables
        os.environ["LLM_PROVIDER"] = config['provider']
        os.environ["LLM_MODEL"] = config['model']
        if config['base_url']:
            os.environ["LLM_BASE_URL"] = config['base_url']
        else:
            # An emptied field must also remove a previously exported URL.
            os.environ.pop("LLM_BASE_URL", None)

        # Show configuration status
        print("✅ CONFIGURATION UPDATED")
        print(f"🤖 Provider: {provider_emoji.get(config['provider'], '❓')} {config['provider']}")
        print(f"🎯 Model: {config['model']}")
        if config['base_url']:
            print(f"🌐 Base URL: {config['base_url']}")
        print(f"📋 Prompt Template: {config['prompt_template']}")
        print(f"🔎 Context Lines: ±{config['context_lines']}")
        print(f"🔧 IaC Tools: {', '.join(config['iac_tools']) if config['iac_tools'] else 'None selected'}")

        # API key check: only reports presence, never prints the key itself.
        key_env = key_env_by_provider.get(config['provider'])
        if config['provider'] == 'ollama':
            api_key_status = "🚫 Not required"
        elif key_env and os.getenv(key_env):
            api_key_status = "✅ Found"
        else:
            api_key_status = "❌ Not found"

        print(f"🔑 API Key: {api_key_status}")

# Render every input control, then the status panel, in display order.
_config_controls = (
    provider_widget,
    model_widget,
    base_url_widget,
    prompt_template_widget,
    context_lines_widget,
    iac_tools_widget,
)
for _control in (*_config_controls, status_output):
    display(_control)

# Print the initial configuration once, before any observer is attached.
update_config()

# Re-run update_config whenever a control's value changes. The observers are
# registered after the initial call above to avoid duplicate status output.
for _control in _config_controls:
    _control.observe(update_config, names='value')

In [None]:
# Configuration

import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Add llm-postfilter modules to path.
# NOTE(review): assumes the working directory sits three levels below the
# project root - confirm when launching the notebook from elsewhere.
project_root = Path.cwd().parent.parent.parent
_src_dir = str(project_root / "src")
# Re-running this cell must not keep appending the same entry to sys.path.
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Import llm-postfilter pipeline components
from llm_postfilter import (
    GLITCHLLMFilter, 
    HybridEvaluator,
    SecuritySmellPrompts,
    SecuritySmell,
    Provider,
)

print(f"🏠 Project root: {project_root}")
print(f"📍 Working directory: {Path.cwd()}")

# Snapshot the widget-driven settings into plain variables for the cells below.
# These are copies: changing a widget later does not update them until this
# cell is run again.
provider, model = config['provider'], config['model']
base_url = config['base_url'] or None  # an empty string means "not set"
prompt_template, context_lines = config['prompt_template'], config['context_lines']
iac_tools = config['iac_tools']

# Hosted providers read their API key from the environment; any other
# provider leaves api_key as None.
_key_env_by_provider = {
    Provider.OPENAI.value: "OPENAI_API_KEY",
    Provider.ANTHROPIC.value: "ANTHROPIC_API_KEY",
}
api_key = os.getenv(_key_env_by_provider[provider]) if provider in _key_env_by_provider else None

if provider == Provider.OPENAI.value:
    print(f"🔑 Provider: OpenAI | Model: {model} | API key found: {bool(api_key)}")
elif provider == Provider.ANTHROPIC.value:
    print(f"🔑 Provider: Anthropic | Model: {model} | API key found: {bool(api_key)}")
elif provider == Provider.OLLAMA.value:
    print(f"🔑 Provider: Ollama | Model: {model} | Base URL: {base_url or 'http://localhost:11434'}")
else:
    print(f"❌ Unsupported provider: {provider}")

# Assemble the summary first, then emit it in one go; the base URL line only
# appears when a URL was configured.
_summary = [
    "\n📊 CURRENT EXPERIMENT CONFIGURATION:",
    f"  🤖 Provider: {provider}",
    f"  🎯 Model: {model}",
]
if base_url:
    _summary.append(f"  🌐 Base URL: {base_url}")
_summary.extend([
    f"  📋 Prompt Template: {prompt_template}",
    f"  🔎 Context Lines: ±{context_lines}",
    f"  🔧 IaC Tools: {', '.join(iac_tools) if iac_tools else 'None selected'}",
])
print("\n".join(_summary))

print("\n🚀 LLM Post-Filter Experiment Ready!")

## 📁 Step 1: Load Context-Enhanced Data

**Load detection files with code context prepared by 01_data_extraction.py**


In [None]:
# Locations of the experiment data and of the context-enhanced detections.
data_dir = project_root / "experiments/llm_postfilter/data"
context_dir = data_dir / "with_context"

# Every context-enhanced CSV that is available on disk.
all_context_files = list(context_dir.glob("*_with_context.csv"))

# Keep only the files whose name starts with one of the selected IaC tools.
# str.startswith accepts a tuple of prefixes; an empty tuple matches nothing.
_tool_prefixes = tuple(iac_tools)
context_enhanced_files = [
    candidate for candidate in all_context_files
    if candidate.name.startswith(_tool_prefixes)
]

print(f"📁 Available files: {len(all_context_files)} | Selected tools: {', '.join(iac_tools) if iac_tools else 'None'}")

if not context_enhanced_files:
    # Nothing to process: say whether the tool selection or the data is missing.
    if not iac_tools:
        print("❌ No IaC tools selected!")
        print("➡️  Select Chef and/or Puppet in the configuration above")
    else:
        print("❌ No context-enhanced files found for selected tools!")
        print("➡️  Run 01_data_extraction.py first to prepare the data")
else:
    print(f"📁 Processing {len(context_enhanced_files)} context-enhanced files:")
    for file in context_enhanced_files:
        df = pd.read_csv(file)
        # Rows not flagged as true positives are counted as false positives.
        tp_count = df['is_true_positive'].sum()
        fp_count = len(df) - tp_count
        context_success = df['context_success'].sum()
        print(f"  📄 {file.name}: {len(df)} detections ({tp_count} TP, {fp_count} FP, {context_success} with context)")

    print(f"\n✅ Context-enhanced data ready for LLM analysis")

## 📝 Step 2: Review LLM Prompt Design

Review the formal security smell definitions used for LLM evaluation.


In [None]:
# Preview of the prompt that the currently selected template produces.
print("🔍 PROMPT TEMPLATE PREVIEW")
print("=" * 50)

# Illustrative snippet only; it is not taken from the experiment data.
sample_code = """
# File: database_config.rb
    15: database_password = "MySecretPassword123"
>>> 16: connection_string = "mongodb://admin:#{database_password}@localhost:27017/mydb"
    17: Chef::Log.info("Connecting to database")
"""

current_template = config['prompt_template']
smell = SecuritySmell.HARD_CODED_SECRET

print(f"\n📝 CURRENT TEMPLATE: {current_template}")
print("-" * 40)

prompt = SecuritySmellPrompts.create_prompt(smell, sample_code, current_template)

# Only the first eight lines are printed; the total length is reported below.
for line in prompt.split('\n')[:8]:
    print(f"  {line}")

print(f"  ... ({len(prompt)} total characters)")

# List every template the prompt library reports, marking the selected one.
print(f"\n📋 Available Templates:")
for template in SecuritySmellPrompts.get_available_templates():
    if template == current_template:
        marker = "🟢 SELECTED"
    else:
        marker = "🔵 Available"
    print(f"  {marker} - {template}")

## 🔧 Step 3: Initialize LLM Pipeline

Setup GLITCH+LLM hybrid detection pipeline with the selected provider/model.


In [None]:
# Initialize the LLM filter pipeline.
# Hosted providers (OpenAI, Anthropic) need an API key; Ollama does not.
if (provider == Provider.OLLAMA.value) or api_key:
    print("🔧 Initializing GLITCH+LLM pipeline...")
    
    llm_filter = GLITCHLLMFilter(
        project_root=project_root,
        api_key=api_key,
        model=model,
        provider=provider,
        base_url=base_url,
        context_lines=context_lines,
        prompt_template=prompt_template,
    )
    evaluator = HybridEvaluator(project_root)
    
    # Setup directories. parents=True so a missing data directory does not
    # abort initialization with FileNotFoundError.
    results_dir = data_dir / "llm_results"
    results_dir.mkdir(parents=True, exist_ok=True)
    
    print("✅ Pipeline ready:")
    print(f"  🤖 Provider: {provider}")
    print(f"  🎯 Model: {llm_filter.llm_client.model}")
    print(f"  🔎 Context lines: ±{llm_filter.context_lines}")
    print(f"  📋 Prompt template: {llm_filter.prompt_template}")
    print(f"  📊 Results → {results_dir}")
    
    # Configuration validation: compare the values handed to the pipeline
    # with the current widget state. They differ when a widget was changed
    # after the Configuration cell ran, in which case the pipeline is built
    # from stale values and the user must be told.
    print(f"\n🔍 Pipeline Configuration Validation:")
    _used_settings = (
        ('provider', provider),
        ('model', model),
        ('prompt_template', prompt_template),
        ('context_lines', context_lines),
    )
    for _setting, _used in _used_settings:
        if _used == config[_setting]:
            print(f"  ✅ Using interactive config - {_setting}: {config[_setting]}")
        else:
            print(
                f"  ⚠️ Stale value - {_setting}: pipeline uses {_used!r}, "
                f"widgets now say {config[_setting]!r} (re-run the Configuration cell)"
            )
    
else:
    print("❌ Pipeline initialization failed - missing credentials")
    print("💡 Solutions:")
    print("  • Set API key environment variable:")
    if provider == Provider.OPENAI.value:
        print("    export OPENAI_API_KEY='your-key-here'")
    elif provider == Provider.ANTHROPIC.value:
        print("    export ANTHROPIC_API_KEY='your-key-here'")
    print("  • Or switch to Ollama (no API key required)")
    print("  • Update configuration using the widgets above ⬆️")

## 🚀 Step 4: Run LLM Post-Filtering

Apply LLM post-filtering to GLITCH detections and measure improvements.


In [None]:
def _share(part, whole):
    """Format part/whole as a percentage string, or 'n/a' when whole is zero."""
    return f"{part / whole:.1%}" if whole else "n/a"


# Hosted providers need an API key; Ollama does not.
_has_credentials = (provider == Provider.OLLAMA.value) or bool(api_key)
# "Step 1 was not run" and "Step 1 found no files" both mean there is
# nothing to filter, so an empty list is treated like a missing variable.
_pending_files = locals().get('context_enhanced_files') or []

if not _has_credentials:
    print("❌ Skipping - API key required for selected provider")
elif not _pending_files:
    print("❌ Skipping - no context files (run context extraction first)")
elif 'llm_filter' not in locals() or 'results_dir' not in locals():
    # Both names are created by the pipeline initialization cell.
    print("❌ Skipping - pipeline not initialized (run Step 3 first)")
else:
    print(f"🔍 Processing {len(context_enhanced_files)} context-enhanced files:")
    for file in context_enhanced_files:
        df = pd.read_csv(file)
        # Rows not flagged as true positives are counted as false positives.
        tp_count = df['is_true_positive'].sum()
        fp_count = len(df) - tp_count
        print(f"  📁 {file.name}: {len(df)} detections ({tp_count} TP, {fp_count} FP)")
    
    print(f"\n🚀 Starting LLM post-filtering...")
    
    # Filtered DataFrames keyed by the input file's stem; read by the
    # evaluation cell.
    filtered_results = {}
    _failed_files = []
    
    for i, context_file in enumerate(context_enhanced_files, start=1):
        print(f"\n🔄 Processing {i}/{len(context_enhanced_files)}: {context_file.name}")
        
        # Best effort per file: one failing file must not abort the rest.
        try:
            # Run LLM filtering
            filtered_df = llm_filter.filter_detections(context_file, results_dir)
            filtered_results[context_file.stem] = filtered_df
            
            # Summary stats
            total = len(filtered_df)
            kept = filtered_df['keep_detection'].sum()
            original_tp = filtered_df['is_true_positive'].sum()
            kept_tp = filtered_df[filtered_df['keep_detection']]['is_true_positive'].sum()
            
            # _share() guards the ratios: a file with no detections or no
            # true positives would otherwise divide by zero.
            print(
                f"✅ Kept {kept}/{total} ({_share(kept, total)}) | "
                f"TP retention: {kept_tp}/{original_tp} ({_share(kept_tp, original_tp)})"
            )
            
        except Exception as e:
            _failed_files.append(context_file.name)
            print(f"❌ Error: {e}")
            # logger.exception records the traceback alongside the message.
            logger.exception("Failed to process %s", context_file)
    
    print(f"\n🎉 LLM filtering completed! Results → {results_dir}")
    if _failed_files:
        print(f"⚠️ {len(_failed_files)} file(s) failed: {', '.join(_failed_files)}")

## 📈 Step 5: Evaluate Performance Improvement

Calculate precision, recall, and F1 improvements from LLM post-filtering.


In [None]:
# Evaluation needs at least one filtered result from the filtering cell.
if not ('filtered_results' in locals() and filtered_results):
    print("⏭️ Skipping evaluation - run LLM filtering first")
else:
    print("📊 Evaluating GLITCH vs GLITCH+LLM Performance")
    print("=" * 50)
    
    # Group the filtered DataFrames by selected IaC tool (matched on the key
    # prefix) and evaluate each tool that has at least one DataFrame.
    evaluation_results = {}
    for tool in iac_tools:
        _tool_frames = [
            frame for name, frame in filtered_results.items()
            if name.startswith(tool)
        ]
        if _tool_frames:
            evaluation_results[tool] = evaluator.evaluate_iac_tool(_tool_frames, tool.title())
    
    # The evaluation directory is created even when nothing was evaluated.
    evaluation_dir = results_dir / "evaluation"
    evaluation_dir.mkdir(exist_ok=True)
    
    if not evaluation_results:
        print("❌ No evaluation results available")
    else:
        summary_df = evaluator.save_evaluation_results(evaluation_results, evaluation_dir)
        
        print("\n🎯 EXPERIMENT RESULTS")
        print("=" * 40)
        
        # One entry per row of the summary table.
        for _, row in summary_df.iterrows():
            tool, smell = row['IaC_Tool'], row['Security_Smell']
            baseline_precision, llm_precision = row['Baseline_Precision'], row['LLM_Precision']
            precision_improvement = row['Precision_Improvement']
            fp_reduction, tp_retention = row['FP_Reduction'], row['TP_Retention']
            
            print(f"\n📌 {tool} - {smell}:")
            print(f"  Precision: {baseline_precision:.3f} → {llm_precision:.3f} ({precision_improvement:+.1%})")
            print(f"  FP↓: {fp_reduction:.1%} | TP retained: {tp_retention:.1%}")
        
        # Unweighted means over all rows of the summary table.
        avg_precision_improvement = summary_df['Precision_Improvement'].mean()
        avg_fp_reduction = summary_df['FP_Reduction'].mean()
        avg_tp_retention = summary_df['TP_Retention'].mean()
        
        print(f"\n🚀 OVERALL OUTCOMES:")
        print(f"  📈 Precision improvement: {avg_precision_improvement:+.1%}")
        print(f"  📉 FP reduction: {avg_fp_reduction:.1%}")
        print(f"  🎯 TP retention: {avg_tp_retention:.1%}")
        
        print(f"\n💾 Detailed results → {evaluation_dir}")

## 📁 Generated Files & Transparency

Complete experimental transparency through intermediate files.


In [None]:
# Inventory of the intermediate files produced by the earlier cells.
# File sizes are reported in whole kilobytes (rounded down).
print("📁 Generated Files & Transparency")
print("=" * 40)

print("\n🔍 Context Files (LLM input):")
if 'context_dir' not in locals():
    print("  ❌ No context files generated")
else:
    context_files = list(context_dir.glob("*.csv"))
    for file in context_files:
        size_kb = file.stat().st_size // 1024
        print(f"  📄 {file.name} ({size_kb} KB)")
    print(f"  📁 {context_dir}")

print("\n🤖 LLM Results:")
if not ('results_dir' in locals() and results_dir.exists()):
    print("  ❌ No LLM results generated")
else:
    # CSV files first, then JSON files.
    result_files = [*results_dir.glob("*.csv"), *results_dir.glob("*.json")]
    for file in result_files:
        size_kb = file.stat().st_size // 1024
        # Known file-name suffixes get a short description of their content.
        if file.name.endswith("_prompts_and_responses.json"):
            print(f"  📝 {file.name} ({size_kb} KB) - Full prompts & LLM responses")
        elif file.name.endswith("_llm_filtered.csv"):
            print(f"  📊 {file.name} ({size_kb} KB) - Filtered detections")
        else:
            print(f"  📄 {file.name} ({size_kb} KB)")
    print(f"  📁 {results_dir}")

print("\n📊 Evaluation:")
if not ('evaluation_dir' in locals() and evaluation_dir.exists()):
    print("  ❌ No evaluation results generated")
else:
    eval_files = [*evaluation_dir.glob("*.csv"), *evaluation_dir.glob("*.json")]
    for file in eval_files:
        size_kb = file.stat().st_size // 1024
        print(f"  📄 {file.name} ({size_kb} KB)")
    print(f"  📁 {evaluation_dir}")

print("\n💡 Full transparency: code snippets, prompts, LLM decisions, metrics")