# 🤖 LLM Post-Filter Experiment: GLITCH+LLM Pipeline

**Focus**: Evaluate **GLITCH + LLM** hybrid approach vs **GLITCH-only** baseline.

## 🔬 Experiment Pipeline:

1. **Data Preparation**: GLITCH detections + context extracted *(01_data_extraction.py)*
2. **LLM Filtering**: Apply the selected LLM post-filtering  
3. **Performance Evaluation**: Calculate precision/recall improvements

## 🎯 Expected Outcomes:
- **Precision**: 50-300% improvement
- **Recall**: >90% retention  
- **FP Reduction**: Significant decrease in false alarms


## 🔧 Setup and Configuration


In [None]:
import os

# Manual provider/model override (optional).
# Edit the three values below to force a specific provider/model for this session.
# Supported providers: "openai", "anthropic", "ollama", "openai_compatible"
provider = "openai"
model = "gpt-4o"  # examples: "gpt-4o", "claude-3-5-sonnet-latest", "codellama:7b"
base_url = None  # for ollama or openai-compatible, e.g. "http://localhost:11434" or "https://api.x.ai/v1"

# Publish the selection through environment variables.
os.environ.update({"LLM_PROVIDER": provider, "LLM_MODEL": model})
if not base_url:
    # No endpoint override requested: drop any LLM_BASE_URL already present.
    os.environ.pop("LLM_BASE_URL", None)
else:
    os.environ["LLM_BASE_URL"] = str(base_url)

print(f"‚úÖ Using provider={os.getenv('LLM_PROVIDER')} | model={os.getenv('LLM_MODEL')} | base_url={os.getenv('LLM_BASE_URL')}")


In [None]:
# Context window configuration: how many lines around the target line are
# included; use 0 for target-only.
CONTEXT_LINES: int = 3  # change to 0, 1, 2, ... as needed
print(f"Context lines: ¬±{CONTEXT_LINES}")

In [None]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Add llm-postfilter modules to path. The project root is taken to be three
# directory levels above the current working directory.
project_root = Path.cwd().parent.parent.parent
src_path = str(project_root / "src")
if src_path not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(src_path)

# Import llm-postfilter pipeline components
from llm_postfilter import (
    GLITCHLLMFilter,
    HybridEvaluator,
    SecuritySmellPrompts,
    SecuritySmell,
    Provider,
)

print(f"üè† Project root: {project_root}")
print(f"üìç Working directory: {Path.cwd()}")

# Provider/model selection via env vars. setdefault() only fills in a value
# when the variable is not already set, so a provider/model chosen earlier
# (for example through LLM_PROVIDER / LLM_MODEL set before this cell runs) is
# respected instead of being overwritten with the OpenAI defaults.
provider = os.environ.setdefault("LLM_PROVIDER", Provider.OPENAI.value)
model = os.environ.setdefault("LLM_MODEL", "gpt-4o")
base_url = os.getenv("LLM_BASE_URL")  # for ollama or openai-compatible

# API keys per provider. Ollama is the only provider handled without a key;
# api_key stays None for it and for unsupported providers.
api_key = None
if provider == Provider.OPENAI.value:
    api_key = os.getenv("OPENAI_API_KEY")
    print(f"üîë Provider: OpenAI | Model: {model} | API key found: {bool(api_key)}")
elif provider == Provider.ANTHROPIC.value:
    api_key = os.getenv("ANTHROPIC_API_KEY")
    print(f"üîë Provider: Anthropic | Model: {model} | API key found: {bool(api_key)}")
elif provider == Provider.OLLAMA.value:
    print(f"üîë Provider: Ollama | Model: {model} | Base URL: {base_url or 'http://localhost:11434'}")
elif provider == Provider.OPENAI_COMPATIBLE.value:
    api_key = os.getenv("OPENAI_COMPATIBLE_API_KEY")
    print(f"üîë Provider: OpenAI-compatible | Model: {model} | Base URL: {base_url}")
else:
    print(f"‚ùå Unsupported provider: {provider}")

print("üöÄ LLM Post-Filter Experiment Ready!")

## 📁 Step 1: Load Context-Enhanced Data

**Load detection files with code context prepared by 01_data_extraction.py**


In [None]:
# Locate the experiment data folders under the project root.
data_dir = project_root / "experiments/llm_postfilter/data"
context_dir = data_dir / "with_context"

# Collect every CSV in the context folder whose name ends in "_with_context.csv".
context_enhanced_files = list(context_dir.glob("*_with_context.csv"))

if not context_enhanced_files:
    print("‚ùå No context-enhanced files found!")
    print("‚û°Ô∏è  Run 01_data_extraction.py first to prepare the data")
else:
    print(f"üìÅ Found {len(context_enhanced_files)} context-enhanced files:")
    for csv_path in context_enhanced_files:
        detections = pd.read_csv(csv_path)
        n_detections = len(detections)
        # Everything not flagged as a true positive is counted as a false positive.
        n_tp = detections['is_true_positive'].sum()
        n_fp = n_detections - n_tp
        n_with_context = detections['context_success'].sum()
        print(f"  üìÑ {csv_path.name}: {n_detections} detections ({n_tp} TP, {n_fp} FP, {n_with_context} with context)")

    print(f"\n‚úÖ Context-enhanced data ready for LLM analysis")

## 📝 Step 2: Review LLM Prompt Design

Review the formal security smell definitions used for LLM evaluation.


In [None]:
# Print a short preview (first three lines plus a word count) of the formal
# definition registered for every security smell.
print("üìù Security Smell Definitions for LLM")
print("=" * 40)

for smell in SecuritySmell:
    definition = SecuritySmellPrompts.DEFINITIONS[smell]
    preview = definition.strip().split('\n')[:3]
    word_count = len(definition.split())

    print(f"\nüìå {smell.value}")
    # split() always yields at least one element, so the preview is never empty.
    print(*(f"  {text}" for text in preview), sep="\n")
    print(f"  ... ({word_count} words total)")

print(f"\n‚úÖ {len(SecuritySmell)} smell categories with formal definitions ready")

## 🔧 Step 3: Initialize LLM Pipeline

Setup GLITCH+LLM hybrid detection pipeline with the selected provider/model.


In [None]:
# Build the GLITCH+LLM filter and the evaluator.
# OpenAI / Anthropic / OpenAI-compatible need an API key; Ollama does not.
if provider != Provider.OLLAMA.value and not api_key:
    print("‚ùå Pipeline initialization failed - missing credentials")
    print("Set appropriate API key env var (OPENAI_API_KEY / ANTHROPIC_API_KEY / OPENAI_COMPATIBLE_API_KEY) or use Ollama")
else:
    print("üîß Initializing GLITCH+LLM pipeline...")

    llm_filter = GLITCHLLMFilter(
        project_root=project_root,
        api_key=api_key,
        model=model,
        provider=provider,
        base_url=base_url,
        context_lines=CONTEXT_LINES,
    )
    evaluator = HybridEvaluator(project_root)

    # Output folder for the LLM results, created on first use.
    results_dir = data_dir / "llm_results"
    results_dir.mkdir(exist_ok=True)

    print("‚úÖ Pipeline ready:")
    print(f"  ü§ñ Provider: {provider}")
    print(f"  ü§ñ Model: {llm_filter.llm_client.model}")
    print(f"  üß© Context lines: ¬±{llm_filter.context_lines}")
    print(f"  üìä Results ‚Üí {results_dir}")


## 🚀 Step 4: Run LLM Post-Filtering

Apply LLM post-filtering to GLITCH detections and measure improvements.


In [None]:
def _ratio_text(part, whole):
    """Render ``part/whole`` followed by its percentage.

    A zero ``whole`` (an empty file, or a file without any true positive)
    yields "n/a" instead of raising ZeroDivisionError.
    """
    if not whole:
        return f"{part}/{whole} (n/a)"
    return f"{part}/{whole} ({part/whole:.1%})"


# Ollama is the only provider that runs without an API key.
credentials_ok = (provider == Provider.OLLAMA.value) or bool(api_key)
# .get() covers both "file list never built" and "file list is empty", so the
# "no context files" message below is reachable in either case.
files_to_process = locals().get('context_enhanced_files') or []

if credentials_ok and files_to_process:
    print(f"üîç Processing {len(files_to_process)} context-enhanced files:")
    for file in files_to_process:
        df = pd.read_csv(file)
        tp_count = df['is_true_positive'].sum()
        fp_count = len(df) - tp_count
        print(f"  üìÅ {file.name}: {len(df)} detections ({tp_count} TP, {fp_count} FP)")

    print(f"\nüöÄ Starting LLM post-filtering...")

    # Filtered DataFrames keyed by the stem of their input file.
    filtered_results = {}

    for i, context_file in enumerate(files_to_process):
        print(f"\nüîÑ Processing {i+1}/{len(files_to_process)}: {context_file.name}")

        try:
            # Run LLM filtering
            filtered_df = llm_filter.filter_detections(context_file, results_dir)
            filtered_results[context_file.stem] = filtered_df

            # Summary stats
            total = len(filtered_df)
            kept = filtered_df['keep_detection'].sum()
            original_tp = filtered_df['is_true_positive'].sum()
            kept_tp = filtered_df[filtered_df['keep_detection']]['is_true_positive'].sum()

            print(f"‚úÖ Kept {_ratio_text(kept, total)} | TP retention: {_ratio_text(kept_tp, original_tp)}")

        except Exception as e:
            # Best effort: one failing file must not stop the remaining ones.
            print(f"‚ùå Error: {e}")
            # logger.exception records the traceback along with the message.
            logger.exception("Failed to process %s", context_file)

    print(f"\nüéâ LLM filtering completed! Results ‚Üí {results_dir}")

elif not credentials_ok:
    print("‚ùå Skipping - API key required for selected provider")
else:
    print("‚ùå Skipping - no context files (run context extraction first)")

## 📈 Step 5: Evaluate Performance Improvement

Calculate precision, recall, and F1 improvements from LLM post-filtering.


In [None]:
# Evaluation uses the same credential rule as the earlier steps: Ollama runs
# without an API key, so requiring api_key alone would skip the evaluation of
# a successful Ollama run.
if ((provider == Provider.OLLAMA.value) or api_key) and 'filtered_results' in locals():
    print("üìä Evaluating GLITCH vs GLITCH+LLM Performance")
    print("=" * 50)

    # Organize results by IaC tool: a result belongs to a tool when its key
    # starts with the tool name.
    evaluation_results = {}

    for tool in ['chef', 'puppet']:
        tool_filtered_dfs = [
            filtered_df
            for key, filtered_df in filtered_results.items()
            if key.startswith(tool)
        ]

        if tool_filtered_dfs:
            tool_results = evaluator.evaluate_iac_tool(tool_filtered_dfs, tool.title())
            evaluation_results[tool] = tool_results

    # Save evaluation results
    evaluation_dir = results_dir / "evaluation"
    evaluation_dir.mkdir(exist_ok=True)

    if evaluation_results:
        summary_df = evaluator.save_evaluation_results(evaluation_results, evaluation_dir)

        print("\nüéØ EXPERIMENT RESULTS")
        print("=" * 40)

        # Display key findings, one entry per row of the summary table.
        for _, row in summary_df.iterrows():
            tool = row['IaC_Tool']
            smell = row['Security_Smell']
            baseline_precision = row['Baseline_Precision']
            llm_precision = row['LLM_Precision']
            precision_improvement = row['Precision_Improvement']
            fp_reduction = row['FP_Reduction']
            tp_retention = row['TP_Retention']

            print(f"\nüìå {tool} - {smell}:")
            print(f"  Precision: {baseline_precision:.3f} ‚Üí {llm_precision:.3f} ({precision_improvement:+.1%})")
            print(f"  FP‚Üì: {fp_reduction:.1%} | TP retained: {tp_retention:.1%}")

        # Overall improvements: unweighted means over the summary rows.
        avg_precision_improvement = summary_df['Precision_Improvement'].mean()
        avg_fp_reduction = summary_df['FP_Reduction'].mean()
        avg_tp_retention = summary_df['TP_Retention'].mean()

        print(f"\nüöÄ OVERALL OUTCOMES:")
        print(f"  üìà Precision improvement: {avg_precision_improvement:+.1%}")
        print(f"  üìâ FP reduction: {avg_fp_reduction:.1%}")
        print(f"  üéØ TP retention: {avg_tp_retention:.1%}")

        print(f"\nüíæ Detailed results ‚Üí {evaluation_dir}")

    else:
        print("‚ùå No evaluation results available")

else:
    print("‚è≠Ô∏è Skipping evaluation - run LLM filtering first")

## 📁 Generated Files & Transparency

Complete experimental transparency through intermediate files.


In [None]:
print("üìÅ Generated Files & Transparency")
print("=" * 40)

print("\nüîç Context Files (LLM input):")
if 'context_dir' in locals():
    context_files = list(context_dir.glob("*.csv"))
    for file in context_files:
        size_kb = file.stat().st_size // 1024
        print(f"  üìÑ {file.name} ({size_kb} KB)")
    print(f"  üìÅ {context_dir}")
else:
    print("  ‚ùå No context files generated")

print("\nü§ñ LLM Results:")
if 'results_dir' in locals() and results_dir.exists():
    result_files = list(results_dir.glob("*.csv")) + list(results_dir.glob("*.json"))
    for file in result_files:
        size_kb = file.stat().st_size // 1024
        if file.name.endswith("_prompts_and_responses.json"):
            print(f"  üìù {file.name} ({size_kb} KB) - Full prompts & LLM responses")
        elif file.name.endswith("_llm_filtered.csv"):
            print(f"  üìä {file.name} ({size_kb} KB) - Filtered detections")
        else:
            print(f"  üìÑ {file.name} ({size_kb} KB)")
    print(f"  üìÅ {results_dir}")
else:
    print("  ‚ùå No LLM results generated")

print("\nüìä Evaluation:")
if 'evaluation_dir' in locals() and evaluation_dir.exists():
    eval_files = list(evaluation_dir.glob("*.csv")) + list(evaluation_dir.glob("*.json"))
    for file in eval_files:
        size_kb = file.stat().st_size // 1024
        print(f"  üìÑ {file.name} ({size_kb} KB)")
    print(f"  üìÅ {evaluation_dir}")
else:
    print("  ‚ùå No evaluation results generated")

print("\nüí° Full transparency: code snippets, prompts, LLM decisions, metrics")