# FTE-HARM BASIC: Single Hypothesis Validation

## Overview

This notebook implements a **simplified FTE-HARM validation** with:
- **ONE hypothesis** (first label discovered from dataset)
- **ONE P_Score method** (Option A: Binary Presence)
- **ONE validation approach** (Binary: TP/FP/TN/FN)

**Purpose:** Understand core FTE-HARM mechanics before scaling to full implementation.

---

## Dataset Structure

The datasets have varying naming conventions:

| Path | Log File | Label File |
|------|----------|------------|
| grp1/rm/ | log_auth.log | label_auth.log |
| grp1/santos_paired/santos/openvpn/ | openvpn.log | openvpn_labels.log |
| grp1/santos_paired/santos_minimal/vpn_logs_openvpn/ | log.log | label.log |
| grp1/santos/vpn_logs_openvpn/ | log.log | label.log |

**Key Insight:** Log/Label file naming varies by dataset.

## Cell 1: Imports and Setup

In [None]:
# =============================================================================
# FTE-HARM BASIC: IMPORTS AND SETUP
# =============================================================================

import os
import re
import json
import numpy as np
from datetime import datetime
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Mount Google Drive so the dataset, model checkpoint and output folders
# configured below (all under /content/drive) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# -----------------------------------------------------------------------------
# PATH CONFIGURATION
# -----------------------------------------------------------------------------

# Root folder that scan_all_datasets() walks to find log/label file pairs.
DATASET_BASE_PATH = '/content/drive/My Drive/thesis/dataset'
# Default folder where save_basic_results() writes its report files.
OUTPUT_PATH = '/content/drive/My Drive/thesis/hypotheses_validation'

# Create output directory (no error if it already exists)
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Transformer model paths: short model key -> checkpoint directory.
# NOTE(review): the 'distilbert' entry points at a folder named
# 'distilberta_base_uncased' - confirm that spelling matches the Drive folder.
MODELS = {
    'distilbert': '/content/drive/My Drive/thesis/transformer/distilberta_base_uncased/results/checkpoint-5245',
    'distilroberta': '/content/drive/My Drive/thesis/transformer/distilroberta_base/results/checkpoint-5275',
    'roberta_large': '/content/drive/My Drive/thesis/transformer/roberta_large/results/checkpoint-2772',
    'xlm_roberta_base': '/content/drive/My Drive/thesis/transformer/xlm_roberta_base/results/checkpoint-12216',
    'xlm_roberta_large': '/content/drive/My Drive/thesis/transformer/xlm_roberta_large/results/checkpoint-12240',
}

# Key into MODELS; selects the checkpoint loaded in Cell 6 and is written
# into the saved report as the model name.
SELECTED_MODEL = 'xlm_roberta_large'

# Status output. These messages are printed unconditionally once the
# statements above have run without raising.
print("Libraries imported")
print("Google Drive mounted")
print(f"Dataset base path: {DATASET_BASE_PATH}")
print(f"Output path: {OUTPUT_PATH}")

## Cell 2: Dataset Discovery (Flexible File Naming)

In [None]:
# =============================================================================
# DATASET DISCOVERY - HANDLES VARIABLE FILE NAMING
# =============================================================================

def find_log_label_pair(folder_path):
    """
    Find log and label files in a folder, handling variable naming conventions.

    The strategies are tried in order and each one falls through to the next
    when it does not produce a complete pair:

    1. log.log / label.log
    2. log_<suffix>.log / label_<suffix>.log   (e.g. log_auth.log / label_auth.log)
    3. <name>.log / <name>_labels.log          (e.g. openvpn.log / openvpn_labels.log)

    File names are examined in sorted order so the selected pair does not
    depend on the order in which os.listdir() happens to return entries.

    Args:
        folder_path (str): Path to dataset folder

    Returns:
        tuple: (log_file_path, label_file_path) or (None, None) if not found
    """
    if not os.path.isdir(folder_path):
        return None, None

    log_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.log'))
    available = set(log_files)

    pair = None

    # Strategy 1: Look for log.log / label.log
    if 'log.log' in available and 'label.log' in available:
        pair = ('log.log', 'label.log')

    # Strategy 2: Look for log_*.log / label_*.log pattern
    if pair is None:
        for f in log_files:
            if not f.startswith('log_'):
                continue
            # e.g. "label_auth.log" for "log_auth.log"
            potential_label = f'label_{f[4:]}'
            if potential_label in available:
                pair = (f, potential_label)
                break

    # Strategy 3: Look for <name>.log / <name>_labels.log pattern.
    # Tried even when a stray log_* file exists without a matching label,
    # so an unrelated file cannot hide a valid pair.
    if pair is None:
        for f in log_files:
            if f.endswith('_labels.log'):
                continue
            # f[:-4] removes the ".log" extension
            potential_label = f'{f[:-4]}_labels.log'
            if potential_label in available:
                pair = (f, potential_label)
                break

    if pair is None:
        return None, None

    log_file, label_file = pair
    return (
        os.path.join(folder_path, log_file),
        os.path.join(folder_path, label_file)
    )


def scan_all_datasets(base_path):
    """
    Recursively scan for all valid dataset pairs.

    Every directory under ``base_path`` (including ``base_path`` itself) is
    checked with find_log_label_pair(). Sub-directories are visited in sorted
    name order so the returned list - and therefore "the first dataset" used
    later in the notebook - is the same on every run and filesystem.

    Args:
        base_path (str): Base dataset path

    Returns:
        list: One dict per dataset with keys 'name' (path relative to
            base_path), 'folder', 'log_path', 'label_path', 'log_file' and
            'label_file'. Empty if base_path does not exist or holds no pairs.
    """
    datasets = []

    for root, dirs, _files in os.walk(base_path):
        # In-place sort: with the default top-down walk, os.walk descends
        # into sub-directories in the order left in this list.
        dirs.sort()

        log_path, label_path = find_log_label_pair(root)
        if not (log_path and label_path):
            continue

        datasets.append({
            'name': os.path.relpath(root, base_path),
            'folder': root,
            'log_path': log_path,
            'label_path': label_path,
            'log_file': os.path.basename(log_path),
            'label_file': os.path.basename(label_path)
        })

    return datasets


# Scan for datasets under DATASET_BASE_PATH. The resulting list is reused by
# the label-discovery cell and by the validation cell (which takes element 0).
print("Scanning for datasets...")
all_datasets = scan_all_datasets(DATASET_BASE_PATH)

# Summarise what was found, numbering the datasets from 1 for readability.
print(f"\nFound {len(all_datasets)} dataset pairs:\n")
for i, ds in enumerate(all_datasets, 1):
    print(f"  {i}. {ds['name']}")
    print(f"     Log: {ds['log_file']} | Label: {ds['label_file']}")

## Cell 3: Label Discovery (Find First Label)

In [None]:
# =============================================================================
# LABEL DISCOVERY - FIND FIRST LABEL FOR BASIC HYPOTHESIS
# =============================================================================

def discover_labels(label_path):
    """
    Discover all unique labels in a label.log file.

    Format: JSON lines with {"line": N, "labels": [...], "rules": {...}}

    Lines that are blank, are not valid JSON, or decode to something other
    than a JSON object are skipped. A "labels" value given as a single string
    is treated as a one-element list; any other non-list value is ignored.

    Args:
        label_path (str): Path to label file

    Returns:
        dict or None: None (after printing a warning) if the file does not
        exist, otherwise {
            'all_labels': set of unique labels,
            'label_counts': {label: count},
            'sorted_labels': [(label, count), ...] by descending count,
            'first_label': first label encountered (None if there are none),
            'most_common_label': label with the highest count (or None),
            'total_entries': number of entries with at least one label
        }
    """
    if not os.path.exists(label_path):
        print(f"WARNING: Label file not found: {label_path}")
        return None

    label_counts = defaultdict(int)
    first_label = None
    total_entries = 0

    with open(label_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue

            # json.loads also accepts bare lists, numbers, strings and null;
            # only objects can carry a "labels" field.
            if not isinstance(entry, dict):
                continue

            labels = entry.get('labels', [])
            if isinstance(labels, str):
                # Avoid iterating a lone string character by character.
                labels = [labels]
            elif not isinstance(labels, (list, tuple)):
                continue

            if not labels:
                continue

            total_entries += 1
            for label in labels:
                # Track first label encountered
                if first_label is None:
                    first_label = label
                label_counts[label] += 1

    # Sort by count (stable, so ties keep first-seen order)
    sorted_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)

    return {
        'all_labels': set(label_counts),
        'label_counts': dict(label_counts),
        'sorted_labels': sorted_labels,
        'first_label': first_label,
        'most_common_label': sorted_labels[0][0] if sorted_labels else None,
        'total_entries': total_entries
    }


def discover_labels_all_datasets(datasets):
    """
    Run discover_labels() on every dataset and merge the results.

    Prints a per-dataset summary followed by the combined label frequencies.

    Args:
        datasets (list): Dataset info dicts; each needs 'name' and 'label_path'

    Returns:
        dict: 'all_labels' (set), 'label_counts' (dict), 'sorted_labels'
            (list of (label, count) by descending count), 'first_label'
            (first truthy first_label among the datasets, in list order) and
            'most_common_label' (or None when no labels were found)
    """
    banner = "=" * 80
    print("\n" + banner)
    print("LABEL DISCOVERY")
    print(banner)

    seen_labels = set()
    totals = defaultdict(int)
    earliest_label = None

    for info in datasets:
        found = discover_labels(info['label_path'])
        if not found:
            # Missing label file (or nothing returned): nothing to merge.
            continue

        unique = found['all_labels']
        preview = list(unique)[:5]
        more_marker = '...' if len(unique) > 5 else ''

        print(f"\n{info['name']}:")
        print(f"  Labeled lines: {found['total_entries']}")
        print(f"  Unique labels: {len(unique)}")
        print(f"  Labels: {preview}{more_marker}")

        seen_labels.update(unique)
        for name, occurrences in found['label_counts'].items():
            totals[name] += occurrences

        if earliest_label is None and found['first_label']:
            earliest_label = found['first_label']

    # Highest count first
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    top_label = ranked[0][0] if ranked else None

    print("\n" + "-" * 40)
    print("COMBINED RESULTS:")
    print(f"  Total unique labels: {len(seen_labels)}")
    print(f"  First label found: {earliest_label}")
    print(f"  Most common label: {top_label}")
    print("\n  All labels by frequency:")
    for name, occurrences in ranked:
        print(f"    {name}: {occurrences}")

    return {
        'all_labels': seen_labels,
        'label_counts': dict(totals),
        'sorted_labels': ranked,
        'first_label': earliest_label,
        'most_common_label': top_label
    }


# Run label discovery over every dataset found by the scanner cell.
label_discovery = discover_labels_all_datasets(all_datasets)

# Store first label for hypothesis creation (Cell 7 builds the hypothesis
# name and target label from it). This is None when no labels were found.
FIRST_LABEL = label_discovery['first_label']
print(f"\nFirst label discovered: '{FIRST_LABEL}'")
print(f"  This will be used to create the basic hypothesis.")

## Cell 4: Ground Truth Loader (No Tokenization)

In [None]:
# =============================================================================
# GROUND TRUTH LOADER - NO TOKENIZATION
# =============================================================================

def _normalize_line_number(value):
    """Return ``value`` as an int line number, or None if it is not usable."""
    if isinstance(value, bool):
        # bool is an int subclass; a true/false "line" is not a line number.
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float) and value.is_integer():
        return int(value)
    if isinstance(value, str):
        try:
            return int(value.strip())
        except ValueError:
            return None
    return None


def load_ground_truth(label_path):
    """
    Load ground truth labels from label file.

    IMPORTANT: Ground truth is NOT tokenized - only used for validation.

    Format: JSON lines where each line is:
    {"line": N, "labels": ["label1", "label2"], "rules": {...}}

    Lines that are blank, are not valid JSON, do not decode to a JSON object,
    or have no usable "line" value are skipped. Line numbers are stored as
    ints (numeric strings are converted) so they match the integer line
    numbers used when looking entries up. If a line number occurs more than
    once, the last entry wins.

    Args:
        label_path (str): Path to label file

    Returns:
        dict: {line_number: {"labels": [...], "rules": {...}}}; empty (after
            printing a warning) if the file does not exist
    """
    ground_truth = {}

    if not os.path.exists(label_path):
        print(f"WARNING: Label file not found: {label_path}")
        return ground_truth

    with open(label_path, 'r', encoding='utf-8', errors='ignore') as f:
        for json_line in f:
            json_line = json_line.strip()
            if not json_line:
                continue

            try:
                entry = json.loads(json_line)
            except json.JSONDecodeError:
                continue

            # Valid JSON that is not an object has no .get() and no fields.
            if not isinstance(entry, dict):
                continue

            line_num = _normalize_line_number(entry.get('line'))
            if line_num is None:
                continue

            ground_truth[line_num] = {
                'labels': entry.get('labels', []),
                'rules': entry.get('rules', {})
            }

    return ground_truth


def get_label_for_line(line_number, ground_truth):
    """
    Look up the ground truth for one log line.

    A line counts as malicious when it has an entry in ``ground_truth``;
    any other line is reported as benign with empty labels and rules.

    Args:
        line_number (int): 1-indexed line number
        ground_truth (dict): Mapping produced by load_ground_truth()

    Returns:
        dict: {"is_malicious": bool, "labels": [...], "rules": {...}}
    """
    # Guard clause: unlabeled lines are benign.
    if line_number not in ground_truth:
        return {"is_malicious": False, "labels": [], "rules": {}}

    record = ground_truth[line_number]
    return {
        "is_malicious": True,
        "labels": record['labels'],
        "rules": record['rules']
    }


# Notebook progress marker: printed once the definitions in this cell have run.
print("Ground truth loader defined")

## Cell 5: Raw Log Loader (With Line Tracking)

In [None]:
# =============================================================================
# RAW LOG LOADER - WITH 1-INDEXED LINE TRACKING
# =============================================================================

def load_raw_logs(log_path):
    """
    Read a log file into (line_number, text) pairs.

    Line numbers are 1-indexed positions in the file. Each line is stripped
    of surrounding whitespace and lines that are empty after stripping are
    left out, so the numbers in the result may have gaps. Undecodable bytes
    are dropped while reading.

    Args:
        log_path (str): Path to log file

    Returns:
        list: [(line_number, log_text), ...]; empty (after printing an
            error) if the file does not exist
    """
    if not os.path.exists(log_path):
        print(f"ERROR: Log file not found: {log_path}")
        return []

    with open(log_path, 'r', encoding='utf-8', errors='ignore') as handle:
        # Number first, strip second: numbering must follow file position.
        numbered = (
            (position, raw.strip())
            for position, raw in enumerate(handle, start=1)
        )
        return [(position, content) for position, content in numbered if content]


def load_dataset(dataset_info):
    """
    Load complete dataset (logs + ground truth) and print a short summary.

    The malicious count is the number of *loaded* log lines that have a
    ground-truth entry, not the size of the label file. Label entries that
    point at blank or non-existent log lines are therefore not counted, which
    keeps ``malicious_lines <= total_lines`` and ``benign_lines >= 0``.

    Args:
        dataset_info (dict): Dataset info from scan_all_datasets()
            (uses 'name', 'log_path' and 'label_path')

    Returns:
        tuple: (logs, ground_truth, stats) where stats has the keys 'name',
            'total_lines', 'malicious_lines', 'benign_lines' and
            'malicious_pct' (0 when there are no log lines)
    """
    logs = load_raw_logs(dataset_info['log_path'])
    ground_truth = load_ground_truth(dataset_info['label_path'])

    total_lines = len(logs)
    malicious_lines = sum(
        1 for line_number, _text in logs if line_number in ground_truth
    )

    stats = {
        'name': dataset_info['name'],
        'total_lines': total_lines,
        'malicious_lines': malicious_lines,
        'benign_lines': total_lines - malicious_lines,
        'malicious_pct': (malicious_lines / total_lines * 100) if total_lines > 0 else 0
    }

    print(f"\nLoaded: {dataset_info['name']}")
    print(f"  Total lines: {total_lines}")
    print(f"  Malicious: {malicious_lines} ({stats['malicious_pct']:.2f}%)")
    print(f"  Benign: {stats['benign_lines']}")

    return logs, ground_truth, stats


# Notebook progress marker: printed once the definitions in this cell have run.
print("Raw log loader defined")

## Cell 6: Model Loading and Entity Extraction

In [None]:
# =============================================================================
# MODEL LOADING AND ENTITY EXTRACTION (PHYSICAL TOKEN QUANTIZATION)
# =============================================================================

# Token-classification tag names in B-/I- prefix form, plus 'O'.
# NOTE(review): this list is not referenced by any code visible in this
# notebook; presumably it mirrors the tag set the checkpoints were trained
# with - confirm before relying on it.
ENTITY_LABELS = [
    'O', 'B-Action', 'B-ApplicationSpecific', 'B-AuthenticationType',
    'B-DNSName', 'I-DNSName', 'B-DateTime', 'I-DateTime', 'B-Error', 'I-Error',
    'B-IPAddress', 'B-Object', 'B-Port', 'B-Process', 'B-Protocol',
    'B-Service', 'B-SessionID', 'B-Severity', 'B-Status', 'I-Status',
    'B-System', 'B-Username'
]

def get_model_pipeline(model_path):
    """
    Build a Hugging Face NER pipeline from a local checkpoint directory.

    The tokenizer and the token-classification model are both loaded from
    ``model_path`` and the pipeline uses aggregation_strategy='simple'.

    Args:
        model_path (str): Path to the checkpoint directory

    Returns:
        The 'ner' pipeline object

    Raises:
        FileNotFoundError: If ``model_path`` does not exist
    """
    checkpoint_missing = not os.path.exists(model_path)
    if checkpoint_missing:
        raise FileNotFoundError(f"Model not found: {model_path}")

    print(f"Loading model from: {model_path}")
    # Tokenizer is loaded before the model, as keyword arguments are
    # evaluated left to right.
    ner_pipeline = pipeline(
        'ner',
        tokenizer=AutoTokenizer.from_pretrained(model_path),
        model=AutoModelForTokenClassification.from_pretrained(model_path),
        aggregation_strategy="simple",
    )
    print("Model loaded")
    return ner_pipeline


def process_text(text, nlp_pipeline):
    """
    Physical Token Quantization for entity extraction.

    1. Identify physical tokens: maximal runs of characters that are neither
       whitespace nor '[' / ']'.
    2. If the model reports ANY entity span overlapping a physical token, the
       whole token becomes one entity.
    3. Label choice per token: 'IPAddress' wins when both 'IPAddress' and
       'DNSName' overlap it; otherwise the label of the earliest-starting
       overlapping span is used.
    4. Neighbouring tokens with the same label that are separated only by
       whitespace (no brackets) are merged into one entity.

    A token's confidence is the mean score of the spans overlapping it. A
    merged entity's confidence is the plain mean of its tokens' confidences,
    so every merged token carries equal weight regardless of merge order.
    Confidences are rounded to 4 decimals.

    Args:
        text (str): Raw log line
        nlp_pipeline (callable): Called as ``nlp_pipeline(text)``; must
            return an iterable-more-than-once collection of dicts with
            'entity_group', 'score', 'start' and 'end'

    Returns:
        list: Dicts with 'label', 'text', 'start', 'end', 'confidence', in
            order of appearance; empty if no token overlaps a model span
    """
    raw_results = nlp_pipeline(text)

    # Step 1-3: one candidate entity per physical token with model support.
    atomic_entities = []
    for token in re.finditer(r'[^\[\]\s]+', text):
        t_start, t_end = token.span()
        matches = sorted(
            (r for r in raw_results if r['start'] < t_end and r['end'] > t_start),
            key=lambda r: r['start'],
        )
        if not matches:
            continue

        labels = {m['entity_group'] for m in matches}
        if 'IPAddress' in labels and 'DNSName' in labels:
            chosen_label = 'IPAddress'
        else:
            chosen_label = matches[0]['entity_group']

        atomic_entities.append({
            "label": chosen_label,
            "text": text[t_start:t_end],
            "start": t_start,
            "end": t_end,
            "confidence": sum(float(m['score']) for m in matches) / len(matches)
        })

    if not atomic_entities:
        return []

    # Step 4: merge adjacent same-label entities.
    final_entities = []
    merged_scores = []  # parallel to final_entities: per-token confidences

    for curr in atomic_entities:
        if final_entities:
            prev = final_entities[-1]
            # The gap between two physical tokens holds only whitespace
            # and/or brackets, so "blank after strip" means no bracket.
            text_between = text[prev['end']:curr['start']]
            if text_between.strip() == '' and prev['label'] == curr['label']:
                prev['end'] = curr['end']
                prev['text'] = text[prev['start']:prev['end']]
                merged_scores[-1].append(curr['confidence'])
                continue

        final_entities.append(curr)
        merged_scores.append([curr['confidence']])

    for ent, scores in zip(final_entities, merged_scores):
        ent['confidence'] = round(sum(scores) / len(scores), 4)

    return final_entities


def extract_entities_for_line(line_number, log_text, nlp_pipeline):
    """
    Extract entities from one log line and group them by label.

    Args:
        line_number (int): Line number carried through to the result
        log_text (str): Raw log line passed to process_text()
        nlp_pipeline: NER pipeline passed to process_text()

    Returns:
        dict: 'line_number', 'log_text', 'entities' (list from
            process_text()) and 'entity_types'
            ({label: [{'value': str, 'confidence': float}, ...]})
    """
    found = process_text(log_text, nlp_pipeline)

    # Group by label, preserving the order in which entities were found.
    grouped = {}
    for item in found:
        bucket = grouped.setdefault(item['label'], [])
        bucket.append({'value': item['text'], 'confidence': item['confidence']})

    return {
        'line_number': line_number,
        'log_text': log_text,
        'entities': found,
        'entity_types': grouped
    }


# Load model: builds the NER pipeline for the checkpoint chosen by
# SELECTED_MODEL. Raises FileNotFoundError if that checkpoint path is missing.
nlp = get_model_pipeline(MODELS[SELECTED_MODEL])

## Cell 7: Basic Hypothesis (Using First Label)

In [None]:
# =============================================================================
# BASIC HYPOTHESIS - USING FIRST DISCOVERED LABEL
# =============================================================================

# Create hypothesis based on first discovered label
# Single hypothesis used throughout the basic run. Name, description and
# target label are all derived from FIRST_LABEL (set in Cell 3).
BASIC_HYPOTHESIS = {
    'name': f'H1_{FIRST_LABEL}',
    'description': f'Hypothesis for detecting {FIRST_LABEL} attacks',
    'target_label': FIRST_LABEL,  # Ground truth label this maps to
    'weights': {
        # Default weights - adjust based on what entities are relevant.
        # Keys are entity labels as produced by the extraction step; the
        # values below sum to 1.00, so 1.00 is the maximum P_Score.
        'Process': 0.25,
        'Username': 0.20,
        'Action': 0.20,
        'IPAddress': 0.15,
        'DateTime': 0.10,
        'Status': 0.10
    },
    'critical_entity': 'Process',  # Most important entity
    'penalty_factor': 0.20  # Score is scaled by (1 - 0.20) if critical entity missing
}

# Thresholds for confidence levels: minimum P_Score for each level.
# A score below LOW is reported as INSUFFICIENT; LOW is also the cut-off
# calculate_pscore() uses for its is_malicious flag.
THRESHOLDS = {
    'HIGH': 0.65,
    'MEDIUM': 0.50,
    'LOW': 0.35
}

# Triage priority mapping: confidence level -> analyst-facing action text.
TRIAGE_PRIORITIES = {
    'HIGH': 'Priority 1: Investigate immediately',
    'MEDIUM': 'Priority 2: Queue for investigation',
    'LOW': 'Priority 3: Investigate later',
    'INSUFFICIENT': 'Priority 4: Archive for future relevance'
}

# Echo the configuration so the notebook output records what was used.
print(f"Basic Hypothesis Created:")
print(f"  Name: {BASIC_HYPOTHESIS['name']}")
print(f"  Target Label: {BASIC_HYPOTHESIS['target_label']}")
print(f"  Critical Entity: {BASIC_HYPOTHESIS['critical_entity']}")
print(f"  Weights: {BASIC_HYPOTHESIS['weights']}")

## Cell 8: P_Score Calculation (Option A - Binary)

In [None]:
# =============================================================================
# P_SCORE CALCULATION - OPTION A (BINARY PRESENCE)
# =============================================================================

def calculate_pscore(entity_types, hypothesis, thresholds=None, triage_priorities=None):
    """
    Calculate P_Score using BINARY entity presence.

    Formula: P_Score = Sum(W_i * E_i)              if critical entity present
             P_Score = Sum(W_i * E_i) * (1 - P_F)  if critical entity missing

    Where:
        W_i = weight for entity type i
        E_i = 1 if entity present, 0 if absent
        P_F = penalty factor (if critical entity missing)

    The score is rounded to 4 decimals BEFORE it is compared with the
    thresholds. Summing float weights can land a hair below a threshold
    (e.g. 0.1 + 0.7 == 0.7999999999999999); comparing the rounded value keeps
    the confidence level and ``is_malicious`` consistent with the reported
    ``p_score``.

    Args:
        entity_types (dict): {entity_type: [{'value': str, 'confidence': float}]}
        hypothesis (dict): Needs 'weights', 'critical_entity', 'penalty_factor'
        thresholds (dict, optional): {'HIGH', 'MEDIUM', 'LOW'} -> minimum
            score. Defaults to the module-level THRESHOLDS.
        triage_priorities (dict, optional): Confidence level -> priority text
            (must include 'INSUFFICIENT'). Defaults to TRIAGE_PRIORITIES.

    Returns:
        dict: 'p_score', 'confidence_level', 'triage_priority',
            'is_malicious' (score >= LOW threshold), 'critical_present' and
            'entity_breakdown' ({entity_type: weight/present/contribution})
    """
    if thresholds is None:
        thresholds = THRESHOLDS
    if triage_priorities is None:
        triage_priorities = TRIAGE_PRIORITIES

    weights = hypothesis['weights']
    critical_entity = hypothesis['critical_entity']
    penalty_factor = hypothesis['penalty_factor']

    def has_entity(entity_type):
        # Present means: key exists and holds at least one extracted value.
        return entity_type in entity_types and len(entity_types[entity_type]) > 0

    # Calculate weighted sum
    weighted_sum = 0.0
    entity_breakdown = {}

    for entity_type, weight in weights.items():
        is_present = has_entity(entity_type)
        contribution = weight * (1 if is_present else 0)
        weighted_sum += contribution

        entity_breakdown[entity_type] = {
            'weight': weight,
            'present': is_present,
            'contribution': contribution
        }

    # Check critical entity and apply penalty when it is missing
    critical_present = has_entity(critical_entity)
    raw_score = weighted_sum if critical_present else weighted_sum * (1 - penalty_factor)

    # Round first so the classification below matches the reported score.
    p_score = round(raw_score, 4)

    # Determine confidence level
    if p_score >= thresholds['HIGH']:
        confidence_level = 'HIGH'
    elif p_score >= thresholds['MEDIUM']:
        confidence_level = 'MEDIUM'
    elif p_score >= thresholds['LOW']:
        confidence_level = 'LOW'
    else:
        confidence_level = 'INSUFFICIENT'

    return {
        'p_score': p_score,
        'confidence_level': confidence_level,
        'triage_priority': triage_priorities[confidence_level],
        'is_malicious': p_score >= thresholds['LOW'],
        'critical_present': critical_present,
        'entity_breakdown': entity_breakdown
    }


def process_log_line(line_number, log_text, nlp_pipeline, hypothesis):
    """
    Run the full per-line flow: entity extraction, then P_Score scoring.

    Args:
        line_number (int): Line number carried through to the result
        log_text (str): Raw log line
        nlp_pipeline: NER pipeline used for extraction
        hypothesis (dict): Hypothesis configuration used for scoring

    Returns:
        dict: 'line_number', 'log_text', 'entities', 'entity_types',
            'p_score', 'confidence_level', 'triage_priority',
            'is_malicious' and 'critical_present'
    """
    extraction = extract_entities_for_line(line_number, log_text, nlp_pipeline)
    scored = calculate_pscore(extraction['entity_types'], hypothesis)

    record = {
        'line_number': line_number,
        'log_text': log_text,
        'entities': extraction['entities'],
        'entity_types': extraction['entity_types'],
    }
    # Copy over only the scoring fields exposed per line (the per-entity
    # breakdown is intentionally not included).
    for field in ('p_score', 'confidence_level', 'triage_priority',
                  'is_malicious', 'critical_present'):
        record[field] = scored[field]

    return record


# Notebook progress marker: printed once the definitions in this cell have run.
print("P_Score calculation functions defined")

## Cell 9: Binary Validation (Approach 1)

In [None]:
# =============================================================================
# BINARY VALIDATION - TP/FP/TN/FN
# =============================================================================

def validate_binary(predictions, ground_truth, target_label=None):
    """
    Score predictions against ground truth as a binary classification.

    A line is "actually malicious" when it has a ground-truth entry; if
    ``target_label`` is given, the entry must also contain that label.
    Per-line details are collected for TP, FP and FN (not for TN).

    Args:
        predictions (list): Dicts with 'line_number', 'is_malicious' and
            'p_score'
        ground_truth (dict): Ground truth {line_number: {"labels": [...]}}
        target_label (str): Optional - only count as positive if this label
            is present

    Returns:
        dict: 'confusion_matrix' (TP/FP/TN/FN), 'metrics' (precision,
            recall, f1_score, accuracy - rounded to 4 decimals, 0.0 when
            undefined), 'totals' and 'details'
    """
    counts = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}
    details = {
        'true_positives': [],
        'false_positives': [],
        'true_negatives': [],
        'false_negatives': []
    }

    for pred in predictions:
        line_num = pred['line_number']
        flagged = pred['is_malicious']

        truth = get_label_for_line(line_num, ground_truth)
        positive = truth['is_malicious']
        # Narrow "malicious" to the hypothesis' own label when one is given.
        if target_label and positive:
            positive = target_label in truth['labels']

        if flagged:
            if positive:
                counts['TP'] += 1
                details['true_positives'].append({
                    'line': line_num,
                    'score': pred['p_score'],
                    'labels': truth['labels']
                })
            else:
                counts['FP'] += 1
                details['false_positives'].append({
                    'line': line_num,
                    'score': pred['p_score']
                })
        elif positive:
            counts['FN'] += 1
            details['false_negatives'].append({
                'line': line_num,
                'labels': truth['labels']
            })
        else:
            counts['TN'] += 1

    tp, fp, tn, fn = counts['TP'], counts['FP'], counts['TN'], counts['FN']

    # Calculate metrics; each falls back to 0.0 when its denominator is zero.
    total = tp + fp + tn + fn
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    accuracy = (tp + tn) / total if total > 0 else 0.0

    return {
        'confusion_matrix': counts,
        'metrics': {
            'precision': round(precision, 4),
            'recall': round(recall, 4),
            'f1_score': round(f1, 4),
            'accuracy': round(accuracy, 4)
        },
        'totals': {
            'total': total,
            'actual_malicious': tp + fn,
            'actual_benign': tn + fp,
            'predicted_malicious': tp + fp,
            'predicted_benign': tn + fn
        },
        'details': details
    }


# Notebook progress marker: printed once the definitions in this cell have run.
print("Binary validation function defined")

## Cell 10: Run Basic Validation

In [None]:
# =============================================================================
# RUN BASIC FTE-HARM VALIDATION
# =============================================================================

def run_basic_validation(dataset_info, nlp_pipeline, hypothesis):
    """
    Run basic FTE-HARM validation on a single dataset.

    Loads the dataset, scores every log line with the given hypothesis,
    validates the predictions against the hypothesis' target label and
    prints the confusion matrix and metrics.

    Args:
        dataset_info (dict): Dataset info from scan_all_datasets()
        nlp_pipeline: NER pipeline used for entity extraction
        hypothesis (dict): Hypothesis configuration (uses 'name' and
            'target_label' here, the rest during scoring)

    Returns:
        dict: 'dataset', 'hypothesis', 'stats', 'predictions', 'validation'
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print("BASIC FTE-HARM VALIDATION")
    print(f"Dataset: {dataset_info['name']}")
    print(f"Hypothesis: {hypothesis['name']}")
    print(rule)

    # Load dataset
    logs, ground_truth, stats = load_dataset(dataset_info)
    log_count = len(logs)

    # Process all logs, reporting progress every 500 lines
    print(f"\nProcessing {log_count} log lines...")
    predictions = []
    for done, (line_num, log_text) in enumerate(logs, start=1):
        predictions.append(
            process_log_line(line_num, log_text, nlp_pipeline, hypothesis)
        )
        if done % 500 == 0:
            print(f"  Processed: {done}/{log_count}")

    print(f"Processed {len(predictions)} logs")

    # Validate
    print("\nValidating against ground truth...")
    validation = validate_binary(predictions, ground_truth, hypothesis['target_label'])

    # Print results
    thin_rule = '-' * 40
    print(f"\n{thin_rule}")
    print("RESULTS")
    print(thin_rule)

    matrix = validation['confusion_matrix']
    print("\nConfusion Matrix:")
    for prefix, cell in (
        ("  True Positives:  ", 'TP'),
        ("  False Positives: ", 'FP'),
        ("  True Negatives:  ", 'TN'),
        ("  False Negatives: ", 'FN'),
    ):
        print(f"{prefix}{matrix[cell]}")

    scores = validation['metrics']
    print("\nMetrics:")
    for prefix, metric in (
        ("  Precision: ", 'precision'),
        ("  Recall:    ", 'recall'),
        ("  F1-Score:  ", 'f1_score'),
        ("  Accuracy:  ", 'accuracy'),
    ):
        print(f"{prefix}{scores[metric]:.4f}")

    return {
        'dataset': dataset_info['name'],
        'hypothesis': hypothesis['name'],
        'stats': stats,
        'predictions': predictions,
        'validation': validation
    }


# Select first dataset for testing. Nothing runs (and `results` stays
# undefined) when the scanner found no datasets.
if all_datasets:
    selected_dataset = all_datasets[0]
    print(f"\nSelected dataset: {selected_dataset['name']}")
    
    # Run validation with the loaded model and the single basic hypothesis
    results = run_basic_validation(selected_dataset, nlp, BASIC_HYPOTHESIS)

## Cell 11: Save Results

In [None]:
# =============================================================================
# SAVE RESULTS TO GOOGLE DRIVE
# =============================================================================

def save_basic_results(results, output_path=OUTPUT_PATH):
    """
    Save basic validation results as a plain-text report.

    The file is named ``basic_validation_<dataset>_<YYYYmmdd_HHMMSS>.txt``,
    with path separators in the dataset name replaced by underscores. The
    report is written as UTF-8 and ``output_path`` is created if needed. The
    same timestamp is used for the file name and the report body.

    Args:
        results (dict): Result of run_basic_validation() (uses 'dataset',
            'hypothesis', 'stats' and 'validation')
        output_path (str): Destination folder; defaults to OUTPUT_PATH

    Returns:
        str: Path of the file that was written
    """
    now = datetime.now()
    timestamp = now.strftime('%Y%m%d_%H%M%S')

    # Dataset names are relative paths; flatten both separator styles so the
    # name is always a single file-name component.
    safe_name = results['dataset'].replace('/', '_').replace('\\', '_')
    filename = f"basic_validation_{safe_name}_{timestamp}.txt"
    filepath = os.path.join(output_path, filename)

    # A caller-supplied folder may not exist yet.
    os.makedirs(output_path, exist_ok=True)

    s = results['stats']
    cm = results['validation']['confusion_matrix']
    m = results['validation']['metrics']

    report_lines = [
        "=" * 80,
        "FTE-HARM BASIC VALIDATION RESULTS",
        "=" * 80,
        "",
        f"Timestamp: {now.isoformat()}",
        f"Dataset: {results['dataset']}",
        f"Hypothesis: {results['hypothesis']}",
        f"Model: {SELECTED_MODEL}",
        "",
        "DATASET STATISTICS:",
        f"  Total Lines: {s['total_lines']}",
        f"  Malicious: {s['malicious_lines']} ({s['malicious_pct']:.2f}%)",
        f"  Benign: {s['benign_lines']}",
        "",
        "CONFUSION MATRIX:",
        f"  True Positives:  {cm['TP']}",
        f"  False Positives: {cm['FP']}",
        f"  True Negatives:  {cm['TN']}",
        f"  False Negatives: {cm['FN']}",
        "",
        "METRICS:",
        f"  Precision: {m['precision']:.4f}",
        f"  Recall:    {m['recall']:.4f}",
        f"  F1-Score:  {m['f1_score']:.4f}",
        f"  Accuracy:  {m['accuracy']:.4f}",
    ]

    # Explicit encoding: dataset names may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write("\n".join(report_lines) + "\n")

    print(f"\nResults saved to: {filepath}")
    return filepath


# Save results. `results` is only bound when Cell 10 found a dataset to
# validate, so check for the name before using it.
if 'results' in dir():
    save_basic_results(results)

---

## Checklist

**Before running validation (Cell 10), confirm that Cells 1–7 produced:**
- [ ] Google Drive mounted
- [ ] Datasets found by scanner
- [ ] Labels discovered (first label identified)
- [ ] Model loaded successfully
- [ ] Hypothesis created with target label

**After running:**
- [ ] Check confusion matrix (TP/FP/TN/FN)
- [ ] Review F1-Score (target: > 0.45)
- [ ] Check false negatives (missed attacks)
- [ ] Review results file in Google Drive