In [None]:
# Unpack models.zip into the current working directory.
# NOTE(review): presumably this archive holds the pre-trained checkpoints; the experiment
# cells below load them from content/models/<encoder>_model.pt — confirm the archive layout.
!unzip models.zip

In [None]:
# ============================================================================
# COMPREHENSIVE UNICODE ATTACK EVALUATION NOTEBOOK
# Multi-Encoder | Multi-Attack | Multi-Rate Analysis
# ============================================================================

# ============================================================================
# SECTION 1: Installs and Imports
# ============================================================================
!pip install --quiet ogb torch torchvision torchaudio torch-geometric sentence-transformers tqdm scikit-learn
!pip install torch-geometric ogb -q

In [None]:
# Download dataset
# Fetch the raw title/abstract TSV into the OGB raw-data folder; the dataset-preparation
# cell reads it back from this exact path (TSV_PATH). `-O` overwrites any existing copy.
!mkdir -p data/ogbn-arxiv/raw
!wget -O data/ogbn-arxiv/raw/titleabs.tsv https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv

In [None]:
import os
import json
import math
import torch
import numpy as np
import random
import re
from tqdm.auto import tqdm
import torch.nn.functional as F
import pandas as pd
from datetime import datetime
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# PyG / OGB
from ogb.nodeproppred import PygNodePropPredDataset
from torch_geometric.data.data import DataEdgeAttr, DataTensorAttr
from torch_geometric.data.storage import GlobalStorage
from torch_geometric.nn.models import GraphSAGE
from sentence_transformers import SentenceTransformer

print("‚úì All imports successful")

In [None]:
# ============================================================================
# SECTION 2: Dataset Preparation
# ============================================================================
print("\n" + "="*80)
print("SECTION 2: Dataset Preparation")
print("="*80)

# torch.serialization.safe_globals is only available in newer PyTorch releases, so probe for
# it explicitly. A bare `except:` fallback would also swallow KeyboardInterrupt and hide
# genuine dataset-loading errors behind a silent retry.
if hasattr(torch.serialization, "safe_globals"):
    # Allow-list the PyG classes for torch's restricted unpickler while the dataset loads.
    with torch.serialization.safe_globals([DataEdgeAttr, DataTensorAttr, GlobalStorage]):
        dataset = PygNodePropPredDataset(name="ogbn-arxiv", root="data/ogbn-arxiv")
else:
    dataset = PygNodePropPredDataset(name="ogbn-arxiv", root="data/ogbn-arxiv")
num_nodes = dataset[0].num_nodes

TSV_PATH = 'data/ogbn-arxiv/raw/titleabs.tsv'
TEXTS_OUTPUT_PATH = 'data/arxiv_texts.txt'

# Load OGB ID -> MAG ID mapping
ogb_id_to_mag_id_file = 'data/ogbn-arxiv/ogbn_arxiv/mapping/nodeidx2paperid.csv.gz'
assert os.path.exists(ogb_id_to_mag_id_file), "Missing OGB mapping file"

ogb_id_map = pd.read_csv(ogb_id_to_mag_id_file)
ogb_id_map.columns = ['ogb_id', 'mag_id']

# Load raw text data
print("Loading raw text TSV...")
raw_texts_df = pd.read_csv(TSV_PATH, sep='\t', header=None,
                           names=['mag_id', 'title', 'abstract'],
                           on_bad_lines='skip')

# Merge and align
print("Aligning OGB Node IDs with MAG Texts...")
merged_df = pd.merge(ogb_id_map, raw_texts_df, on='mag_id', how='left')
merged_df = merged_df.sort_values(by='ogb_id')

# A left merge emits an extra row for every duplicated mag_id on the right-hand side. Any
# extra row would shift all later texts off their node index, so fail loudly rather than
# mis-align texts and graph nodes silently.
assert len(merged_df) == len(ogb_id_map), (
    f"Text/node misalignment: {len(merged_df)} merged rows for {len(ogb_id_map)} nodes "
    "(duplicate mag_id entries in titleabs.tsv?)"
)

# Nodes without a matching text row end up with a single-space string (stripped on write).
merged_df['full_text'] = merged_df['title'].fillna('') + ' ' + merged_df['abstract'].fillna('')

texts_list = merged_df['full_text'].tolist()

# The output format is "line i == text of node i". Collapsing all whitespace runs to single
# spaces guarantees that a newline / carriage return embedded in a title or abstract cannot
# split one node's text across several lines.
with open(TEXTS_OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for text in texts_list:
        f.write(' '.join(text.split()) + '\n')

print(f"‚úì Created text file: {TEXTS_OUTPUT_PATH}")
print(f"‚úì Total texts: {len(texts_list)}")

In [None]:
# ============================================================================
# SECTION 3: Universal Encoder Wrapper
# ============================================================================
print("\n" + "="*80)
print("SECTION 3: Universal Encoder Wrapper")
print("="*80)

class UniversalEncoder:
    """
    Unified interface for multiple sentence encoders.
    Supports: SBERT, USE, SimCSE, E5, and MiniLM variants.

    Each entry of ENCODER_CONFIGS maps a short encoder name to the model identifier
    handed to SentenceTransformer, the embedding width reported by `embedding_dim`,
    and a `type` tag that controls input preprocessing in `encode`.
    """

    ENCODER_CONFIGS = {
        'minilm': {
            'model_name': 'sentence-transformers/all-MiniLM-L6-v2',
            'dim': 384,
            'type': 'sbert'
        },
        'mpnet': {
            'model_name': 'sentence-transformers/all-mpnet-base-v2',
            'dim': 768,
            'type': 'sbert'
        },
        'e5-base': {
            'model_name': 'intfloat/e5-base-v2',
            'dim': 768,
            'type': 'e5'
        },
        'e5-large-multilingual': {
            'model_name': 'intfloat/multilingual-e5-large',
            'dim': 1024,
            'type': 'e5'
        },
        'paraphrase-multilingual': {
            'model_name': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
            'dim': 768,
            'type': 'sbert'
        }
    }

    def __init__(self, encoder_name='minilm', device='cuda'):
        """Load the SentenceTransformer registered under `encoder_name` onto `device`.

        Raises:
            ValueError: if `encoder_name` is not a key of ENCODER_CONFIGS.
        """
        if encoder_name not in self.ENCODER_CONFIGS:
            raise ValueError(f"Unknown encoder: {encoder_name}. Available: {list(self.ENCODER_CONFIGS.keys())}")

        self.encoder_name = encoder_name
        self.config = self.ENCODER_CONFIGS[encoder_name]
        self.device = device

        print(f"Loading {encoder_name} ({self.config['model_name']})...")
        self.model = SentenceTransformer(self.config['model_name'], device=device)
        self.model.eval()

        print(f"‚úì Loaded {encoder_name} (dim={self.config['dim']})")

    def encode(self, texts, batch_size=64, show_progress=False):
        """Encode texts to embeddings.

        Args:
            texts: a list of strings, or a single string.
            batch_size: forwarded to the underlying model's `encode`.
            show_progress: forwarded as `show_progress_bar`.

        Returns:
            Whatever the underlying model's `encode` returns with
            `convert_to_tensor=True`.
        """
        if self.config['type'] == 'e5':
            # E5 models require "query: " prefix for queries
            if isinstance(texts, str):
                # A bare string must be prefixed as a whole; iterating over it would
                # yield one "query: <char>" entry per character.
                texts = f"query: {texts}"
            else:
                texts = [f"query: {text}" for text in texts]

        with torch.no_grad():
            embeddings = self.model.encode(
                texts,
                batch_size=batch_size,
                show_progress_bar=show_progress,
                convert_to_tensor=True,
                device=self.device
            )

        return embeddings

    @property
    def embedding_dim(self):
        """Embedding width declared for this encoder in ENCODER_CONFIGS."""
        return self.config['dim']

print("‚úì Universal Encoder Wrapper defined")

In [None]:
# ============================================================================
# SECTION 4: Load Graph Structure and Model
# ============================================================================
print("\n" + "="*80)
print("SECTION 4: Load Graph Structure")
print("="*80)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Load dataset
# Probe for the safe_globals API explicitly: a non-empty list is always truthy, so it cannot
# act as a feature check, and PyTorch builds without safe_globals would raise AttributeError.
if hasattr(torch.serialization, "safe_globals"):
    with torch.serialization.safe_globals([DataEdgeAttr, DataTensorAttr, GlobalStorage]):
        dataset = PygNodePropPredDataset(name="ogbn-arxiv", root="data/ogbn-arxiv")
        data = dataset[0]
else:
    dataset = PygNodePropPredDataset(name="ogbn-arxiv", root="data/ogbn-arxiv")
    data = dataset[0]

num_nodes, num_classes = data.num_nodes, int(dataset.num_classes)

# Official split indices of the dataset, keyed by "train" / "valid" / "test".
split_idx = dataset.get_idx_split()
train_idx, val_idx, test_idx = (split_idx[part] for part in ("train", "valid", "test"))

# Create masks
# One boolean mask per split: True exactly at the node ids listed in that split, then moved
# to the compute device.
split_masks = {}
for part_name, part_idx in (("train", train_idx), ("valid", val_idx), ("test", test_idx)):
    part_mask = torch.zeros(num_nodes, dtype=torch.bool)
    part_mask[part_idx] = True
    split_masks[part_name] = part_mask.to(device)
train_mask, val_mask, test_mask = (
    split_masks["train"], split_masks["valid"], split_masks["test"]
)

# Graph connectivity and labels live on the same device as the masks; squeeze() removes
# size-1 dimensions from the label tensor.
data.edge_index = data.edge_index.to(device)
data.y = data.y.squeeze().to(device)

print(f"‚úì Nodes: {num_nodes}, Classes: {num_classes}")
print(f"‚úì Train: {train_idx.shape[0]}, Val: {val_idx.shape[0]}, Test: {test_idx.shape[0]}")

# Load texts
# The file stores one text per line, where line i belongs to node i. newline="\n" restricts
# line splitting to "\n" so a stray carriage return inside a text cannot create extra lines;
# strip() still removes a trailing "\r" left over from "\r\n" endings.
with open(TEXTS_OUTPUT_PATH, "r", encoding="utf8", newline="\n") as f:
    texts = [line.strip() for line in f]

# Every later step indexes `texts` by node id, so a length mismatch means texts and graph
# nodes are mis-aligned and all downstream metrics would be meaningless.
if len(texts) != num_nodes:
    raise ValueError(
        f"{TEXTS_OUTPUT_PATH} has {len(texts)} lines but the graph has {num_nodes} nodes; "
        "regenerate the text file so that each node maps to exactly one line."
    )

print(f"‚úì Loaded {len(texts)} texts")


# Verify directory structure
print("\nüîç Verifying file structure...")
required_dirs = ['embeddings', 'models', 'results']
for dir_name in required_dirs:
    if not os.path.exists(dir_name):
        print(f"  Creating {dir_name}/")
        os.makedirs(dir_name, exist_ok=True)
    else:
        print(f"  ‚úì {dir_name}/ exists")

# Check for trained models
# The experiment pipeline loads checkpoints from content/models/<encoder>_model.pt, so that
# folder is scanned in addition to models/; otherwise this listing could claim that no
# models exist while the pipeline would find them.
print("\n Available trained models:")
model_files = []
for model_dir in ('models', 'content/models'):
    if os.path.isdir(model_dir):
        model_files.extend(
            os.path.join(model_dir, file_name)
            for file_name in sorted(os.listdir(model_dir))
            if file_name.endswith('_model.pt')
        )

if model_files:
    for model_file in model_files:
        print(f"  ‚úì {model_file}")
else:
    print("    No model files found. Run training notebook first.")

In [None]:
# ============================================================================
# SECTION 5: Enhanced Attack Strategies
# ============================================================================
print("\n" + "="*80)
print("SECTION 5: Attack Strategies")
print("="*80)

# Topic-relevant emojis
TOPIC_EMOJIS = {
    'neural': ['üß†', 'ü§ñ', 'üí°'], 'network': ['üï∏Ô∏è', 'üåê', 'üîó'],
    'learning': ['üìö', 'üéì', 'üìñ'], 'deep': ['üèä', '‚¨áÔ∏è'],
    'machine': ['ü§ñ', '‚öôÔ∏è', 'üîß'], 'model': ['üìä', 'üéØ', 'üìà'],
    'data': ['üíæ', 'üìä', 'üóÑÔ∏è'], 'algorithm': ['üî¢', '‚ö°', 'üé≤'],
    'computer': ['üíª', 'üñ•Ô∏è'], 'graph': ['üìà', 'üìä', 'üï∏Ô∏è'],
    'optimization': ['‚ö°', 'üéØ', 'üìà'], 'training': ['üèãÔ∏è', 'üí™', 'üéì'],
}

# Homoglyphs (visually similar characters)
HOMOGLYPHS = {
    'a': ['–∞', '·∫°'], 'e': ['–µ', 'ƒì'], 'o': ['–æ', '≈ç'],
    'p': ['—Ä'], 'c': ['—Å'], 'x': ['—Ö'], 'y': ['—É'],
    'i': ['—ñ', 'ƒ´'], 's': ['—ï'], 'n': ['’∏']
}

# Currency symbols
CURRENCY_SYMBOLS = ['$', '‚Ç¨', '¬£', '¬•', '‚Çπ', '‚ÇΩ']

# Mixed script punctuation
MIXED_PUNCT = {
    '.': ['„ÄÇ', 'Ôºé'], ',': ['Ôºå', '„ÄÅ'], ';': ['Ôºõ'],
    ':': ['Ôºö'], '!': ['ÔºÅ'], '?': ['Ôºü']
}

def emoji_injection_attack(text, rate=0.15, seed=None):
    """Inject topic-relevant emojis.

    The text is split on whitespace. Each word whose lower-cased form (with surrounding
    '.,;:!?()' removed) is a key of TOPIC_EMOJIS gets, with probability `rate`, one emoji
    from that key's list appended directly after it.

    Args:
        text: input string.
        rate: per-matching-word probability of appending an emoji.
        seed: if given, draws come from a private generator seeded with this value, so the
            result is reproducible and the global `random` state is left untouched.
            If None, the global `random` module is used.

    Returns:
        The words re-joined with single spaces (whitespace runs are therefore collapsed).
    """
    # A private Random(seed) yields the same draws as seeding the global generator, without
    # resetting the global state for unrelated code.
    rng = random if seed is None else random.Random(seed)

    words = text.split()
    for i, word in enumerate(words):
        word_lower = word.lower().strip('.,;:!?()')
        # The membership test comes first so a random number is only drawn for matching words.
        if word_lower in TOPIC_EMOJIS and rng.random() < rate:
            emoji = rng.choice(TOPIC_EMOJIS[word_lower])
            words[i] = f"{word}{emoji}"

    return ' '.join(words)

def homoglyph_attack(text, rate=0.15, seed=None):
    """Replace characters with homoglyphs.

    Candidate positions are all characters whose lower-cased form is a key of HOMOGLYPHS.
    A fraction `rate` of them (always at least one) is replaced by a randomly chosen
    look-alike from HOMOGLYPHS. All other characters, including their case, are returned
    unchanged, so the homoglyph substitutions are the only perturbation.

    Args:
        text: input string.
        rate: fraction of candidate positions to replace; max(1, int(n * rate)) positions
            are chosen, so at least one replacement happens whenever a candidate exists.
        seed: if given, draws come from a private generator seeded with this value and the
            global `random` state is left untouched. If None, the global module is used.

    Returns:
        The perturbed string, or `text` itself when it has no replaceable character.
    """
    rng = random if seed is None else random.Random(seed)

    # Keep the original characters; only the lookup is case-insensitive. Lower-casing the
    # whole text would change far more of the input than the attack is meant to.
    chars = list(text)
    replaceable = [i for i, c in enumerate(chars) if c.lower() in HOMOGLYPHS]

    if not replaceable:
        return text

    num_replace = max(1, int(len(replaceable) * rate))
    positions = rng.sample(replaceable, min(num_replace, len(replaceable)))

    for pos in positions:
        # HOMOGLYPHS is keyed by lower-case letters, so an upper-case candidate is replaced
        # by the look-alike of its lower-case form.
        chars[pos] = rng.choice(HOMOGLYPHS[chars[pos].lower()])

    return ''.join(chars)

def currency_attack(text, rate=0.08, seed=None):
    """Add currency symbols near numbers.

    Numbers are located with the pattern r'\\b\\d+\\.?\\d*\\b'. A fraction `rate` of them
    (always at least one) gets a random symbol from CURRENCY_SYMBOLS attached, with equal
    probability directly in front of or directly behind the number.

    Args:
        text: input string.
        rate: fraction of the numeric matches to decorate; max(1, int(n * rate)) are chosen.
        seed: if given, draws come from a private generator seeded with this value and the
            global `random` state is left untouched. If None, the global module is used.

    Returns:
        The perturbed string, or `text` itself when it contains no number.
    """
    rng = random if seed is None else random.Random(seed)

    matches = list(re.finditer(r'\b\d+\.?\d*\b', text))
    if not matches:
        return text

    num_inject = max(1, int(len(matches) * rate))
    chosen = rng.sample(matches, min(num_inject, len(matches)))

    result = text
    # Splice from right to left so the offsets of matches further left stay valid.
    for match in sorted(chosen, key=lambda m: m.start(), reverse=True):
        num = match.group()
        symbol = rng.choice(CURRENCY_SYMBOLS)
        replacement = f"{symbol}{num}" if rng.random() < 0.5 else f"{num}{symbol}"
        result = result[:match.start()] + replacement + result[match.end():]

    return result

def mixed_script_attack(text, rate=0.15, seed=None):
    """Replace punctuation with CJK variants.

    Candidate positions are all characters that are keys of MIXED_PUNCT. A fraction `rate`
    of them (always at least one) is replaced by a randomly chosen variant from MIXED_PUNCT.

    Args:
        text: input string.
        rate: fraction of candidate positions to replace; max(1, int(n * rate)) are chosen.
        seed: if given, draws come from a private generator seeded with this value and the
            global `random` state is left untouched. If None, the global module is used.

    Returns:
        The perturbed string, or `text` itself when it contains no candidate punctuation.
    """
    rng = random if seed is None else random.Random(seed)

    chars = list(text)
    punct_pos = [i for i, c in enumerate(chars) if c in MIXED_PUNCT]

    if not punct_pos:
        return text

    num_replace = max(1, int(len(punct_pos) * rate))
    positions = rng.sample(punct_pos, min(num_replace, len(punct_pos)))

    for pos in positions:
        chars[pos] = rng.choice(MIXED_PUNCT[chars[pos]])

    return ''.join(chars)

def apply_attack(text, attack_type, rate, seed=None):
    """Route `text` to the attack selected by `attack_type`.

    Recognised values are 'emoji', 'homoglyph', 'currency' and 'mixed_script'; `rate` and
    `seed` are forwarded as keyword arguments to the chosen attack.

    Raises:
        ValueError: if `attack_type` is none of the recognised values.
    """
    routes = (
        ('emoji', emoji_injection_attack),
        ('homoglyph', homoglyph_attack),
        ('currency', currency_attack),
        ('mixed_script', mixed_script_attack),
    )
    for label, attack_fn in routes:
        if attack_type == label:
            return attack_fn(text, rate=rate, seed=seed)

    raise ValueError(f"Unknown attack: {attack_type}")

print("‚úì Attack strategies defined")

In [None]:
# ============================================================================
# SECTION 6: Model Definition
# ============================================================================
print("\n" + "="*80)
print("SECTION 6: Model Definition")
print("="*80)

class SAGEModel(torch.nn.Module):
    """Node classifier that wraps a two-layer GraphSAGE network (ReLU, dropout 0.5)."""

    def __init__(self, in_channels, hidden_channels, num_classes):
        """Build the GraphSAGE stack mapping `in_channels` features to `num_classes` outputs."""
        super().__init__()
        sage_kwargs = dict(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            num_layers=2,
            out_channels=num_classes,
            dropout=0.5,
            act='relu',
        )
        # The attribute name is the prefix of every state_dict key, so it must stay
        # `sage_model` for saved checkpoints to load.
        self.sage_model = GraphSAGE(**sage_kwargs)

    def forward(self, x, edge_index):
        """Return the wrapped network's output for node features `x` on graph `edge_index`."""
        logits = self.sage_model(x, edge_index)
        return logits

print("‚úì Model class defined")

In [None]:
# ============================================================================
# SECTION 7: Comprehensive Evaluation Function
# ============================================================================
print("\n" + "="*80)
print("SECTION 7: Evaluation Metrics")
print("="*80)

@torch.no_grad()
def comprehensive_eval(model, clean_feats, attacked_feats, edge_index, y, test_mask, test_idx):
    """Compare model quality on clean versus attacked node features.

    The model is switched to eval mode and run once per feature matrix. All prediction
    metrics are restricted to the nodes selected by `test_mask`; the embedding-distance
    metrics use the rows selected by `test_idx`.

    Returns:
        dict with, for accuracy / macro precision / macro recall / macro F1, the clean
        value, the attacked value and their difference (clean - attacked), plus:
        'num_flipped' (test nodes correct on clean but wrong on attacked features),
        'flip_rate' (num_flipped divided by the number of test nodes),
        'mse', 'cosine_similarity' and 'l2_distance' between clean and attacked test rows.
    """
    model.eval()

    labels = y[test_mask]
    labels_np = labels.cpu().numpy()

    def _score(feats):
        # Predict every node, keep only the test nodes, then derive accuracy and macro P/R/F1.
        preds = model(feats, edge_index).argmax(dim=1)[test_mask]
        hits = preds == labels
        prec, rec, f1, _ = precision_recall_fscore_support(
            labels_np, preds.cpu().numpy(), average='macro', zero_division=0
        )
        return hits, hits.float().mean().item(), prec, rec, f1

    clean_hits, clean_acc, clean_prec, clean_rec, clean_f1 = _score(clean_feats)
    attacked_hits, attacked_acc, attacked_prec, attacked_rec, attacked_f1 = _score(attacked_feats)

    # A "flip" is a test node that was classified correctly before the attack and
    # incorrectly after it.
    num_flipped = (clean_hits & ~attacked_hits).sum().item()

    # How far the attack moved the test-node embeddings.
    emb_before = clean_feats[test_idx]
    emb_after = attacked_feats[test_idx]
    mse = F.mse_loss(emb_before, emb_after).item()
    cosine_sim = F.cosine_similarity(emb_before, emb_after).mean().item()
    l2_dist = torch.norm(emb_before - emb_after, dim=1).mean().item()

    return {
        'clean_accuracy': clean_acc,
        'attacked_accuracy': attacked_acc,
        'accuracy_drop': clean_acc - attacked_acc,
        'clean_precision': clean_prec,
        'attacked_precision': attacked_prec,
        'precision_drop': clean_prec - attacked_prec,
        'clean_recall': clean_rec,
        'attacked_recall': attacked_rec,
        'recall_drop': clean_rec - attacked_rec,
        'clean_f1': clean_f1,
        'attacked_f1': attacked_f1,
        'f1_drop': clean_f1 - attacked_f1,
        'num_flipped': num_flipped,
        'flip_rate': num_flipped / test_mask.sum().item(),
        'mse': mse,
        'cosine_similarity': cosine_sim,
        'l2_distance': l2_dist
    }

print("‚úì Evaluation function defined")

In [None]:
# ============================================================================
# SECTION 8: Main Experiment Pipeline
# ============================================================================
print("\n" + "="*80)
print("SECTION 8: Automated Experiment Pipeline")
print("="*80)

# Configuration
# The three lists below span the experiment grid: the run cell evaluates every
# (encoder, attack, rate) combination.
# ENCODERS entries must be keys of UniversalEncoder.ENCODER_CONFIGS, and each one needs a
# checkpoint at content/models/<encoder>_model.pt (checked before the experiments start).
ENCODERS = ['minilm'] # currently disabled: 'mpnet', 'e5-base', 'e5-large-multilingual', 'paraphrase-multilingual'
# ATTACKS entries must be attack types understood by apply_attack.
ATTACKS = ['emoji', 'homoglyph', 'currency', 'mixed_script']
# Each rate is forwarded as `rate` to the attack function. Its meaning depends on the attack:
# a per-matching-word probability for 'emoji', a fraction of candidate positions otherwise.
ATTACK_RATES = [0.15, 0.25, 0.35]

# Create directories
# exist_ok=True makes these calls safe to repeat when the cell is re-run.
os.makedirs('embeddings', exist_ok=True)
os.makedirs('models', exist_ok=True)
os.makedirs('results', exist_ok=True)

def _encode_in_batches(encoder_name, all_texts, desc, batch_size=64):
    """Load `encoder_name` and embed `all_texts` batch by batch; returns one stacked tensor."""
    encoder = UniversalEncoder(encoder_name, device=device)
    chunks = []
    for start in tqdm(range(0, num_nodes, batch_size), desc=desc):
        batch_texts = all_texts[start:start + batch_size]
        chunks.append(encoder.encode(batch_texts, batch_size=batch_size))
    return torch.cat(chunks, dim=0)


def run_experiment(encoder_name, attack_type, attack_rate):
    """Run single experiment configuration.

    Pipeline: load (or compute and cache) the clean embeddings, load the pre-trained
    SAGEModel checkpoint, load (or compute and cache) embeddings of the attacked texts,
    replace only the test-node rows of the clean features with their attacked versions,
    evaluate with comprehensive_eval and write the metrics to a JSON file.

    Args:
        encoder_name: key of UniversalEncoder.ENCODER_CONFIGS.
        attack_type: attack name understood by apply_attack.
        attack_rate: rate forwarded to the attack; also encoded (as a percentage) in the
            cache and result file names.

    Returns:
        The metrics dict from comprehensive_eval, extended with 'encoder', 'attack_type',
        'attack_rate' and 'timestamp'.

    Raises:
        FileNotFoundError: if the checkpoint content/models/<encoder_name>_model.pt is missing.
    """
    print(f"\n{'='*80}")
    print(f"EXPERIMENT: {encoder_name} | {attack_type} | rate={attack_rate}")
    print(f"{'='*80}")

    # File naming
    # round() instead of int(): int() truncates, and e.g. 0.29 * 100 evaluates to
    # 28.999999999999996, which would name the files "rate28" and collide with rate 0.28.
    rate_tag = round(attack_rate * 100)
    clean_emb_path = f"embeddings/{encoder_name}_clean.pt"
    attacked_emb_path = f"embeddings/{encoder_name}_{attack_type}_rate{rate_tag}.pt"
    model_path = f"content/models/{encoder_name}_model.pt"
    results_path = f"results/{encoder_name}_{attack_type}_rate{rate_tag}.json"

    # Fail fast: without a checkpoint nothing can be evaluated, so check for it before
    # spending time on encoding.
    if not os.path.exists(model_path):
        raise FileNotFoundError(
            f"Model not found: {model_path}\n"
            f"Please run the training notebook first to generate models."
        )

    # Step 1: Load or create clean embeddings
    if os.path.exists(clean_emb_path):
        print(f"‚úì Loading clean embeddings from {clean_emb_path}")
        clean_embeddings = torch.load(clean_emb_path, map_location=device)
    else:
        print(f"Creating clean embeddings with {encoder_name}...")
        clean_embeddings = _encode_in_batches(encoder_name, texts, desc="Encoding")
        torch.save(clean_embeddings.cpu(), clean_emb_path)
        print(f"‚úì Saved clean embeddings to {clean_emb_path}")

    clean_embeddings = clean_embeddings.to(device)

    # Step 2: Load pre-trained model
    print(f"‚úì Loading pre-trained model from {model_path}")
    model = SAGEModel(
        in_channels=clean_embeddings.size(1),
        hidden_channels=128,
        num_classes=num_classes
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Steps 3 + 4: Load cached attacked embeddings, or attack the test texts and encode them
    if os.path.exists(attacked_emb_path):
        print(f"‚úì Loading attacked embeddings from {attacked_emb_path}")
        attacked_embeddings = torch.load(attacked_emb_path, map_location=device)
    else:
        # The attacked texts are only needed for encoding, so they are built on a cache miss.
        print(f"Applying {attack_type} attack (rate={attack_rate})...")
        attacked_texts = texts.copy()
        test_indices = test_idx.cpu().tolist()

        for node_id in tqdm(test_indices, desc="Attacking"):
            # Seeding with the node id makes each node's perturbation reproducible.
            attacked_texts[node_id] = apply_attack(
                attacked_texts[node_id],
                attack_type=attack_type,
                rate=attack_rate,
                seed=node_id
            )

        print(f"Encoding attacked texts...")
        attacked_embeddings = _encode_in_batches(
            encoder_name, attacked_texts, desc="Encoding Attacked"
        )
        torch.save(attacked_embeddings.cpu(), attacked_emb_path)
        print(f"‚úì Saved attacked embeddings to {attacked_emb_path}")

    attacked_embeddings = attacked_embeddings.to(device)

    # Step 5: Create mixed feature set (clean train/val + attacked test)
    features_attacked = clean_embeddings.clone()
    features_attacked[test_idx] = attacked_embeddings[test_idx]

    # Step 6: Evaluate
    print("Evaluating...")
    results = comprehensive_eval(
        model, clean_embeddings, features_attacked,
        data.edge_index, data.y, test_mask, test_idx
    )

    # Add metadata
    results['encoder'] = encoder_name
    results['attack_type'] = attack_type
    results['attack_rate'] = attack_rate
    results['timestamp'] = datetime.now().isoformat()

    # Save results
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"‚úì Results saved to {results_path}")

    # Print summary
    print(f"\n RESULTS SUMMARY:")
    print(f"  Clean Accuracy:    {results['clean_accuracy']:.4f}")
    print(f"  Attacked Accuracy: {results['attacked_accuracy']:.4f}")
    print(f"  Accuracy Drop:     {results['accuracy_drop']:.4f} ({results['accuracy_drop']*100:.2f}%)")
    print(f"  Clean F1:          {results['clean_f1']:.4f}")
    print(f"  Attacked F1:       {results['attacked_f1']:.4f}")
    print(f"  F1 Drop:           {results['f1_drop']:.4f}")
    print(f"  Flipped Nodes:     {results['num_flipped']} ({results['flip_rate']*100:.2f}%)")
    print(f"  Cosine Similarity: {results['cosine_similarity']:.4f}")

    return results

In [None]:
# ============================================================================
# SECTION 9: Run All Experiments
# ============================================================================
print("\n" + "="*80)
print("SECTION 9: Running All Experiments")
print("="*80)

# Validate that all required models exist
print("\n Validating pre-trained models...")
missing_models = [
    name for name in ENCODERS
    if not os.path.exists(f"content/models/{name}_model.pt")
]

if missing_models:
    print(f"\n‚ùå ERROR: Missing pre-trained models for: {missing_models}")
    print("Please run the training notebook first to generate these models.")
    raise FileNotFoundError("Missing required model files")

print("‚úì All required models found")

# Full experiment grid in execution order: encoder-major, then attack type, then rate.
experiment_grid = [
    (enc, atk, rate)
    for enc in ENCODERS
    for atk in ATTACKS
    for rate in ATTACK_RATES
]

all_results = []
total_experiments = len(experiment_grid)
completed = 0

print(f"\n Starting {total_experiments} experiments...")
print(f"   Encoders: {len(ENCODERS)}")
print(f"   Attacks: {len(ATTACKS)}")
print(f"   Rates: {len(ATTACK_RATES)}")

for encoder_name, attack_type, attack_rate in experiment_grid:
    # Best effort: a failing configuration is reported and skipped so that the remaining
    # configurations still run.
    try:
        results = run_experiment(encoder_name, attack_type, attack_rate)
    except Exception as e:
        print(f"\n‚ùå Error in {encoder_name}/{attack_type}/{attack_rate}: {e!s}")
    else:
        all_results.append(results)
        completed += 1
        print(f"\n‚úì Completed {completed}/{total_experiments} experiments")

In [None]:
# ============================================================================
# SECTION 10: Summary Report
# ============================================================================
print("\n" + "="*80)
print("SECTION 10: Final Summary Report")
print("="*80)

# Save all results
summary_path = f"results/summary_all_experiments_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(summary_path, 'w') as f:
    json.dump(all_results, f, indent=2)

print(f"\n‚úì All results saved to {summary_path}")

# Create comparison table
print("\n" + "="*80)
print("ATTACK EFFECTIVENESS COMPARISON")
print("="*80)

df_results = pd.DataFrame(all_results)

if df_results.empty:
    # Every experiment failed (errors are caught in the run loop), so the frame has no
    # columns and groupby('encoder') would raise KeyError.
    print("\nNo experiment finished successfully; skipping the comparison tables.")
else:
    # Mean and standard deviation of each degradation metric, shared by all groupings.
    drop_stats = {
        'accuracy_drop': ['mean', 'std'],
        'f1_drop': ['mean', 'std'],
        'flip_rate': ['mean', 'std']
    }

    # Group by encoder
    print("\n BY ENCODER:")
    encoder_summary = df_results.groupby('encoder').agg(drop_stats).round(4)
    print(encoder_summary)

    # Group by attack
    print("\n BY ATTACK TYPE:")
    attack_summary = df_results.groupby('attack_type').agg(drop_stats).round(4)
    print(attack_summary)

    # Group by rate
    print("\n BY ATTACK RATE:")
    rate_summary = df_results.groupby('attack_rate').agg(drop_stats).round(4)
    print(rate_summary)

    # Find most effective combinations
    print("\n TOP 10 MOST EFFECTIVE ATTACKS (by accuracy drop):")
    top_attacks = df_results.nlargest(10, 'accuracy_drop')[
        ['encoder', 'attack_type', 'attack_rate', 'accuracy_drop', 'f1_drop', 'flip_rate']
    ]
    print(top_attacks.to_string(index=False))

print("\n" + "="*80)
print("‚úÖ ALL EXPERIMENTS COMPLETE!")
print("="*80)
print(f"\nGenerated files:")
# The embeddings and models counts are derived from the configuration (expected numbers),
# not from the files on disk; only the results count reflects experiments that completed.
print(f"   embeddings/  - {len(ENCODERS) * (1 + len(ATTACKS) * len(ATTACK_RATES))} embedding files")
print(f"   models/      - {len(ENCODERS)} model checkpoints")
print(f"   results/     - {len(all_results)} result JSON files + summary")

In [None]:
# Archive the cached embeddings and the result files for download.
# NOTE(review): these absolute /content paths only match the relative embeddings/ and
# results/ folders written above when the working directory is /content — confirm.
!zip -r /content/embeddings.zip /content/embeddings
!zip -r /content/results.zip /content/results