# FAIIA-IDS: Research Paper Deliverables Generation

This notebook generates the tables and figures required for the research paper. It assumes that the training scripts (`train_main.py`, `run_ablation.py`, `train_baselines.py`) have been executed and their artifacts saved.

In [None]:
# 1. Clone Repository
# TODO: Replace with your actual repository URL
GIT_REPO_URL = "https://github.com/Arif-Foysal/FAA-Net.git"
REPO_DIR = "FAA-Net" # This usually matches the name of the git repo

!git clone {GIT_REPO_URL}

import os
if os.path.exists(REPO_DIR):
    os.chdir(REPO_DIR)
    print(f"Changed directory to: {os.getcwd()}")
else:
    print(f"Warning: Could not find directory {REPO_DIR}. Check if git clone succeeded.")

# 2. Mount Google Drive (for saving models)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except ImportError:
    print("Not running in Google Colab, skipping Drive mount.")

# 3. Install Dependencies
!pip install -r requirements.txt

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_curve, roc_curve, auc
import joblib

# Add project root to path
sys.path.append(os.path.abspath('..'))
from core.config import DROPPED_FEATURES
from core.data_loader import load_and_preprocess_data
from core.data_loader import get_data_paths

# Figures produced by this notebook are written here (created on demand).
FIG_DIR = '../paper_figures'
os.makedirs(FIG_DIR, exist_ok=True)

# Resolve where the training artifacts live. Precedence: the Google Drive
# folder, then the parent directory (detected through the main checkpoint
# file), and finally the current working directory.
drive_artifact_dir = '/content/drive/MyDrive/FAIIA_Models'
if os.path.exists(drive_artifact_dir):
    base_dir = drive_artifact_dir
    artifact_note = f"Using artifacts from Google Drive: {base_dir}"
elif os.path.exists('../edan_v3_main.pt'):
    base_dir = '..'
    artifact_note = f"Found artifacts in parent directory: {os.path.abspath(base_dir)}"
else:
    base_dir = '.'
    artifact_note = f"Using local artifacts in: {os.path.abspath(base_dir)}"
print(artifact_note)

## 1. Data & Preprocessing Artifacts
### Table D1 & D2: Dataset Statistics

In [None]:
def class_distribution_table(train_labels, test_labels):
    """Build the per-class sample-count table for the train and test splits.

    Parameters
    ----------
    train_labels, test_labels : array-like
        Class label of every sample in the respective split.

    Returns
    -------
    pandas.DataFrame
        One row per class (union of the classes seen in either split, in
        sorted order) with integer columns ``Train``, ``Test`` and ``Total``.
        A class that is absent from one split is counted as 0 there, so
        ``Total`` is never NaN.
    """
    train_counts = pd.Series(train_labels).value_counts().sort_index()
    test_counts = pd.Series(test_labels).value_counts().sort_index()
    # Align both splits on the union of classes; a plain DataFrame constructor
    # would leave NaN for classes that occur in only one split.
    classes = train_counts.index.union(test_counts.index)
    table = pd.DataFrame({
        'Train': train_counts.reindex(classes, fill_value=0),
        'Test': test_counts.reindex(classes, fill_value=0),
    })
    table['Total'] = table['Train'] + table['Test']
    return table


# Best-effort cell: any loading problem is reported and the notebook continues.
try:
    # Reload data to get raw stats (relying on cache if available)
    data_dir = "/content" if os.path.exists("/content") else ".."
    # Note: We need the test set categorical labels for Per-Attack Analysis later
    X_train, X_test, y_train, y_test, y_train_cat, y_test_cat = load_and_preprocess_data(data_dir=data_dir)

    print("\n=== Table D1: Dataset Statistics ===")
    # Summing the binary targets counts the positive class
    # (assumes labels are 0/1 with 1 = attack -- TODO confirm against the loader).
    n_train, n_test = len(y_train), len(y_test)
    attacks_train, attacks_test = y_train.sum(), y_test.sum()
    d1 = pd.DataFrame({
        'Split': ['Train', 'Test', 'Total'],
        'Samples': [n_train, n_test, n_train + n_test],
        'Minority (Attack)': [attacks_train, attacks_test, attacks_train + attacks_test],
        'Majority (Normal)': [
            n_train - attacks_train,
            n_test - attacks_test,
            (n_train - attacks_train) + (n_test - attacks_test),
        ],
    })
    d1['Imbalance Ratio'] = d1['Majority (Normal)'] / d1['Minority (Attack)']
    display(d1)

    if y_train_cat is not None:
        print("\n=== Table D2: Per-Attack Sample Distribution ===")
        # We need to map encoded values back to names if we had the encoder,
        # but for now we will show distribution of encoded classes
        d2 = class_distribution_table(y_train_cat, y_test_cat)
        display(d2)
except Exception as e:
    print(f"Could not load dataset for stats: {e}")

## 2. Model Architecture Artifacts
### Figure F1: FAIIA Architecture Diagram

```mermaid
graph TD
    Input[Input Features] --> Norm[BatchNorm]
    Norm --> ProbEst["Prob Estimator (MLP)"]
    ProbEst -->|Minority Prob| FAIIA[FAIIA Attention Block]
    Norm --> FAIIA
    
    subgraph FAIIA Block
    SelfAtt[Self-Attention] 
    Proto[Prototype Attention]
    Focal[Focal Modulation]
    
    SelfAtt --> Focal
    Proto --> Focal
    end

    FAIIA --> SE[Squeeze-and-Excitation]
    SE --> Res[Residual Hidden Blocks]
    Res --> Head[Classifier Head]
    Head --> Output[Attack Probability]
```

### Figure F2: FAIIA vs Vanilla DNN

```mermaid
graph LR
    subgraph Vanilla DNN
    I1[Input] --> L1[Linear+BN+ReLU] --> L2[Linear...] --> O1[Logits]
    end

    subgraph FAIIA-IDS
    I2[Input] --> F[FAIIA Attention] --> S[SE Block] --> H[Hidden Blocks] --> O2[Sigmoid Output]
    P[Minority Prototypes] -.-> F
    end
```

## 3. Training & Convergence Evidence
### Figure F3-F5: Training Curves

In [None]:
history_path = os.path.join(base_dir, 'edan_v3_history.csv')
vanilla_history_path = os.path.join(base_dir, 'vanilladnn_history.csv')

if not os.path.exists(history_path):
    print("Training history not found. Run train_main.py first.")
else:
    hist = pd.read_csv(history_path)
    epochs = range(1, len(hist) + 1)

    # The Vanilla DNN history is optional; its curves are overlaid in gray when present.
    hist_vanilla = None
    if os.path.exists(vanilla_history_path):
        hist_vanilla = pd.read_csv(vanilla_history_path)
        epochs_vanilla = range(1, len(hist_vanilla) + 1)

    # One panel per metric: (column suffix, FAIIA line color, panel title, y-axis label).
    # Each history CSV is read through its 'train_<suffix>' / 'val_<suffix>' columns.
    panel_specs = [
        ('loss', 'blue', 'Figure 6: Convergence plot (Loss vs Epoch)', 'Loss'),
        ('f1', 'green', 'Figure 1: F1-Score vs Epoch', 'F1 Score'),
        ('recall', 'red', 'Recall vs Epoch', 'Recall'),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (metric, faiia_color, panel_title, y_label) in zip(axes, panel_specs):
        train_col = f'train_{metric}'
        val_col = f'val_{metric}'

        ax.plot(epochs, hist[train_col], label='FAIIA Train', color=faiia_color)
        ax.plot(epochs, hist[val_col], label='FAIIA Val', linestyle='--', color=faiia_color)
        if hist_vanilla is not None:
            ax.plot(epochs_vanilla, hist_vanilla[train_col], label='Vanilla Train', color='gray', alpha=0.7)
            ax.plot(epochs_vanilla, hist_vanilla[val_col], label='Vanilla Val', linestyle='--', color='gray', alpha=0.7)

        ax.set_title(panel_title)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, 'training_curves.png'))
    plt.show()


## 4. Main Model Performance
### Table T1: Main Results

In [None]:
# Table T1: metrics of the main model, read from the CSV in the artifact directory.
metrics_path = os.path.join(base_dir, 'edan_v3_metrics.csv')
if not os.path.exists(metrics_path):
    print("Main model metrics not found.")
else:
    t1 = pd.read_csv(metrics_path)
    # Label the row with the model name (the assignment requires exactly one row).
    t1.index = ['FAIIA (EDAN v3)']
    display(t1)

### Table 2 & Figures 4-5: Per-Attack Analysis (Minority vs Majority)

In [None]:
from core.data_loader import get_data_paths

# Analyze Performance per Attack Category
DECISION_THRESHOLD = 0.5     # probability above which a sample is flagged as an attack
MAJORITY_MIN_SAMPLES = 5000  # categories with at least this many test rows are "Majority"

pred_path = os.path.join(base_dir, 'edan_v3_predictions.npz')

if os.path.exists(pred_path):
    # 1. Load Predictions
    # The context manager closes the .npz archive once the arrays are read.
    # np.ravel is a no-op for 1-D arrays and flattens (N, 1) outputs so they
    # can be used as DataFrame columns.
    with np.load(pred_path) as data:
        y_true_bin = np.ravel(data['y_true'])
        y_probs = np.ravel(data['y_probs'])

    # 2. Get Attack Categories
    _, test_path = get_data_paths(data_dir="/content" if os.path.exists("/content") else "..")

    if os.path.exists(test_path):
        df_test_raw = pd.read_csv(test_path)
        if len(df_test_raw) != len(y_probs):
            print(f"Warning: Test set size ({len(df_test_raw)}) does not match predictions ({len(y_probs)}).")
        else:
            # The per-attack numbers are only meaningful if predictions are in
            # the same row order as the raw CSV; equal length alone does not
            # prove that.
            # NOTE(review): assumes the raw CSV carries the binary target in a
            # 'label' column -- the check is skipped when it does not.
            if 'label' in df_test_raw.columns and not np.array_equal(df_test_raw['label'].to_numpy(), y_true_bin):
                print("Warning: saved y_true differs from the 'label' column of the test CSV; "
                      "per-attack metrics may be misaligned.")

            analysis_df = pd.DataFrame({
                'True Binary': y_true_bin,
                'Prob': y_probs,
                'Pred Binary': (y_probs > DECISION_THRESHOLD).astype(int),
                'Category': df_test_raw['attack_cat'].fillna('Normal')
            })
            # Merge the plural spelling into a single category name.
            analysis_df['Category'] = analysis_df['Category'].replace({'Backdoors': 'Backdoor'})

            # 'Detection Rate' is the share of a category's rows predicted as
            # attack. NOTE(review): for the 'Normal' category this is therefore
            # the share of normal traffic flagged as attack, not a recall.
            attack_metrics = [
                {
                    'Attack': cat,
                    'Samples': len(subset),
                    'Detection Rate': subset['Pred Binary'].sum() / len(subset),
                    'Type': 'Majority' if len(subset) >= MAJORITY_MIN_SAMPLES else 'Minority'
                }
                for cat, subset in analysis_df.groupby('Category', sort=False)
            ]
            t2 = pd.DataFrame(attack_metrics).sort_values('Samples', ascending=False)
            print("\n=== Table 2: Per-attack metrics (Minority < 5000 vs Majority >= 5000) ===")
            display(t2)

            # One bar chart per group: (group, palette, title, output file name).
            group_plots = [
                ('Minority', 'viridis', 'Figure 4: Minority detection comparison (Recall)', 'minority_detection.png'),
                ('Majority', 'magma', 'Figure 5: Majority detection comparison', 'majority_detection.png'),
            ]
            for group, palette, title, filename in group_plots:
                group_df = t2[t2['Type'] == group]
                if group_df.empty:
                    # Nothing to draw; avoid saving an empty figure.
                    print(f"No {group} categories found; skipping {filename}.")
                    continue

                plt.figure(figsize=(10, 6))
                sns.barplot(data=group_df, x='Attack', y='Detection Rate', palette=palette)
                plt.title(title)
                plt.xticks(rotation=45)
                plt.ylim(0, 1.1)
                plt.tight_layout()
                plt.savefig(os.path.join(FIG_DIR, filename))
                plt.show()
    else:
        print("Test CSV not found for per-attack analysis.")
else:
    print("Predictions not found for per-attack analysis.")


## 5. Classical Baseline Comparisons
### Table T2: Classical ML Baselines

In [None]:
# Look for the baseline summary in the working directory first, then in the
# artifact directory (e.g. Google Drive). The local name is kept as the
# default when neither location has the file.
baseline_candidates = ['baseline_summary.csv', os.path.join(base_dir, 'baseline_summary.csv')]
baseline_path = next(
    (candidate for candidate in baseline_candidates if os.path.exists(candidate)),
    'baseline_summary.csv',
)

if not os.path.exists(baseline_path):
    print("Baseline results not found. Run train_baselines.py first.")
else:
    t2 = pd.read_csv(baseline_path, index_col=0)
    display(t2)

## 6. Deep Learning Ablation Study
### Table T3: FAIIA Ablation

In [None]:
# Prefer a summary in the working directory; fall back to the artifact directory.
ablation_path = 'ablation_summary.csv'
if not os.path.exists(ablation_path) and os.path.exists(os.path.join(base_dir, 'ablation_summary.csv')):
    ablation_path = os.path.join(base_dir, 'ablation_summary.csv')

if os.path.exists(ablation_path):
    t3 = pd.read_csv(ablation_path, index_col=0)

    # Component flags are derived from substrings of the variant name (the index).
    component_cols = ['Attention', 'Prototypes', 'Focal']
    t3['Attention'] = t3.index.str.contains('FAIIA')
    # NOTE(review): 'Prototypes' uses the same pattern as 'Attention', so the
    # two columns are always identical -- confirm this matches the variants.
    t3['Prototypes'] = t3.index.str.contains('FAIIA')
    t3['Focal'] = t3.index.str.contains('Focal')
    # Show the component flags first, followed by the metric columns.
    t3 = t3[component_cols + [c for c in t3.columns if c not in component_cols]]
    display(t3)

    if 'F1-Score' in t3.columns:
        plot_df = t3.sort_values('F1-Score')

        # Create the figure only when there is something to draw, so a missing
        # column does not leave an empty figure behind.
        plt.figure(figsize=(12, 6))
        sns.barplot(x=plot_df.index, y=plot_df['F1-Score'], palette='coolwarm')
        plt.title('Figure 7: Ablation comparison')
        plt.ylabel('F1 Score')
        plt.xlabel('Model Variant')
        plt.xticks(rotation=45, ha='right')
        # Keep the 0.8-1.0 zoom when every variant fits in it; otherwise lower
        # the floor so weaker variants are not clipped out of the chart.
        y_floor = min(0.8, max(0.0, plot_df['F1-Score'].min() - 0.02))
        plt.ylim(y_floor, 1.0)
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(FIG_DIR, 'ablation_comparison.png'))
        plt.show()
    else:
        print("'F1-Score' column not found in ablation summary; skipping Figure 7.")
else:
    print("Ablation results not found. Run run_ablation.py first.")


## 7. Comparative Analysis Plots
### Figure F7 & F8: ROC and PR Curves

In [None]:
def plot_curves(models_dict, curve_type='ROC'):
    """Overlay ROC or precision-recall curves for several models and save the figure.

    Parameters
    ----------
    models_dict : dict
        Maps a display name to the path of an ``.npz`` file holding the arrays
        ``y_true`` and ``y_probs``. Entries whose file does not exist are
        skipped with a message.
    curve_type : str
        ``'ROC'`` draws ROC curves (with AUC in the legend); any other value
        draws precision-recall curves. The lower-cased value is also used in
        the output file name ``<curve_type>_curves.png`` under ``FIG_DIR``.

    Returns
    -------
    None
        The figure is saved and shown. When no prediction file is found,
        nothing is saved, so an earlier figure is not overwritten by an empty one.
    """
    is_roc = curve_type == 'ROC'
    plt.figure(figsize=(10, 8))

    if is_roc:
        # Chance-level reference line, drawn once rather than once per model.
        plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)

    n_plotted = 0
    for name, file_path in models_dict.items():
        if not os.path.exists(file_path):
            print(f"Skipping {name}: {file_path} not found.")
            continue

        # Close the archive as soon as both arrays are read into memory.
        with np.load(file_path) as data:
            y_true = data['y_true']
            y_probs = data['y_probs']

        if is_roc:
            fpr, tpr, _ = roc_curve(y_true, y_probs)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.4f})')
        else:
            precision, recall, _ = precision_recall_curve(y_true, y_probs)
            plt.plot(recall, precision, label=f'{name}')
        n_plotted += 1

    if n_plotted == 0:
        print(f"No prediction files found; skipping {curve_type} curves.")
        plt.close()
        return

    if is_roc:
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Figure 3: ROC curves per model')
    else:
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Figure 2: PR curves per model')

    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(FIG_DIR, f'{curve_type.lower()}_curves.png'))
    plt.show()

# Define prediction files: display name -> file name inside the artifact directory
prediction_filenames = {
    'FAIIA (Full)': 'edan_v3_predictions.npz',
    'Vanilla DNN': 'vanilladnn_focal_predictions.npz',
    'XGBoost': 'xgboost_predictions.npz',
    'LightGBM': 'lightgbm_predictions.npz',
}
pred_files = {
    model_name: os.path.join(base_dir, filename)
    for model_name, filename in prediction_filenames.items()
}

# Draw the ROC figure first, then the precision-recall figure.
for curve_kind in ('ROC', 'PR'):
    print(f"Plotting {curve_kind} Curves...")
    plot_curves(pred_files, curve_kind)

## 8. Efficiency
### Table A3: Model Complexity

In [None]:
# Load model to count parameters properly
from core.model import EDANv3
from core.ablation import VanillaDNN_Ablation
from core.config import V3_CONFIG

# Parameter counts depend on the input width, so use the real number of
# features when the dataset cell has populated X_train; otherwise fall back
# to an approximate value and say so.
FALLBACK_INPUT_DIM = 40  # approx
try:
    input_dim = int(X_train.shape[1])
except (NameError, AttributeError, IndexError, TypeError):
    input_dim = FALLBACK_INPUT_DIM
    print(f"X_train not available; using approximate input_dim={input_dim} for parameter counts.")

# Both models are instantiated on CPU only to count their parameters.
dnn = VanillaDNN_Ablation(input_dim=input_dim).to('cpu')
faiia = EDANv3(input_dim=input_dim,
               num_heads=V3_CONFIG['num_heads'],
               attention_dim=V3_CONFIG['attention_dim'],
               n_prototypes=V3_CONFIG['n_prototypes']).to('cpu')

efficiency_df = pd.DataFrame({
    'Model': ['Vanilla DNN', 'FAIIA (EDAN v3)'],
    'Parameters': [dnn.count_parameters(), faiia.count_parameters()],
    'Inference': ['Fast', 'Moderate'] # Placeholder, not a measurement
})
display(efficiency_df)