# 🔥 Advanced Attention-KAN 2025 - SOTA Methods
## State-of-the-Art Defect Prediction

**Based on Latest Research (2024-2025):**
- Multi-Head Self-Attention (Transformer-style)
- Dual Attention (Channel + Instance)
- TabNet-Style Sparse Feature Selection
- Residual Connections + Layer Normalization
- Advanced Training: Cosine LR, Label Smoothing, Mixup

**Target:** Recall ≥90%, Accuracy ≥75%, Precision ≥45%

**Sources:**
- [Nature: Hybrid Deep Learning (2024)](https://www.nature.com/articles/s41598-024-65639-4)
- [PMC: Attention Feature Extraction (2024)](https://pmc.ncbi.nlm.nih.gov/articles/PMC11996211/)
- [PLOS: Attention GRU-LSTM (2024)](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0247444)

---

In [None]:
import os, glob, warnings, math
import numpy as np
import pandas as pd
from scipy.io import arff
from io import StringIO
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the whole session (this also hides
# library deprecation and data-conversion warnings).
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# Single seed shared by NumPy, PyTorch and the train/test splits further down.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    # Seed the current CUDA device as well when a GPU is present.
    torch.cuda.manual_seed(RANDOM_SEED)

print("‚úÖ Imports ready!")

In [None]:
# ============================================================================
# SOTA 1: MULTI-HEAD SELF-ATTENTION (Transformer-style)
# ============================================================================

class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention applied to one feature vector per sample.

    The input has no sequence axis, so the projected vector is split into
    ``num_heads`` chunks of ``head_dim`` values and attention is computed
    *between the heads* of the same sample (scores are
    ``[batch, heads, heads]``).

    Args:
        in_features: Width of the input vector; must be divisible by
            ``num_heads``.
        num_heads: Number of chunks ("heads") the projection is split into.
        dropout: Dropout probability applied to the attention matrix.
    """

    def __init__(self, in_features, num_heads=4, dropout=0.1):
        super().__init__()
        assert in_features % num_heads == 0, "in_features must be divisible by num_heads"

        self.in_features = in_features
        self.num_heads = num_heads
        self.head_dim = in_features // num_heads

        # Fused Q, K, V projection followed by an output projection.
        self.qkv = nn.Linear(in_features, in_features * 3)
        self.out_proj = nn.Linear(in_features, in_features)
        self.dropout = nn.Dropout(dropout)

        # Pre-norm: the input is normalised before the projections.
        self.norm = nn.LayerNorm(in_features)

    def forward(self, x):
        """Apply attention and a residual connection.

        Args:
            x: Tensor of shape ``[batch, in_features]``.

        Returns:
            Tuple ``(out, head_weights)`` where ``out`` is
            ``[batch, in_features]`` and ``head_weights`` is always
            ``[batch, num_heads]``: the attention each head receives,
            averaged over the querying heads.
        """
        batch_size = x.shape[0]

        x_norm = self.norm(x)

        # [batch, 3, heads, head_dim] -> three tensors of [batch, heads, head_dim]
        qkv = self.qkv(x_norm).reshape(batch_size, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(dim=1)

        # Scaled dot-product attention between heads.
        scale = math.sqrt(self.head_dim)
        attn = (q @ k.transpose(-2, -1)) / scale  # [batch, heads, heads]
        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)

        # [batch, heads, head_dim] -> [batch, head_dim, heads] -> [batch, in_features].
        # The transpose is kept as in the original layout so existing weights
        # for out_proj keep their meaning.
        out = (attn @ v).transpose(1, 2).reshape(batch_size, self.in_features)
        out = self.out_proj(out)

        # Residual connection around the whole attention block.
        out = out + x

        # Average over the query axis only. A bare squeeze() here used to drop
        # the batch axis for a single sample (and the head axis for one head),
        # giving callers an unpredictable rank.
        return out, attn.mean(dim=1)

print("‚úÖ Multi-Head Self-Attention ready!")

In [None]:
# ============================================================================
# SOTA 2: DUAL ATTENTION (Channel + Instance)
# ============================================================================

class ChannelAttention(nn.Module):
    """Channel attention: a per-feature gate computed for every sample.

    Each sample's feature vector is viewed as ``[features, 1]`` and run
    through average and max pooling. Because the pooled axis has length 1,
    both poolings return the value unchanged, so the two branches feed the
    same tensor to the shared bottleneck MLP and the resulting gate lies in
    the open interval (0, 2).
    """

    def __init__(self, in_features, reduction=4):
        super().__init__()
        bottleneck = in_features // reduction

        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.max_pool = nn.AdaptiveMaxPool1d(1)

        # Shared squeeze-and-excite style MLP ending in a sigmoid.
        self.fc = nn.Sequential(
            nn.Linear(in_features, bottleneck),
            nn.ReLU(),
            nn.Linear(bottleneck, in_features),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``(x * gate, gate)`` for ``x`` of shape ``[batch, features]``."""
        column = x.unsqueeze(-1)  # [batch, features, 1]

        pooled_avg = self.avg_pool(column).squeeze(-1)
        pooled_max = self.max_pool(column).squeeze(-1)

        # Sum of the two sigmoid-activated branches.
        gate = self.fc(pooled_avg) + self.fc(pooled_max)
        return x * gate, gate


class InstanceAttention(nn.Module):
    """Instance attention: one scalar weight in (0, 1) per sample.

    A small two-layer network maps each feature vector to a single
    sigmoid-activated weight that rescales the whole sample.
    """

    def __init__(self, in_features):
        super().__init__()
        half_width = in_features // 2
        self.attention = nn.Sequential(
            nn.Linear(in_features, half_width),
            nn.Tanh(),
            nn.Linear(half_width, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``(x * w, w)`` with ``x`` as ``[batch, features]`` and ``w`` as ``[batch, 1]``."""
        sample_weight = self.attention(x)
        scaled = x * sample_weight
        return scaled, sample_weight


class DualAttention(nn.Module):
    """Dual attention: LayerNorm, then channel gating, then instance gating.

    Only the channel gate is returned next to the transformed features; the
    per-sample instance weight is applied but not exposed.
    """

    def __init__(self, in_features, reduction=4):
        super().__init__()
        self.channel_attn = ChannelAttention(in_features, reduction)
        self.instance_attn = InstanceAttention(in_features)
        self.norm = nn.LayerNorm(in_features)

    def forward(self, x):
        """Return ``(features, channel_gate)`` for ``x`` of shape ``[batch, features]``."""
        normalized = self.norm(x)

        # Per-feature gate first ...
        gated, channel_gate = self.channel_attn(normalized)

        # ... then a per-sample rescale whose weight is discarded.
        reweighted, _ = self.instance_attn(gated)

        return reweighted, channel_gate

print("‚úÖ Dual Attention ready!")

In [None]:
# ============================================================================
# SOTA 3: TABNET-STYLE SPARSE FEATURE SELECTION
# ============================================================================

class SparseFeatureSelection(nn.Module):
    """TabNet-inspired feature selector.

    A feature transformer (Linear -> BatchNorm -> ReLU) is multiplied by a
    mask produced from a second Linear -> BatchNorm branch. The mask is
    normalised with a plain softmax over the feature axis, so it sums to 1
    per sample but is dense rather than truly sparse.

    ``virtual_batch_size`` is accepted for signature compatibility and is
    not used by this implementation.
    """

    def __init__(self, in_features, virtual_batch_size=128, momentum=0.02):
        super().__init__()
        self.in_features = in_features

        # Branch that produces the transformed features.
        self.transform = nn.Sequential(
            nn.Linear(in_features, in_features),
            nn.BatchNorm1d(in_features, momentum=momentum),
            nn.ReLU(),
        )

        # Branch that produces the mask logits.
        self.mask_generator = nn.Sequential(
            nn.Linear(in_features, in_features),
            nn.BatchNorm1d(in_features, momentum=momentum),
        )

    def forward(self, x, prior=None):
        """Return ``(masked_features, mask)``.

        Args:
            x: Tensor of shape ``[batch, in_features]``.
            prior: Optional tensor multiplied element-wise into the mask
                logits before the softmax.
        """
        transformed = self.transform(x)
        logits = self.mask_generator(x)

        if prior is not None:
            logits = logits * prior

        mask = F.softmax(logits, dim=-1)
        return transformed * mask, mask

print("‚úÖ Sparse Feature Selection ready!")

In [None]:
# ============================================================================
# KAN LAYERS (same as before)
# ============================================================================

class KANLinear(nn.Module):
    """KAN-style layer: learnable radial basis per edge plus a linear base term.

    For every (output, input) pair the layer keeps ``grid_size`` grid points,
    initialised evenly on [-1, 1], and one coefficient per grid point. The
    response of an edge is the coefficient-weighted sum of
    ``exp(-(x - grid)^2 / 0.5)``; edge responses are summed over the inputs
    and added to ``x @ base_weight.T``. Grid, coefficients and base weight
    are all registered as parameters.
    """

    def __init__(self, in_features, out_features, grid_size=5):
        super().__init__()
        knots = torch.linspace(-1, 1, grid_size)
        self.grid = nn.Parameter(
            knots.unsqueeze(0).unsqueeze(0).repeat(out_features, in_features, 1)
        )
        self.coef = nn.Parameter(torch.randn(out_features, in_features, grid_size) * 0.1)
        self.base_weight = nn.Parameter(torch.randn(out_features, in_features) * 0.1)

    def forward(self, x):
        """Map ``x`` of shape ``[batch, in_features]`` to ``[batch, out_features]``."""
        # [batch, 1, in, 1] - [1, out, in, grid] -> [batch, out, in, grid]
        offsets = x.unsqueeze(1).unsqueeze(-1) - self.grid.unsqueeze(0)
        basis = torch.exp(-torch.abs(offsets) ** 2 / 0.5)

        # Collapse the grid axis, then the input axis.
        spline_term = (basis * self.coef.unsqueeze(0)).sum(dim=-1).sum(dim=-1)
        linear_term = torch.matmul(x, self.base_weight.t())
        return spline_term + linear_term

print("‚úÖ KAN layers ready!")

In [None]:
# ============================================================================
# ADVANCED ATTENTION-KAN 2025 (ALL SOTA COMBINED)
# ============================================================================

class AdvancedAttentionKAN(nn.Module):
    """KAN classifier preceded by three attention / feature-selection stages.

    Pipeline: optional input projection -> multi-head self-attention ->
    dual attention -> sparse feature selection -> two KAN layers (each
    followed by LayerNorm, GELU and dropout) -> linear head -> sigmoid.

    Args:
        input_dim: Number of raw input features.
        hidden_dim: Width of the first KAN layer; the second uses
            ``hidden_dim // 2``.
        grid_size: Number of grid points per KAN edge.
        num_heads: Number of attention heads. ``input_dim`` is rounded up to
            the next multiple of ``num_heads`` (``proj_dim``) and a linear
            projection is inserted when the two differ.
    """

    def __init__(self, input_dim, hidden_dim=64, grid_size=5, num_heads=4):
        super().__init__()

        # Smallest multiple of num_heads that is >= input_dim.
        self.proj_dim = ((input_dim + num_heads - 1) // num_heads) * num_heads

        if input_dim != self.proj_dim:
            self.input_proj = nn.Linear(input_dim, self.proj_dim)
        else:
            self.input_proj = nn.Identity()

        self.multi_head_attn = MultiHeadSelfAttention(self.proj_dim, num_heads)
        self.dual_attn = DualAttention(self.proj_dim)
        self.sparse_select = SparseFeatureSelection(self.proj_dim)

        self.kan1 = KANLinear(self.proj_dim, hidden_dim, grid_size)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.dropout1 = nn.Dropout(0.3)

        self.kan2 = KANLinear(hidden_dim, hidden_dim // 2, grid_size)
        self.norm2 = nn.LayerNorm(hidden_dim // 2)
        self.dropout2 = nn.Dropout(0.3)

        self.output = nn.Linear(hidden_dim // 2, 1)

    def forward(self, x):
        """Return probabilities of shape ``[batch, 1]`` for ``x`` of shape ``[batch, input_dim]``."""
        x = self.input_proj(x)

        # The attention weights are only needed for interpretation, not here.
        x, _ = self.multi_head_attn(x)
        x, _ = self.dual_attn(x)
        x, _ = self.sparse_select(x)

        x = self.kan1(x)
        x = self.norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)

        x = self.kan2(x)
        x = self.norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)

        x = self.output(x)
        x = torch.sigmoid(x)

        return x

    def get_feature_importance(self, X):
        """Average the three attention signals into one score per feature.

        The model is switched to eval mode (and left there). Each stage is
        fed the output of the previous one, exactly as in ``forward``, so
        every module sees the input distribution it was trained on.

        Args:
            X: Array-like or tensor of shape ``[n_samples, input_dim]``.

        Returns:
            1-D NumPy array of length ``input_dim`` (mean over samples). When
            an input projection is in use the scores refer to projected
            features and are truncated to the first ``input_dim`` entries.
        """
        self.eval()
        if not isinstance(X, torch.Tensor):
            X = torch.FloatTensor(X)

        device = next(self.parameters()).device
        X = X.to(device)
        # Remember the raw width: it is needed after X has been projected.
        input_dim = X.shape[1]

        with torch.no_grad():
            h = self.input_proj(X)
            h, mh_attn = self.multi_head_attn(h)
            h, dual_attn = self.dual_attn(h)
            _, sparse_mask = self.sparse_select(h)

            # The self-attention module reports one weight per head. Spread
            # each head's weight evenly over the features of its slice so the
            # tensor lines up with the two per-feature signals. The reshape
            # restores a batch axis that a squeeze may have removed.
            mh_attn = mh_attn.reshape(h.shape[0], -1)
            slice_width = self.proj_dim // mh_attn.shape[1]
            if slice_width > 1:
                mh_attn = mh_attn.repeat_interleave(slice_width, dim=1) / slice_width

            combined = (mh_attn + dual_attn + sparse_mask) / 3.0
            importance = combined.cpu().numpy().mean(axis=0)

        if self.proj_dim != input_dim:
            importance = importance[:input_dim]

        return importance

print("‚úÖ Advanced Attention-KAN 2025 ready!")

In [None]:
# ============================================================================
# ADVANCED TRAINING TECHNIQUES
# ============================================================================

class FocalLossWithLabelSmoothing(nn.Module):
    """Binary focal loss with label smoothing and an extra positive-class weight.

    Args:
        alpha: Focal balancing factor applied to the (smoothed) positive part
            of the target; ``1 - alpha`` is applied to the negative part.
        gamma: Focusing exponent on ``(1 - pt)``.
        pos_weight: Multiplier for positive samples (penalises false
            negatives). For soft targets, e.g. after mixup, the multiplier is
            interpolated linearly between 1 (target 0) and ``pos_weight``
            (target 1).
        smoothing: Label-smoothing strength; targets are pulled towards 0.5.
    """

    def __init__(self, alpha=0.25, gamma=2.0, pos_weight=3.0, smoothing=0.1):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.pos_weight = pos_weight
        self.smoothing = smoothing

    def forward(self, inputs, targets):
        """Return the mean loss.

        Args:
            inputs: Predicted probabilities in [0, 1].
            targets: Labels in [0, 1] with the same shape as ``inputs``.
        """
        targets_smooth = targets * (1 - self.smoothing) + self.smoothing * 0.5

        bce = F.binary_cross_entropy(inputs, targets_smooth, reduction='none')
        pt = torch.exp(-bce)
        alpha_t = self.alpha * targets_smooth + (1 - self.alpha) * (1 - targets_smooth)
        focal = alpha_t * (1 - pt) ** self.gamma * bce

        # Equal to the old `targets == 1` mask for hard labels, but it also
        # weights mixed labels and does not rely on exact float equality or
        # in-place edits of a graph tensor. float() guards against NumPy
        # scalars coming from the hyper-parameter search.
        class_weight = targets * (float(self.pos_weight) - 1.0) + 1.0

        return (focal * class_weight).mean()


def mixup_data(x, y, alpha=0.2):
    """Blend every sample with a randomly chosen partner from the same batch.

    The mixing coefficient is drawn from Beta(alpha, alpha) with NumPy's
    global RNG; ``alpha <= 0`` disables mixing (coefficient 1). The partner
    permutation comes from ``torch.randperm``.

    Returns:
        Tuple ``(mixed_x, mixed_y)`` with the same shapes as ``x`` and ``y``.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    partner = torch.randperm(x.size(0)).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[partner]
    mixed_y = lam * y + (1 - lam) * y[partner]
    return mixed_x, mixed_y

print("‚úÖ Advanced loss functions ready!")

In [None]:
# ============================================================================
# TRAINING WITH ADVANCED TECHNIQUES
# ============================================================================

def train_advanced_model(model, X_train, y_train, X_val, y_val,
                        lr=0.001, epochs=50, pos_weight=3.0, warmup_epochs=5):
    """Train ``model`` with focal loss, mixup, LR warmup and cosine annealing.

    Args:
        model: Module mapping ``[batch, features]`` to probabilities ``[batch, 1]``.
        X_train, y_train: Training features and binary labels (array-like).
        X_val, y_val: Validation features and labels used for early stopping.
        lr: Peak learning rate, reached at the end of warmup.
        epochs: Maximum number of epochs.
        pos_weight: Positive-class weight forwarded to the loss.
        warmup_epochs: Number of epochs with a linearly increasing LR.

    Returns:
        The model, moved to the training device, with the weights of the last
        epoch that ran.

    NOTE(review): early stopping tracks the best validation recall but the
    best weights are not restored. Recall alone is maximised by predicting
    everything positive, so restoring on it blindly is not obviously better.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    X_train_t = torch.FloatTensor(X_train).to(device)
    y_train_t = torch.FloatTensor(y_train).unsqueeze(1).to(device)
    X_val_t = torch.FloatTensor(X_val).to(device)

    # BatchNorm1d raises in training mode on a batch holding a single sample,
    # so drop the trailing batch when it would contain exactly one row.
    batch_size = 32
    n_train = X_train_t.shape[0]
    drop_last = n_train > batch_size and n_train % batch_size == 1
    loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=batch_size,
                        shuffle=True, drop_last=drop_last)

    criterion = FocalLossWithLabelSmoothing(alpha=0.25, gamma=2.0, pos_weight=pos_weight, smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    # The scheduler is only stepped after warmup, so its period must be the
    # number of post-warmup epochs for the LR to actually reach eta_min.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=max(1, epochs - warmup_epochs), eta_min=lr / 100)

    best_recall = 0
    patience, patience_counter = 15, 0

    for epoch in range(epochs):
        model.train()

        # Linear warmup: overwrite the LR directly for the first epochs.
        if epoch < warmup_epochs:
            lr_scale = (epoch + 1) / warmup_epochs
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr * lr_scale

        for batch_X, batch_y in loader:
            # Apply mixup to roughly half of the batches.
            if np.random.random() < 0.5:
                batch_X, batch_y = mixup_data(batch_X, batch_y, alpha=0.2)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

        if epoch >= warmup_epochs:
            scheduler.step()

        # Validation recall at a fixed 0.5 threshold drives early stopping.
        model.eval()
        with torch.no_grad():
            val_out = model(X_val_t)
            val_pred = (val_out > 0.5).float().cpu().numpy().ravel()
            val_recall = recall_score(y_val, val_pred, zero_division=0)

        if val_recall > best_recall:
            best_recall = val_recall
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            break

    return model

print("‚úÖ Advanced training ready!")

In [None]:
# Data loading (same as before)
def load_arff(file_path):
    """Load an ARFF file into a DataFrame, with a lenient CSV fallback.

    The file is first parsed with ``scipy.io.arff.loadarff``; byte-string
    columns are decoded to ``str`` on a best-effort basis. If that parser
    fails, everything after the ``@data`` marker (case-insensitive) is read
    as header-less CSV, with ``?`` treated as a missing value.

    Args:
        file_path: Path of the ARFF file.

    Returns:
        A ``pandas.DataFrame``. Columns are named after the attributes on the
        primary path and numbered 0..n-1 on the fallback path.

    Raises:
        ValueError: If the fallback is needed and the file has no ``@data``
            section.
    """
    import re

    try:
        data, _ = arff.loadarff(file_path)
    except Exception:
        # Many NASA MDP files are slightly malformed; parse the data section
        # by hand instead of giving up.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        marker = re.search(r'@data', content, flags=re.IGNORECASE)
        if marker is None:
            raise ValueError(f"No @data section found in {file_path!r}")
        body = content[marker.end():].strip()
        return pd.read_csv(StringIO(body), header=None, na_values=['?'])

    df = pd.DataFrame(data)
    for col in df.columns:
        if df[col].dtype == object:
            try:
                df[col] = df[col].str.decode('utf-8')
            except Exception:
                # Best effort: leave columns that are not byte strings as-is.
                pass
    return df

def prepare_data(df):
    """Split, scale and oversample a defect dataset.

    The last column of ``df`` is the label and all other columns are
    features. Steps: cast features to float32, encode the label, fill
    missing feature values with the column median, stratified 80/20 split,
    min-max scaling fitted on the training part, then SMOTE on the training
    part (skipped with a message if it cannot be applied).

    Args:
        df: DataFrame whose last column is the class label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X = df.iloc[:, :-1].values.astype(np.float32)
    y = df.iloc[:, -1].values
    if y.dtype == object:
        y = LabelEncoder().fit_transform(y)
    else:
        y = y.astype(np.int32)

    nan_mask = np.isnan(X)
    if nan_mask.any():
        col_medians = np.nanmedian(X, axis=0)
        # A column that is entirely NaN has no median; fall back to 0 so no
        # NaN survives into scaling or SMOTE.
        col_medians = np.nan_to_num(col_medians, nan=0.0)
        X[nan_mask] = np.take(col_medians, np.where(nan_mask)[1])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    try:
        X_train, y_train = SMOTE(sampling_strategy=0.8, random_state=RANDOM_SEED).fit_resample(X_train, y_train)
    except Exception as exc:
        # Best effort: e.g. the classes are already more balanced than 0.8 or
        # the minority class is too small. Continue without resampling, but
        # say so instead of failing silently.
        print(f"SMOTE skipped: {exc}")

    return X_train, X_test, y_train, y_test

print("‚úÖ Data loading ready!")

In [None]:
# Threshold optimization
def find_optimal_threshold(model, X_val, y_val, target_recall=0.90):
    """Pick the decision threshold with the best F1 among those meeting a recall target.

    Thresholds 0.10, 0.15, ... up to (but not including) 0.70 are tried on
    the validation set.

    Args:
        model: Trained module returning probabilities.
        X_val: Validation features (array-like).
        y_val: Validation labels.
        target_recall: Minimum recall a threshold must reach to be eligible.

    Returns:
        The chosen threshold, or 0.5 when no candidate reaches
        ``target_recall`` with a positive F1.
    """
    # Run on whatever device the model lives on. Using CUDA availability
    # instead breaks when a CPU model is evaluated on a CUDA-capable host.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cpu')

    model.eval()
    with torch.no_grad():
        y_prob = model(torch.FloatTensor(X_val).to(device)).cpu().numpy().flatten()

    best_threshold, best_f1 = 0.5, 0
    for threshold in np.arange(0.1, 0.7, 0.05):
        y_pred = (y_prob >= threshold).astype(int)
        recall = recall_score(y_val, y_pred, zero_division=0)
        f1 = f1_score(y_val, y_pred, zero_division=0)
        if recall >= target_recall and f1 > best_f1:
            best_threshold, best_f1 = threshold, f1
    return best_threshold

print("‚úÖ Threshold optimization ready!")

In [None]:
# Simple GWO
class SimpleGWO:
    """Minimal Grey Wolf Optimizer that maximises ``fitness_func``.

    Args:
        bounds: Sequence of ``(low, high)`` pairs, one per dimension.
        fitness_func: Callable taking a 1-D position array and returning a
            score; larger is better.
        n_wolves: Population size.
        n_iter: Number of iterations.

    The three best scores seen so far (alpha, beta, delta) and their
    positions are kept across iterations. Random numbers come from NumPy's
    global RNG.
    """

    def __init__(self, bounds, fitness_func, n_wolves=6, n_iter=10):
        self.bounds = np.array(bounds)
        self.fitness_func = fitness_func
        self.n_wolves = n_wolves
        self.n_iter = n_iter
        self.dim = len(bounds)

        lower, upper = self.bounds[:, 0], self.bounds[:, 1]
        self.positions = np.random.uniform(lower, upper, size=(n_wolves, self.dim))

        worst = float('-inf')
        self.alpha_pos, self.alpha_score = np.zeros(self.dim), worst
        self.beta_pos, self.beta_score = np.zeros(self.dim), worst
        self.delta_pos, self.delta_score = np.zeros(self.dim), worst

    def _rank(self, score, position):
        """Insert a scored position into the alpha/beta/delta ranking."""
        if score > self.alpha_score:
            self.delta_score, self.delta_pos = self.beta_score, self.beta_pos.copy()
            self.beta_score, self.beta_pos = self.alpha_score, self.alpha_pos.copy()
            self.alpha_score, self.alpha_pos = score, position.copy()
            return
        if score > self.beta_score:
            self.delta_score, self.delta_pos = self.beta_score, self.beta_pos.copy()
            self.beta_score, self.beta_pos = score, position.copy()
            return
        if score > self.delta_score:
            self.delta_score, self.delta_pos = score, position.copy()

    @staticmethod
    def _pull(leader_value, current_value, a):
        """Candidate coordinate obtained by moving relative to one leader."""
        r1, r2 = np.random.random(2)
        return leader_value - (2 * a * r1 - a) * abs(2 * r2 * leader_value - current_value)

    def optimize(self):
        """Run the search and return ``(alpha_pos, alpha_score)``."""
        for it in range(self.n_iter):
            # Score every wolf and refresh the three leaders.
            for wolf in self.positions:
                self._rank(self.fitness_func(wolf), wolf)

            # Exploration factor decays linearly from 2 towards 0.
            a = 2 - it * (2.0 / self.n_iter)

            for i in range(self.n_wolves):
                for j in range(self.dim):
                    current = self.positions[i, j]
                    x1 = self._pull(self.alpha_pos[j], current, a)
                    x2 = self._pull(self.beta_pos[j], current, a)
                    x3 = self._pull(self.delta_pos[j], current, a)
                    self.positions[i, j] = np.clip(
                        (x1 + x2 + x3) / 3.0, self.bounds[j, 0], self.bounds[j, 1])
        return self.alpha_pos, self.alpha_score

print("‚úÖ GWO ready!")

In [None]:
# Visualization
def plot_importance(model, X_data, dataset_name, top_k=15):
    """Draw and save a horizontal bar chart of the highest-scoring features.

    Scores come from ``model.get_feature_importance(X_data)``. Features are
    labelled ``F<index>`` and bars are coloured with the plasma colormap,
    scaled by the largest displayed score. The figure is written to
    ``<dataset_name>_sota2025.png`` and then shown.
    """
    scores = model.get_feature_importance(X_data)
    ranked = np.argsort(scores)[::-1][:top_k]
    top_imp = scores[ranked]
    labels = [f'F{i}' for i in ranked]
    rows = range(len(top_imp))

    fig, ax = plt.subplots(figsize=(10, 6))
    bar_colors = plt.cm.plasma(top_imp / top_imp.max())
    ax.barh(rows, top_imp, color=bar_colors)
    ax.set_yticks(rows)
    ax.set_yticklabels(labels)
    ax.set_xlabel('Attention Weight', fontweight='bold')
    ax.set_title(f'{dataset_name}: Top {top_k} Features (SOTA 2025)', fontweight='bold')
    # Largest score on top.
    ax.invert_yaxis()
    ax.grid(axis='x', alpha=0.3)

    # Print the numeric value just right of each bar.
    for row, value in enumerate(top_imp):
        ax.text(value + 0.01, row, f'{value:.3f}', va='center', fontsize=9)

    plt.tight_layout()
    plt.savefig(f'{dataset_name}_sota2025.png', dpi=300, bbox_inches='tight')
    plt.show()

print("‚úÖ Visualization ready!")

In [None]:
# Main Execution
def run_sota_experiment(dataset_dir='/content/drive/MyDrive/nasa-defect-gwo-kan/dataset'):
    """Run the full pipeline on the PC1, CM1 and KC1 ARFF files in a directory.

    For each matching file: load and prepare the data, carve a validation
    split out of the (already resampled) training part, tune
    ``hidden_dim``, ``grid_size``, ``lr`` and ``pos_weight`` with
    ``SimpleGWO``, train a final model, choose a threshold on validation data
    and evaluate on the test split. A feature-importance chart is attempted
    last; if it fails the metrics are still kept.

    Args:
        dataset_dir: Directory searched for ``*.arff`` files.

    Returns:
        DataFrame with one row per successfully processed dataset plus a
        final ``AVERAGE`` row. The ``AVERAGE`` row is always present; its
        metrics are NaN when no dataset could be processed.

    NOTE(review): the validation split is taken after SMOTE, so it contains
    synthetic samples; validation scores are therefore optimistic.
    """
    files = [f for f in glob.glob(os.path.join(dataset_dir, '*.arff'))
             if any(ds in os.path.basename(f).upper() for ds in ['PC1', 'CM1', 'KC1'])]
    results = []

    for file_path in files:
        dataset_name = os.path.basename(file_path).replace('.arff', '')
        print(f"\n{'='*60}\nüöÄ {dataset_name}: SOTA 2025 Training...\n{'='*60}")

        try:
            df = load_arff(file_path)
            X_train, X_test, y_train, y_test = prepare_data(df)
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=RANDOM_SEED)
            input_dim = X_train.shape[1]

            print("üîß GWO optimization...")
            def fitness(params):
                # A configuration that fails to train scores 0 so the search
                # can continue with the remaining wolves.
                try:
                    candidate = AdvancedAttentionKAN(input_dim, int(params[0]), int(params[1]), num_heads=4)
                    candidate = train_advanced_model(candidate, X_train, y_train, X_val, y_val, params[2], 30, params[3], 5)
                    cand_threshold = find_optimal_threshold(candidate, X_val, y_val, 0.90)
                    cand_device = next(candidate.parameters()).device
                    candidate.eval()
                    with torch.no_grad():
                        val_prob = candidate(torch.FloatTensor(X_val).to(cand_device)).cpu().numpy().flatten()
                    val_pred = (val_prob >= cand_threshold).astype(int)
                    return (0.6 * recall_score(y_val, val_pred, zero_division=0)
                            + 0.3 * f1_score(y_val, val_pred, zero_division=0)
                            + 0.1 * accuracy_score(y_val, val_pred))
                except Exception:
                    return 0.0

            gwo = SimpleGWO([(48, 128), (4, 8), (0.0005, 0.005), (2.5, 6.0)], fitness, 6, 10)
            best_params, _ = gwo.optimize()
            hidden_dim, grid_size, lr, pos_weight = int(best_params[0]), int(best_params[1]), best_params[2], best_params[3]

            print("ü§ñ Training final model...")
            model = AdvancedAttentionKAN(input_dim, hidden_dim, grid_size, num_heads=4)
            model = train_advanced_model(model, X_train, y_train, X_val, y_val, lr, 50, pos_weight, 5)

            print("üéØ Finding threshold...")
            threshold = find_optimal_threshold(model, X_val, y_val, 0.90)

            print("üìà Evaluating...")
            device = next(model.parameters()).device
            model.eval()
            with torch.no_grad():
                y_prob = model(torch.FloatTensor(X_test).to(device)).cpu().numpy().flatten()
            y_pred = (y_prob >= threshold).astype(int)

            metrics = {
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred, zero_division=0),
                'Recall': recall_score(y_test, y_pred, zero_division=0),
                'F1': f1_score(y_test, y_pred, zero_division=0),
                'F2': fbeta_score(y_test, y_pred, beta=2, zero_division=0),
                'AUC': roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else 0
            }

            # Record the metrics before plotting so that a chart failure
            # cannot throw away a finished evaluation.
            results.append({'Dataset': dataset_name, 'Threshold': threshold, **metrics})

            print("üìä Creating heatmap...")
            try:
                plot_importance(model, X_test, dataset_name, 15)
            except Exception as plot_error:
                print(f"Heatmap skipped: {plot_error}")

            print(f"‚úÖ {dataset_name} complete!\n")
        except Exception as e:
            print(f"‚ùå Error: {e}\n")

    metric_cols = ['Accuracy', 'Precision', 'Recall', 'F1', 'F2', 'AUC']
    df = pd.DataFrame(results, columns=['Dataset', 'Threshold'] + metric_cols)

    avg = {'Dataset': 'AVERAGE'}
    for col in metric_cols:
        # With no successful dataset there is nothing to average.
        avg[col] = df[col].mean() if len(df) else float('nan')
    avg_row = pd.DataFrame([avg], columns=df.columns)

    df = pd.concat([df, avg_row], ignore_index=True) if len(df) else avg_row
    return df

print("‚úÖ Main execution ready!")

In [None]:
# Entry point: print the banner, run the experiment, then report and save results.
_RULE = "=" * 60


def _mark(ok):
    """Return the pass or fail marker used in the summary lines."""
    return '‚úÖ' if ok else '‚ùå'


print("\n" + _RULE)
print("üî• ADVANCED ATTENTION-KAN 2025 - SOTA METHODS")
print(_RULE)
print("\n‚ú® Techniques:")
for _technique in (
    "  1. Multi-Head Self-Attention (4 heads)",
    "  2. Dual Attention (Channel + Instance)",
    "  3. TabNet-Style Sparse Selection",
    "  4. Residual Connections + LayerNorm",
    "  5. Cosine Annealing LR + Warmup",
    "  6. Label Smoothing + Mixup",
    "  7. Gradient Clipping + AdamW",
):
    print(_technique)
print("\nüéØ Target: Recall‚â•90%, Accuracy‚â•75%, Precision‚â•45%\n")

results = run_sota_experiment('/content/drive/MyDrive/nasa-defect-gwo-kan/dataset')

print("\n" + _RULE)
print("üìä FINAL RESULTS - SOTA 2025")
print(_RULE)
print(results.to_string(index=False))

results.to_excel('sota_2025_results.xlsx', index=False)
print("\nüíæ Saved: sota_2025_results.xlsx")

print("\n" + _RULE)
print("üéØ AVERAGE METRICS")
print(_RULE)
# The summary row appended by run_sota_experiment.
avg = results[results['Dataset'] == 'AVERAGE'].iloc[0]
print(f"\n  Accuracy:  {avg['Accuracy']:.4f} {_mark(avg['Accuracy'] >= 0.75)}")
print(f"  Precision: {avg['Precision']:.4f} {_mark(avg['Precision'] >= 0.45)}")
print(f"  Recall:    {avg['Recall']:.4f} {_mark(avg['Recall'] >= 0.90)} ‚≠ê")
print(f"  F1-Score:  {avg['F1']:.4f} {_mark(avg['F1'] >= 0.55)}")
print(f"  F2-Score:  {avg['F2']:.4f}")
print(f"  AUC:       {avg['AUC']:.4f}")
print("\n" + _RULE)
print("üöÄ SOTA 2025 COMPLETE!")
print(_RULE)

In [None]:
# Comparison plot: one panel per metric, one bar per dataset.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
fig.suptitle('Advanced Attention-KAN 2025 - SOTA Results', fontsize=16, fontweight='bold')

metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'F2', 'AUC']
colors = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c']

# Target lines for the secondary metrics; Recall is styled separately below.
_secondary_targets = {
    'Accuracy': (0.75, 'darkred'),
    'Precision': (0.45, 'darkblue'),
    'F1': (0.55, 'darkgreen'),
}

# The AVERAGE row is a summary, not a dataset, so it is left out of the bars.
plot_data = results[results['Dataset'] != 'AVERAGE']

for ax, metric, color in zip(axes.flat, metrics, colors):
    ax.barh(plot_data['Dataset'], plot_data[metric], color=color, alpha=0.8)
    ax.set_xlabel(metric, fontweight='bold', fontsize=12)
    ax.set_xlim(0, 1)
    ax.grid(axis='x', alpha=0.3)

    if metric == 'Recall':
        # Primary metric: highlighted panel with a solid-colour target line.
        ax.axvline(x=0.90, color='red', linestyle='--', linewidth=2, label='Target')
        ax.set_facecolor('#ffe6e6')
        ax.set_title('‚≠ê PRIMARY ‚≠ê', fontsize=11, color='red', fontweight='bold')
    elif metric in _secondary_targets:
        target_value, line_color = _secondary_targets[metric]
        ax.axvline(x=target_value, color=line_color, linestyle='--', linewidth=2, alpha=0.6)

plt.tight_layout()
plt.savefig('sota_2025_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print("üíæ Saved: sota_2025_comparison.png")