In [1]:
import json
import os
import numpy as np
import matplotlib.pyplot as plt
import gc
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import traceback
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random

# ==================================================================
# REPRODUCIBILITY SETTINGS
# ==================================================================
SEED = 42

def set_seed(seed=SEED):
    """Set all seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

# Set seeds at import time
set_seed(SEED)

In [2]:
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
CACHE_DIR_NAME = "activation_cache"

LAYER_CONFIG = {
    "Qwen2.5-7B": 
    {
        "attn": [15,16,18],
        "mlp": [16,18,20],
        "hidden": [18,19,20]
    },    
    "Falcon3-7B-Base": 
    {
        "attn": [2,7,12],
        "mlp": [10,11,12],
        "hidden": [2,3,19]
    }
}

# ==================================================================
# HYPERPARAMETERS
# ==================================================================
# The architecture is now: Input -> Encoder -> Latent(256) -> Head -> Output
LATENT_DIM = 256
HIDDEN_DIM = 1024  # Internal dimension for the Encoder
BATCH_SIZE = 64
EPOCHS = 100
LR = 1e-3
PATIENCE = 15

In [3]:
def stats_per_json(model_name, dataset_name):
    file_path = os.path.join(PROJECT_ROOT, CACHE_DIR_NAME, model_name, dataset_name, "generations", "hallucination_labels.json")
    with open(file_path, 'r') as file:
        data = json.load(file)
    total = len(data)
    hallucinations = sum(1 for item in data if item['is_hallucination'])
    allucinated_items = [item['instance_id'] for item in data if item['is_hallucination']]
    return {
        'total': total,
        'hallucinations': hallucinations,
        'hallucinated_items': allucinated_items,
        'model_name': model_name
    }

qwen_stats = stats_per_json("Qwen2.5-7B", "belief_bank")
falcon_stats = stats_per_json("Falcon3-7B-Base", "belief_bank")

def load_and_split_layers(model_name, dataset_name, layer_indices, type_layer, stats, train_indices, test_indices):
    print(f" Loading {model_name} [{type_layer}]: layers {layer_indices}...")
    total_samples = stats['total']
    hallucinated_set = set(stats['hallucinated_items'])
    
    y_full = np.zeros(total_samples, dtype=np.int8)
    y_full[list(hallucinated_set)] = 1
    y_train = y_full[train_indices]
    y_test  = y_full[test_indices]

    all_features = []
    for layer_idx in layer_indices:
        file_path = os.path.join(PROJECT_ROOT, CACHE_DIR_NAME, model_name, dataset_name,
                                 "activation_"+type_layer, f"layer{layer_idx}_activations.pt")
        if not os.path.exists(file_path): continue
        acts = torch.load(file_path, map_location='cpu')
        if acts.shape[0] > total_samples: acts = acts[:total_samples]
        X_layer = acts.float().numpy() if isinstance(acts, torch.Tensor) else acts.astype(np.float32)
        if X_layer.ndim > 2: X_layer = X_layer.reshape(X_layer.shape[0], -1)
        all_features.append(X_layer)
        del acts; gc.collect()

    if not all_features: raise ValueError(f"No layers found for {model_name}")
    X_full = np.concatenate(all_features, axis=1)
    X_train = X_full[train_indices]
    X_test  = X_full[test_indices]
    return X_train, X_test, y_train, y_test

class SimpleDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float() # BCE expects float
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

def get_generator(seed=SEED):
    g = torch.Generator()
    g.manual_seed(seed)
    return g

In [4]:
# ------------------------------------------------------------------
# 1. ENCODER: Maps Input Dimension -> Latent Dimension
# ------------------------------------------------------------------
class Encoder(nn.Module):
    def __init__(self, input_dim: int, latent_dim: int, hidden_dim: int = 1024, dropout: float = 0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, latent_dim),
            nn.LayerNorm(latent_dim) # Normalize latent space for stability
        )
        
    def forward(self, x):
        return self.net(x)

# ------------------------------------------------------------------
# 2. CLASSIFICATION HEAD: Maps Latent Dimension -> Probability
# ------------------------------------------------------------------
class ClassificationHead(nn.Module):
    def __init__(self, latent_dim: int, hidden_dim: int = 128, dropout: float = 0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1) # Binary output
        )
        
    def forward(self, x):
        return self.net(x).squeeze(-1)

    def predict(self, x):
        with torch.no_grad():
            logits = self.forward(x)
            return (torch.sigmoid(logits) > 0.5).long()

In [5]:
# ==================================================================
# STRATEGY 1: TRAIN TEACHER (Encoder + Head Jointly)
# ==================================================================
def train_teacher_pipeline(X_train, y_train, X_val, y_val, input_dim, device, model_name):
    print(f"   [Teacher] Training full pipeline for {model_name}...")
    
    # Initialize separate modules
    encoder = Encoder(input_dim, LATENT_DIM, HIDDEN_DIM).to(device)
    head = ClassificationHead(LATENT_DIM).to(device)
    
    # Combine parameters for optimizer
    params = list(encoder.parameters()) + list(head.parameters())
    optimizer = optim.AdamW(params, lr=LR, weight_decay=1e-2)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    
    # Class weights for imbalance
    n_pos = y_train.sum()
    n_neg = len(y_train) - n_pos
    pos_weight = torch.tensor([n_neg / n_pos]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    
    train_loader = DataLoader(SimpleDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True, generator=get_generator())
    val_loader = DataLoader(SimpleDataset(X_val, y_val), batch_size=BATCH_SIZE, shuffle=False)
    
    best_acc = 0.0
    patience_counter = 0
    best_states = None
    
    for epoch in range(EPOCHS):
        encoder.train(); head.train()
        train_loss = 0.0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            
            latents = encoder(X_batch)
            logits = head(latents)
            
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
        # Validation
        encoder.eval(); head.eval()
        all_preds, all_labels = [], []
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                latents = encoder(X_batch)
                preds = head.predict(latents)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(y_batch.numpy())
        
        acc = accuracy_score(all_labels, all_preds)
        scheduler.step()
        
        if acc > best_acc:
            best_acc = acc
            patience_counter = 0
            best_states = {
                'encoder': encoder.state_dict().copy(),
                'head': head.state_dict().copy()
            }
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"     Early stopping at epoch {epoch+1}. Best Acc: {best_acc:.4f}")
                break
                
    encoder.load_state_dict(best_states['encoder'])
    head.load_state_dict(best_states['head'])
    
    return encoder, head, best_acc

# ==================================================================
# STRATEGY 2: TRAIN STUDENT (New Encoder + Frozen Head)
# ==================================================================
def train_student_adapter(X_train, y_train, X_val, y_val, input_dim, frozen_head, device, student_name):
    print(f"   [Student] Training Adapter Encoder for {student_name} (Head Frozen)...")
    
    # 1. Freeze the Head
    frozen_head.eval()
    for param in frozen_head.parameters():
        param.requires_grad = False
        
    # 2. New Encoder for Student
    encoder = Encoder(input_dim, LATENT_DIM, HIDDEN_DIM).to(device)
    
    # 3. Optimize ONLY the encoder
    optimizer = optim.AdamW(encoder.parameters(), lr=LR, weight_decay=1e-2)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    
    n_pos = y_train.sum()
    n_neg = len(y_train) - n_pos
    pos_weight = torch.tensor([n_neg / n_pos]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    
    train_loader = DataLoader(SimpleDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True, generator=get_generator())
    val_loader = DataLoader(SimpleDataset(X_val, y_val), batch_size=BATCH_SIZE, shuffle=False)
    
    best_acc = 0.0
    patience_counter = 0
    best_state = None
    
    for epoch in range(EPOCHS):
        encoder.train()
        train_loss = 0.0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            
            # Forward pass: Student Input -> Student Encoder -> Frozen Head -> Loss
            latents = encoder(X_batch)
            logits = frozen_head(latents) # Head is fixed
            
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
        # Validation
        encoder.eval()
        all_preds, all_labels = [], []
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                latents = encoder(X_batch)
                preds = frozen_head.predict(latents)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(y_batch.numpy())
        
        acc = accuracy_score(all_labels, all_preds)
        scheduler.step()
        
        if acc > best_acc:
            best_acc = acc
            patience_counter = 0
            best_state = encoder.state_dict().copy()
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"     Early stopping at epoch {epoch+1}. Best Acc: {best_acc:.4f}")
                break
                
    encoder.load_state_dict(best_state)
    return encoder, best_acc

In [6]:
def plot_confusion_matrix(y_true, y_pred, title, filename):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    os.makedirs("confusion_matrices_frozen_head", exist_ok=True)
    plt.savefig(os.path.join("confusion_matrices_frozen_head", filename))
    plt.close()

# ==================================================================
# MAIN EXECUTION
# ==================================================================
n_samples = qwen_stats['total']
rng = np.random.RandomState(SEED)
shuffled_indices = rng.permutation(n_samples)
split = int(0.7 * n_samples)
train_indices, test_indices = shuffled_indices[:split], shuffled_indices[split:]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

results_log = []

# Define the two configurations
scenarios = [
    {"teacher": "Qwen2.5-7B", "student": "Falcon3-7B-Base"},
    {"teacher": "Falcon3-7B-Base", "student": "Qwen2.5-7B"}
]

for layer_type in ['attn', 'mlp', 'hidden']:
    print(f"\n{'='*60}")
    print(f"PROCESSING LAYER TYPE: {layer_type.upper()}")
    print(f"{'='*60}")
    
    # 1. Load and Preprocess Data for BOTH models first
    print("Loading data for Qwen...")
    X_qwen_tr, X_qwen_te, y_qwen_tr, y_qwen_te = load_and_split_layers(
        "Qwen2.5-7B", "belief_bank", LAYER_CONFIG["Qwen2.5-7B"][layer_type], 
        layer_type, qwen_stats, train_indices, test_indices)

    print("Loading data for Falcon...")
    X_falcon_tr, X_falcon_te, y_falcon_tr, y_falcon_te = load_and_split_layers(
        "Falcon3-7B-Base", "belief_bank", LAYER_CONFIG["Falcon3-7B-Base"][layer_type], 
        layer_type, falcon_stats, train_indices, test_indices)

    # Scaling (Independent for each model)
    s_qwen = StandardScaler()
    X_qwen_tr = s_qwen.fit_transform(X_qwen_tr)
    X_qwen_te = s_qwen.transform(X_qwen_te)

    s_falcon = StandardScaler()
    X_falcon_tr = s_falcon.fit_transform(X_falcon_tr)
    X_falcon_te = s_falcon.transform(X_falcon_te)

    # Pack data into a dictionary for easier access
    data_map = {
        "Qwen2.5-7B": {"X_tr": X_qwen_tr, "y_tr": y_qwen_tr, "X_te": X_qwen_te, "y_te": y_qwen_te},
        "Falcon3-7B-Base": {"X_tr": X_falcon_tr, "y_tr": y_falcon_tr, "X_te": X_falcon_te, "y_te": y_falcon_te}
    }

    # 2. Run Both Scenarios
    for sc in scenarios:
        t_name = sc['teacher']
        s_name = sc['student']
        print(f"\n--- SCENARIO: Teacher={t_name} -> Student={s_name} ---")
        
        # Get Data
        teacher_data = data_map[t_name]
        student_data = data_map[s_name]
        
        # Split Train into Train/Val for early stopping
        n_tr = len(teacher_data["X_tr"])
        idx = np.arange(n_tr)
        np.random.shuffle(idx)
        v_size = int(0.15 * n_tr)
        tr_idx, val_idx = idx[v_size:], idx[:v_size]
        
        # --- PHASE 1: Train Teacher ---
        enc_teacher, head_shared, best_acc_t = train_teacher_pipeline(
            teacher_data["X_tr"][tr_idx], teacher_data["y_tr"][tr_idx],
            teacher_data["X_tr"][val_idx], teacher_data["y_tr"][val_idx],
            input_dim=teacher_data["X_tr"].shape[1],
            device=device, model_name=t_name
        )
        
        # Evaluate Teacher on Test
        enc_teacher.eval(); head_shared.eval()
        with torch.no_grad():
            z_t = enc_teacher(torch.from_numpy(teacher_data["X_te"]).float().to(device))
            preds_t = head_shared.predict(z_t).cpu().numpy()
        
        t_f1 = f1_score(teacher_data["y_te"], preds_t)
        t_acc = accuracy_score(teacher_data["y_te"], preds_t)
        print(f"   [Result] Teacher ({t_name}) Test F1: {t_f1:.4f} | Acc: {t_acc:.4f}")
        plot_confusion_matrix(teacher_data["y_te"], preds_t, 
                              f"Teacher {t_name} ({layer_type})", f"cm_{layer_type}_teacher_{t_name}.png")

        # --- PHASE 2: Train Student with Frozen Head ---
        enc_student, best_acc_s = train_student_adapter(
            student_data["X_tr"][tr_idx], student_data["y_tr"][tr_idx],
            student_data["X_tr"][val_idx], student_data["y_tr"][val_idx],
            input_dim=student_data["X_tr"].shape[1],
            frozen_head=head_shared, # PASS FROZEN HEAD
            device=device, student_name=s_name
        )
        
        # Evaluate Student on Test
        enc_student.eval() # Head is already eval/frozen
        with torch.no_grad():
            z_s = enc_student(torch.from_numpy(student_data["X_te"]).float().to(device))
            preds_s = head_shared.predict(z_s).cpu().numpy()
            
        s_f1 = f1_score(student_data["y_te"], preds_s)
        s_acc = accuracy_score(student_data["y_te"], preds_s)
        
        print(f"   [Result] Student ({s_name}) Adapter Test F1: {s_f1:.4f} | Acc: {s_acc:.4f}")
        plot_confusion_matrix(student_data["y_te"], preds_s, 
                              f"Student {s_name} Adapter ({layer_type})", f"cm_{layer_type}_{s_name}_adapter.png")
        
        # Log results
        results_log.append({
            "layer": layer_type,
            "teacher": t_name,
            "student": s_name,
            "teacher_acc": t_acc,
            "teacher_f1": t_f1,
            "student_acc": s_acc,
            "student_f1": s_f1,
            "gap_acc": t_acc - s_acc
        })

    # Cleanup memory for next layer
    del X_qwen_tr, X_falcon_tr, data_map
    gc.collect()
    torch.cuda.empty_cache()

# Save final metrics
os.makedirs("results_metrics", exist_ok=True)
with open("results_metrics/frozen_head_results.json", "w") as f:
    json.dump(results_log, f, indent=4)

print("\nDONE! Summary:")
for r in results_log:
    print(f"[{r['layer']}] {r['teacher']}->{r['student']} | T_Acc: {r['teacher_acc']:.3f} | S_Acc: {r['student_acc']:.3f} | Gap: {r['gap_acc']:.3f}")

Using device: cuda

PROCESSING LAYER TYPE: ATTN
Loading data for Qwen...
 Loading Qwen2.5-7B [attn]: layers [15, 16, 18]...
Loading data for Falcon...
 Loading Falcon3-7B-Base [attn]: layers [2, 7, 12]...

--- SCENARIO: Teacher=Qwen2.5-7B -> Student=Falcon3-7B-Base ---
   [Teacher] Training full pipeline for Qwen2.5-7B...
     Early stopping at epoch 72. Best Acc: 0.9965
   [Result] Teacher (Qwen2.5-7B) Test F1: 0.9923 | Acc: 0.9910
   [Student] Training Adapter Encoder for Falcon3-7B-Base (Head Frozen)...
     Early stopping at epoch 55. Best Acc: 0.9312
   [Result] Student (Falcon3-7B-Base) Adapter Test F1: 0.9378 | Acc: 0.9244

--- SCENARIO: Teacher=Falcon3-7B-Base -> Student=Qwen2.5-7B ---
   [Teacher] Training full pipeline for Falcon3-7B-Base...
     Early stopping at epoch 56. Best Acc: 0.9354
   [Result] Teacher (Falcon3-7B-Base) Test F1: 0.9429 | Acc: 0.9300
   [Student] Training Adapter Encoder for Qwen2.5-7B (Head Frozen)...
     Early stopping at epoch 63. Best Acc: 0.9920
