In [2]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import re

# 1. Load dataset
# `ds` is reused by every later cell (they index it with "train" and "validation").
ds = load_dataset("zeroshot/twitter-financial-news-sentiment")

def clean_text(text):
    """Return ``text`` with URLs removed and whitespace normalised.

    Every run of non-whitespace characters starting with ``http`` is dropped,
    then each run of whitespace is collapsed to a single space and leading /
    trailing whitespace is stripped.
    """
    without_urls = re.sub(r"http\S+", "", text)
    single_spaced = re.sub(r"\s+", " ", without_urls)
    return single_spaced.strip()

# Replace each example's "text" field with its cleaned version.
# Note: this rebinds `ds`, so the raw (uncleaned) text is no longer reachable through it.
ds = ds.map(lambda x: {"text": clean_text(x["text"])})


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
import json
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

import numpy as np
import evaluate

Fine-Tune Hyperparameters of the Model

Train Sparse Autoencoder on FinBERT Activations

This trains sparse autoencoders (SAEs) that decompose FinBERT's 768-dimensional activations into interpretable sparse features. One SAE is trained per latent size: 4k, 8k, 16k and 32k features.


In [8]:
# This cell trains sparse autoencoders (SAEs) from scratch on hidden activations
# captured from the fine-tuned BERT classifier loaded further down in this cell.
# Configuration
LAYER_TO_EXTRACT = 8  # 0-based index into model.bert.encoder.layer (the layer whose output is hooked)
LATENT_DIMS = [4096, 8192, 16384, 32768]  # Train SAEs with 4k, 8k, 16k, 32k features
L1_COEFFICIENT = 1e-3  # Weight of the L1 sparsity term in the training loss
LEARNING_RATE = 1e-3  # Adam learning rate for the SAE
BATCH_SIZE = 32  # Token activations per SAE training step
NUM_EPOCHS = 3  # Passes over the collected activations per SAE

# Create SAE save directory (no error if it already exists)
Path("./finbert_sae").mkdir(exist_ok=True)

# Define Sparse Autoencoder (compatible with OpenAI's architecture)
class SparseAutoencoder(nn.Module):
    """Autoencoder with one linear encoder, a ReLU latent code and one linear decoder.

    Args:
        input_dim: Size of the vectors being reconstructed.
        latent_dim: Number of latent features.

    ``forward`` returns ``(reconstruction, latent)``.
    """

    def __init__(self, input_dim=768, latent_dim=32768):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        # input -> latent, then latent -> reconstruction (both with bias)
        self.encoder = nn.Linear(input_dim, latent_dim, bias=True)
        self.decoder = nn.Linear(latent_dim, input_dim, bias=True)

        # Rescale the decoder weight so that each column (one per latent
        # feature) starts with unit L2 norm.
        with torch.no_grad():
            unit_columns = nn.functional.normalize(self.decoder.weight.data, dim=0)
            self.decoder.weight.data = unit_columns

    def encode(self, x):
        """Map inputs to the non-negative latent code (linear layer + ReLU)."""
        return nn.functional.relu(self.encoder(x))

    def decode(self, latent):
        """Map a latent code back to input space with the linear decoder."""
        reconstruction = self.decoder(latent)
        return reconstruction

    def forward(self, x):
        """Encode then decode ``x``; returns ``(reconstruction, latent)``."""
        latent = self.encode(x)
        return self.decode(latent), latent

    def get_feature_activations(self, x):
        """Same as ``encode`` but evaluated without gradient tracking."""
        with torch.no_grad():
            features = self.encode(x)
        return features

# Load the fine-tuned model
# NOTE: this directory is written by the fine-tuning cell of this notebook
# (trainer.save_model / tokenizer.save_pretrained), which appears further down;
# that cell must have been run before this one.
save_dir = "./finbert_twitter_ft/best"
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)

# Use the GPU when one is available; the model is only used for inference here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load dataset (activations are collected from the training split)
train_ds = ds["train"]

print(f"Collecting activations from {len(train_ds)} training samples...")
print(f"Target layer: {LAYER_TO_EXTRACT}")
print(f"Will train SAEs with latent dimensions: {LATENT_DIMS}")

# Collect training activations
# all_activations: one CPU tensor of kept token activations per sample.
# captured_activations: scratch list filled by the forward hook and cleared before each forward pass.
all_activations = []
captured_activations = []

def capture_hook(module, input, output):
    """Forward hook that records the hooked module's output.

    If the module returns a tuple, its first element is taken as the hidden
    states; otherwise the output itself is used. The detached tensor is
    appended to the module-level ``captured_activations`` list.
    """
    hidden_states = output[0] if isinstance(output, tuple) else output
    # detach() only drops the autograd graph; the tensor stays on its current device.
    captured_activations.append(hidden_states.detach())

# Register hook
target_layer = model.bert.encoder.layer[LAYER_TO_EXTRACT]
hook_handle = target_layer.register_forward_hook(capture_hook)

# The tokenizer's special-token ids never change between samples, so build the
# lookup tensor once on the target device instead of rebuilding a Python set
# (and calling .item() per token, which forces a device sync) for every sample.
special_ids_tensor = torch.tensor(
    sorted(set(tokenizer.all_special_ids)), dtype=torch.long, device=device
)

# Collect activations from all training data
print("Extracting activations from training set...")
print("Filtering out ALL special tokens (CLS, SEP, PAD, UNK, MASK, etc.) - keeping only content tokens...")
try:
    with torch.no_grad():
        for sample in tqdm(train_ds):
            text = sample["text"]
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)
            inputs = inputs.to(device)

            # The hook appends to captured_activations during the forward pass.
            captured_activations.clear()
            _ = model(**inputs)

            if captured_activations:
                # Hidden states of the hooked layer with the batch dim removed: [seq_len, hidden]
                activation = captured_activations[0].squeeze(0)

                attention_mask = inputs["attention_mask"].squeeze(0).bool()
                token_ids = inputs["input_ids"].squeeze(0)

                # Keep positions that are attended to AND are not special tokens
                # (CLS, SEP, PAD, UNK, MASK, etc.); computed entirely on `device`.
                not_special = ~torch.isin(token_ids, special_ids_tensor)
                valid_mask = attention_mask & not_special

                # Only keep activations for real content tokens
                activation = activation[valid_mask]

                # Only add if there are real tokens; move to CPU for storage
                if activation.shape[0] > 0:
                    all_activations.append(activation.cpu())
finally:
    # Always detach the hook, even if extraction is interrupted or fails;
    # otherwise a re-run of this cell would stack a second hook on the layer.
    hook_handle.remove()

# Flatten all activations into a single tensor [total_tokens, hidden_dim]
if not all_activations:
    # torch.cat fails with an opaque message on an empty list; fail with a clear one instead.
    raise RuntimeError("No token activations were collected; cannot train an SAE.")
all_activations_tensor = torch.cat(all_activations, dim=0)
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print(f"\nCollected {all_activations_tensor.shape[0]} token activations")
print(f"Activation shape: {all_activations_tensor.shape}")

# Train SAEs for each latent dimension
from torch.utils.data import TensorDataset, DataLoader

# Width of the collected activations; used instead of a hard-coded 768 so the
# SAE always matches the data it is trained on.
INPUT_DIM = all_activations_tensor.shape[1]

# The activation dataset is the same for every SAE size, so wrap it once.
dataset = TensorDataset(all_activations_tensor)

for LATENT_DIM in LATENT_DIMS:
    # "\n" (not "\\n") so the banners are preceded by a real blank line.
    print(f"\n{'='*80}")
    print(f"Training SAE with {LATENT_DIM} latent features ({LATENT_DIM//1024}k)")
    print(f"{'='*80}")

    # Create SAE
    sae = SparseAutoencoder(input_dim=INPUT_DIM, latent_dim=LATENT_DIM)
    sae.to(device)

    # Optimizer
    optimizer = optim.Adam(sae.parameters(), lr=LEARNING_RATE)

    # Fresh shuffling DataLoader per SAE
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Training loop
    print(f"\nTraining SAE for {NUM_EPOCHS} epochs...")
    sae.train()

    for epoch in range(NUM_EPOCHS):
        total_loss = 0
        total_recon_loss = 0
        total_l1_loss = 0

        for (batch_x,) in dataloader:
            batch_x = batch_x.to(device)

            # Forward pass
            reconstruction, latent = sae(batch_x)

            # Reconstruction loss (MSE)
            recon_loss = nn.functional.mse_loss(reconstruction, batch_x)

            # L1 sparsity loss.
            # NOTE(review): .mean() averages over the batch AND the latent
            # dimension, so the penalty per feature shrinks as LATENT_DIM
            # grows - confirm this scaling is intended.
            l1_loss = latent.abs().mean()

            # Combined loss
            loss = recon_loss + L1_COEFFICIENT * l1_loss

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Renormalize decoder weight columns to unit norm after each step
            with torch.no_grad():
                sae.decoder.weight.data = nn.functional.normalize(
                    sae.decoder.weight.data, dim=0
                )

            total_loss += loss.item()
            total_recon_loss += recon_loss.item()
            total_l1_loss += l1_loss.item()

        # Per-epoch averages over the number of batches
        avg_loss = total_loss / len(dataloader)
        avg_recon = total_recon_loss / len(dataloader)
        avg_l1 = total_l1_loss / len(dataloader)

        print(f"Epoch {epoch+1}/{NUM_EPOCHS}: Loss={avg_loss:.4f}, "
              f"Recon={avg_recon:.4f}, L1={avg_l1:.4f}")

    # Save the trained SAE (plain tensors + config dict; read back by load_sae)
    SAE_SAVE_PATH = f"./finbert_sae/layer_{LAYER_TO_EXTRACT}_{LATENT_DIM//1024}k.pt"
    print(f"\nSaving trained SAE to {SAE_SAVE_PATH}")
    torch.save({
        'encoder_weight': sae.encoder.weight.data.cpu(),
        'encoder_bias': sae.encoder.bias.data.cpu(),
        'decoder_weight': sae.decoder.weight.data.cpu(),
        'decoder_bias': sae.decoder.bias.data.cpu(),
        'config': {
            'input_dim': INPUT_DIM,
            'latent_dim': LATENT_DIM,
            'layer': LAYER_TO_EXTRACT,
            'model': save_dir,
        }
    }, SAE_SAVE_PATH)

    # Test sparsity: fraction of (token, feature) pairs with a positive
    # activation on the first 1000 collected activations.
    sae.eval()
    with torch.no_grad():
        sample_acts = all_activations_tensor[:1000].to(device)
        sample_latent = sae.encode(sample_acts)
        sparsity = (sample_latent > 0).float().mean()
        print(f"\n‚úì SAE trained successfully!")
        print(f"  Average sparsity: {sparsity:.2%} of features active")
        print(f"  Saved to: {SAE_SAVE_PATH}")

# Final summary of the checkpoints written above.
# "\n" (not "\\n") so a real newline is printed instead of a literal backslash-n.
print(f"\n{'='*80}")
print("All SAEs trained successfully!")
print("Available SAE models:")
for dim in LATENT_DIMS:
    print(f"  - layer_{LAYER_TO_EXTRACT}_{dim//1024}k.pt ({dim} features)")
print("\nThese SAEs can now be used in main.py for interpretability analysis!")
print(f"{'='*80}")


Collecting activations from 9543 training samples...
Target layer: 8
Will train SAEs with latent dimensions: [4096, 8192, 16384, 32768]
Extracting activations from training set...
Filtering out ALL special tokens (CLS, SEP, PAD, UNK, MASK, etc.) - keeping only content tokens...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9543/9543 [02:07<00:00, 74.60it/s] 


\nCollected 172027 token activations
Activation shape: torch.Size([172027, 768])
Training SAE with 4096 latent features (4k)
\nTraining SAE for 3 epochs...
Epoch 1/3: Loss=0.0399, Recon=0.0397, L1=0.2533
Epoch 2/3: Loss=0.0205, Recon=0.0203, L1=0.2581
Epoch 3/3: Loss=0.0196, Recon=0.0194, L1=0.2516
\nSaving trained SAE to ./finbert_sae/layer_8_4k.pt
\n‚úì SAE trained successfully!
  Average sparsity: 30.57% of features active
  Saved to: ./finbert_sae/layer_8_4k.pt
Training SAE with 8192 latent features (8k)
\nTraining SAE for 3 epochs...
Epoch 1/3: Loss=0.0442, Recon=0.0441, L1=0.1266
Epoch 2/3: Loss=0.0209, Recon=0.0208, L1=0.1288
Epoch 3/3: Loss=0.0198, Recon=0.0197, L1=0.1257
\nSaving trained SAE to ./finbert_sae/layer_8_8k.pt
\n‚úì SAE trained successfully!
  Average sparsity: 15.30% of features active
  Saved to: ./finbert_sae/layer_8_8k.pt
Training SAE with 16384 latent features (16k)
\nTraining SAE for 3 epochs...
Epoch 1/3: Loss=0.0533, Recon=0.0533, L1=0.0641
Epoch 2/3: Loss=

Load Trained SAE for Inference

Use this cell to load a specific SAE model based on the latent dimension you want to use.


In [None]:
# Helper function to load a trained SAE
def load_sae(layer=8, latent_size="32k"):
    """
    Load a trained SAE model from ``./finbert_sae/layer_{layer}_{latent_size}.pt``.

    Relies on the module-level ``device`` (used for ``map_location`` and for
    moving the model) and on the ``SparseAutoencoder`` class defined in the
    training cell.

    Args:
        layer: The layer number (default: 8)
        latent_size: Size of latent dimension as string: "4k", "8k", "16k", or "32k"

    Returns:
        sae: The loaded SAE model, on ``device`` and in eval mode
        config: Configuration dictionary stored in the checkpoint

    Raises:
        FileNotFoundError: if no checkpoint exists for the requested layer/size.
        RuntimeError: if the stored tensors do not match the shapes implied by
            the stored config.
    """
    sae_path = f"./finbert_sae/layer_{layer}_{latent_size}.pt"

    # The checkpoint holds only tensors and a dict of plain values, so restrict
    # unpickling to those regardless of the installed torch version's default.
    checkpoint = torch.load(sae_path, map_location=device, weights_only=True)

    # Create SAE model with the dimensions recorded at training time
    config = checkpoint['config']
    sae = SparseAutoencoder(input_dim=config['input_dim'], latent_dim=config['latent_dim'])

    # Load weights through load_state_dict so that missing keys or shape
    # mismatches raise immediately (assigning to .data would accept any tensor).
    sae.load_state_dict({
        'encoder.weight': checkpoint['encoder_weight'],
        'encoder.bias': checkpoint['encoder_bias'],
        'decoder.weight': checkpoint['decoder_weight'],
        'decoder.bias': checkpoint['decoder_bias'],
    })

    sae.to(device)
    sae.eval()

    print(f"‚úì Loaded SAE from {sae_path}")
    print(f"  Layer: {config['layer']}")
    print(f"  Input dim: {config['input_dim']}")
    print(f"  Latent dim: {config['latent_dim']}")

    return sae, config

# Example usage:
# Load the 32k latent dimension SAE
# sae_32k, config = load_sae(layer=8, latent_size="32k")

# Load the 16k latent dimension SAE
# sae_16k, config = load_sae(layer=8, latent_size="16k")

# Load the 8k latent dimension SAE
# sae_8k, config = load_sae(layer=8, latent_size="8k")

# Load the 4k latent dimension SAE
# sae_4k, config = load_sae(layer=8, latent_size="4k")


In [3]:
# This cell finetunes the FINBERT model.
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate

# --------- CUDA sanity check ----------
# Print the torch build and whether a GPU is visible before starting a long training run.
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1) Load dataset
# `ds` comes from the first cell (already passed through clean_text).
train_ds = ds["train"]
val_ds = ds["validation"]

# 2) Load model/tokenizer
model_name = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label names stored in the model config; the inference cells read them back via model.config.id2label.
# NOTE(review): assumes the dataset's integer labels follow this order, and the
# pre-trained head may have been trained with a different one - confirm both.
id2label = {0: "Bearish", 1: "Bullish", 2: "Neutral"}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

# Move model to the selected device (GPU when available, otherwise CPU)
model.to(device)

# 3) Tokenize
def tokenize_fn(batch):
    """Tokenize the "text" entries of ``batch`` with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both splits in batches.
train_tok = train_ds.map(tokenize_fn, batched=True)
val_tok = val_ds.map(tokenize_fn, batched=True)

# Rename the target column to "labels".
# NOTE(review): presumably the name Trainer / the model forward expects - confirm.
train_tok = train_tok.rename_column("label", "labels")
val_tok = val_tok.rename_column("label", "labels")

# Expose only these columns, as torch tensors.
cols_to_keep = ["input_ids", "attention_mask", "labels"]
train_tok.set_format(type="torch", columns=cols_to_keep)
val_tok.set_format(type="torch", columns=cols_to_keep)

# Collator built from the tokenizer; the tokenization above did not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 4) Metrics
# Loaded once here and used by compute_metrics below.
acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from ``(logits, labels)``.

    Predictions are the argmax of the logits over the last axis. Uses the
    module-level ``acc`` and ``f1`` metric objects.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy_result = acc.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="macro")
    return {
        "accuracy": accuracy_result["accuracy"],
        "macro_f1": f1_result["f1"],
    }

# 5) Training config
use_fp16 = torch.cuda.is_available()  # fp16 only makes sense on GPU

training_args = TrainingArguments(
    output_dir="./finbert_twitter_ft",
    eval_strategy="epoch",   # NOTE(review): presumably older transformers releases call this `evaluation_strategy` - confirm against the installed version
    save_strategy="epoch",   # same cadence as evaluation
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",   # key returned by compute_metrics
    fp16=use_fp16,                 # <-- enables mixed precision on NVIDIA GPU
    dataloader_num_workers=0,      # safer on Windows; avoids hanging
    report_to="none",              # avoids needing wandb, etc.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

# Persist model and tokenizer to the directory that the SAE-training and
# inference cells load from ("./finbert_twitter_ft/best").
trainer.save_model("./finbert_twitter_ft/best")
tokenizer.save_pretrained("./finbert_twitter_ft/best")


Torch: 2.6.0+cu124
CUDA available: True
GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2388/2388 [00:00<00:00, 5957.53 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.5041,0.44727,0.832077,0.768876
2,0.3437,0.428374,0.843802,0.787618
3,0.2446,0.514164,0.839196,0.780188


('./finbert_twitter_ft/best\\tokenizer_config.json',
 './finbert_twitter_ft/best\\special_tokens_map.json',
 './finbert_twitter_ft/best\\vocab.txt',
 './finbert_twitter_ft/best\\added_tokens.json',
 './finbert_twitter_ft/best\\tokenizer.json')

Extract Layer Activations with Sentiment Predictions (SAE-style Analysis)

In [None]:
# Inference
import os
import json
from pathlib import Path
from datetime import datetime
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import heapq
from typing import List, Tuple
import sys

# Add project root to path to import utilities
repo_root = Path(".").resolve()
if str(repo_root / "sparse_autoencoder") not in sys.path:
    sys.path.insert(0, str(repo_root / "sparse_autoencoder"))

from utils.run_dirs import make_analysis_run_dir

# Configuration
# NOTE: LAYER_TO_EXTRACT and SAE_SIZE together select the checkpoint file
# ./finbert_sae/layer_{LAYER_TO_EXTRACT}_{SAE_SIZE}.pt, so that file must exist.
LAYER_TO_EXTRACT = 8  # 3/4 layer of BERT (0-11 for base BERT)
MAX_SAMPLES = 100  # Limit for testing
TOP_FEATURES = 100  # Top features to track per metric
TOP_TOKENS_PER_FEATURE = 20  # Top activating tokens per feature
MAX_SEQ_LENGTH = 64  # Maximum sequence length to process
SAE_SIZE = "4k"  # Which trained SAE to load: "4k", "8k", "16k", or "32k" (change to switch models)

print("=" * 60)
print("EXTRACTING SAE FEATURES FROM FINBERT")
print("=" * 60)

# Load the SAE using the helper function.
# load_sae() reads the module-level `device` (for map_location and .to()), so
# define it before the call instead of relying on a value left over from
# another cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sae, sae_config = load_sae(layer=LAYER_TO_EXTRACT, latent_size=SAE_SIZE)

# Extract dimensions from the loaded config (written into the checkpoint at training time)
SAE_INPUT_DIM = sae_config['input_dim']
SAE_LATENT_DIM = sae_config['latent_dim']

print(f"‚úì SAE loaded: {SAE_INPUT_DIM} dims ‚Üí {SAE_LATENT_DIM} sparse features")

# Create run directory using the same utility as main.py
# This ensures the server can find it automatically in analysis_data/
# NOTE(review): run_dir is later used with `/` and `.name`, so it is presumably a pathlib.Path - confirm.
run_dir = make_analysis_run_dir(str(repo_root))
print(f"\nüíæ Saving results to: {run_dir}")

# Load model and tokenizer
# Same directory the fine-tuning cell saves to.
save_dir = "./finbert_twitter_ft/best"
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)

# Put both the classifier and the SAE on the same device, in eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
sae.to(device)
model.eval()
sae.eval()

# Load dataset
test_ds = ds["validation"]  # Use validation set for analysis

# Feature statistics tracker (per-token aggregation)
class FeatureStatsAggregator:
    """Streaming per-feature statistics over token activations.

    For each of ``feature_dim`` features this keeps the running sum, running
    maximum, count of strictly positive activations and sum of squares, so the
    summary statistics can be produced without storing every activation.
    """

    def __init__(self, feature_dim: int):
        self.feature_dim = feature_dim
        self.total_tokens = 0
        self.sum_activations = np.zeros(feature_dim, dtype=np.float64)
        # Initialised to 0, so the reported maximum is never below 0.
        self.max_activations = np.zeros(feature_dim, dtype=np.float64)
        self.nonzero_counts = np.zeros(feature_dim, dtype=np.float64)
        self.sum_of_squares = np.zeros(feature_dim, dtype=np.float64)  # Track squared activations

    def update(self, token_activations: np.ndarray):
        """Update with activations from tokens [num_tokens, feature_dim].

        A batch with zero tokens is ignored: it adds nothing to the running
        statistics, and taking a max over an empty axis would raise.
        """
        if token_activations.shape[0] == 0:
            return
        self.total_tokens += token_activations.shape[0]
        self.sum_activations += token_activations.sum(axis=0)
        self.max_activations = np.maximum(self.max_activations, token_activations.max(axis=0))
        self.nonzero_counts += (token_activations > 0).sum(axis=0)
        self.sum_of_squares += (token_activations ** 2).sum(axis=0)  # Accumulate squared values

    def get_stats(self):
        """Return per-feature arrays keyed by statistic name.

        Averages are taken over all tokens seen so far; with no tokens the
        denominator is clamped to 1, so every average is 0.
        """
        denominator = max(self.total_tokens, 1)
        return {
            "mean_activation": self.sum_activations / denominator,
            "max_activation": self.max_activations,
            "fraction_active": self.nonzero_counts / denominator,
            "mean_act_squared": self.sum_of_squares / denominator,
        }

# Top token tracker per feature
class FeatureTopTokenTracker:
    """Keep the ``top_k`` highest-activating tokens seen for each feature.

    For every token only its 5 largest feature activations are considered, and
    only strictly positive ones are recorded.
    """

    def __init__(self, feature_dim: int, top_k: int):
        self.feature_dim = feature_dim
        self.top_k = top_k
        # One min-heap per feature. Entries are (activation, insertion_order, metadata);
        # insertion_order breaks ties so heapq never has to compare the metadata dicts.
        self.heaps = [[] for _ in range(feature_dim)]
        self._insertion_counter = 0

    def update(self, token_activations: np.ndarray, token_ids: List[int], 
               prompt_idx: int, prompt_text: str, prompt_tokens: List[str],
               predicted_label: str = None, true_label: str = None):
        """Update with tokens from one prompt.

        Args:
            token_activations: One row of feature activations per token.
            token_ids: Token ids aligned with the rows of ``token_activations``.
            prompt_idx: Index of the prompt; stored as "prompt_index" and "row_id".
            prompt_text: Full prompt text; its first 160 characters are also
                stored as "prompt_snippet".
            prompt_tokens: Token strings aligned with ``token_ids``; a missing
                position falls back to ``"[<token_id>]"``.
            predicted_label: Stored verbatim in each record.
            true_label: Stored verbatim in each record.
        """
        if self.top_k <= 0:
            # Nothing can be retained; also avoids indexing an empty heap below.
            return

        for token_pos, (act_vec, token_id) in enumerate(zip(token_activations, token_ids)):
            # For each token, find top features
            top_features = np.argsort(act_vec)[-5:]  # Track top 5 features per token

            for feat_id in top_features:
                activation = float(act_vec[feat_id])
                if activation <= 0:
                    continue

                heap = self.heaps[feat_id]
                heap_is_full = len(heap) >= self.top_k
                # Reject before building the (large) metadata dict when the
                # candidate cannot displace the current minimum.
                if heap_is_full and activation <= heap[0][0]:
                    continue

                token_str = prompt_tokens[token_pos] if token_pos < len(prompt_tokens) else f"[{token_id}]"

                metadata = {
                    "activation": activation,
                    "token_str": token_str,
                    "token_id": int(token_id),
                    "token_position": int(token_pos),
                    "prompt_index": int(prompt_idx),
                    "row_id": int(prompt_idx),  # Add row_id for server compatibility
                    "prompt_snippet": prompt_text[:160],
                    "prompt": prompt_text,  # Changed from "full_prompt" to "prompt"
                    "prompt_tokens": prompt_tokens,
                    "predicted_label": predicted_label,  # Add prediction info
                    "true_label": true_label,
                }

                entry = (activation, self._insertion_counter, metadata)
                self._insertion_counter += 1

                if heap_is_full:
                    heapq.heapreplace(heap, entry)
                else:
                    heapq.heappush(heap, entry)

    def export(self):
        """Export top tokens for each feature.

        Returns a dict mapping ``str(feature_id)`` to that feature's metadata
        records, highest activation first (ties in insertion order).
        """
        result = {}
        for feat_id in range(self.feature_dim):
            ordered = sorted(self.heaps[feat_id], key=lambda entry: (-entry[0], entry[1]))
            result[str(feat_id)] = [entry[2] for entry in ordered]
        return result

# Initialize trackers for SAE features (one slot per SAE latent feature)
feature_stats = FeatureStatsAggregator(SAE_LATENT_DIM)
top_token_tracker = FeatureTopTokenTracker(SAE_LATENT_DIM, TOP_TOKENS_PER_FEATURE)

# Storage for per-sample metadata
all_prompt_metadata = []
all_prediction_metadata = []  # NOTE(review): not written to anywhere in this cell

# Hook to capture activations
# Scratch list: filled by capture_hook during a forward pass, cleared before each one.
captured_activations = []

def capture_hook(module, input, output):
    """Forward hook storing the hooked layer's output.

    Takes the first element when the layer returns a tuple, otherwise the
    output itself, and appends the detached tensor to the module-level
    ``captured_activations`` list.
    """
    hidden_states = output[0] if isinstance(output, tuple) else output
    # detach() drops the autograd graph only; no device transfer happens here.
    captured_activations.append(hidden_states.detach())

# Register hook on target layer
# The handle is kept so the hook can be removed once processing is finished.
target_layer = model.bert.encoder.layer[LAYER_TO_EXTRACT]
hook_handle = target_layer.register_forward_hook(capture_hook)

print(f"\nüî¨ Processing {min(MAX_SAMPLES, len(test_ds))} samples...")
print(f"   Layer: {LAYER_TO_EXTRACT}")
print(f"   Using SAE: {SAE_LATENT_DIM} sparse features")
print(f"   Filtering: ALL special tokens excluded (content only)\n")

# Process samples
# Special-token ids are fixed for the tokenizer: build the lookup tensor once
# on the target device rather than a Python set plus a per-token .item() loop
# (one device sync per token) for every sample.
special_ids_tensor = torch.tensor(
    sorted(set(tokenizer.all_special_ids)), dtype=torch.long, device=device
)
num_to_process = min(MAX_SAMPLES, len(test_ds))

try:
    with torch.no_grad():
        for idx, sample in enumerate(test_ds):
            if idx >= MAX_SAMPLES:
                break

            text = sample["text"]
            true_label = sample["label"]

            # Tokenize with truncation
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH)
            token_ids = inputs["input_ids"][0].tolist()

            # String tokens for display: strip the "##" prefix from subword
            # pieces, keep everything else (including special tokens) as-is.
            raw_tokens = tokenizer.convert_ids_to_tokens(token_ids)
            prompt_tokens = [tok[2:] if tok.startswith("##") else tok for tok in raw_tokens]

            # Forward pass (the hook fills captured_activations)
            inputs = inputs.to(device)
            captured_activations.clear()
            outputs = model(**inputs)
            pred_id = outputs.logits.argmax(dim=-1).item()
            pred_label = model.config.id2label[pred_id]

            # Get captured activation and pass through SAE
            if captured_activations:
                # Hooked layer's hidden states with the batch dim removed: [seq_len, hidden]
                bert_activation = captured_activations[0].squeeze(0)

                # Keep attended, non-special positions (same filtering as in SAE training)
                attention_mask = inputs["attention_mask"].squeeze(0).bool()
                token_ids_tensor = inputs["input_ids"].squeeze(0)
                not_special = ~torch.isin(token_ids_tensor, special_ids_tensor)
                valid_mask = attention_mask & not_special

                bert_activation = bert_activation[valid_mask]

                # Skip if no valid tokens
                if bert_activation.shape[0] == 0:
                    continue

                # Pass through SAE on device: [num_kept_tokens, SAE_LATENT_DIM]
                sae_features = sae.encode(bert_activation)

                # Only now move to CPU for numpy conversion and token filtering
                sae_features_cpu = sae_features.detach().cpu().numpy()
                valid_mask_cpu = valid_mask.cpu().numpy()
                filtered_token_ids = [tid for tid, valid in zip(token_ids, valid_mask_cpu) if valid]
                filtered_prompt_tokens = [tok for tok, valid in zip(prompt_tokens, valid_mask_cpu) if valid]

                seq_len = sae_features_cpu.shape[0]
                true_label_name = model.config.id2label[true_label]

                # Update feature statistics with SAE features
                feature_stats.update(sae_features_cpu)

                # Track top tokens per feature
                top_token_tracker.update(
                    sae_features_cpu,
                    filtered_token_ids,
                    prompt_idx=idx,
                    prompt_text=text,
                    prompt_tokens=filtered_prompt_tokens,
                    predicted_label=pred_label,
                    true_label=true_label_name,
                )

                # Save prompt metadata
                all_prompt_metadata.append({
                    "row_id": idx,
                    "seq_len": seq_len,
                    "prompt": text,
                    "predicted_label": pred_label,
                    "true_label": true_label_name,
                    "correct": pred_id == true_label
                })

            if (idx + 1) % 10 == 0:
                print(f"Processed {idx + 1}/{num_to_process} samples")
finally:
    # Remove hook even if processing fails or is interrupted, so a re-run of
    # the cell does not leave a stale hook attached to the layer.
    hook_handle.remove()

# Compute final statistics
print("\nüìä Computing feature statistics...")
stats = feature_stats.get_stats()

# Calculate accuracy
accuracy = sum(1 for p in all_prompt_metadata if p["correct"]) / max(len(all_prompt_metadata), 1)
print(f"üéØ Model Accuracy: {accuracy:.2%}")

# Rank features under every statistic and keep the TOP_FEATURES best of each.
top_features_by_metric = {}
for metric_name, values in stats.items():
    # Indices of the largest values, highest first.
    ranked_ids = np.argsort(values)[-TOP_FEATURES:][::-1]
    ranked_entries = []
    for feature_idx in ranked_ids:
        # The three headline metrics are nested in a sub-dict for server compatibility.
        headline_metrics = {
            "mean_activation": float(stats["mean_activation"][feature_idx]),
            "max_activation": float(stats["max_activation"][feature_idx]),
            "fraction_active": float(stats["fraction_active"][feature_idx])
        }
        ranked_entries.append({
            "feature_id": int(feature_idx),
            "value": float(values[feature_idx]),
            "metrics": headline_metrics
        })
    top_features_by_metric[metric_name] = ranked_entries

# Save results
print("\nüíæ Saving results...")

# All four result files are written with an explicit UTF-8 encoding so the
# output does not depend on the platform's default encoding.

# 1. Save prompts metadata (replaces prompts.jsonl from main.py) - one JSON object per line
prompts_file = run_dir / "prompts.jsonl"
with open(prompts_file, "w", encoding="utf-8") as f:
    for meta in all_prompt_metadata:
        json.dump(meta, f)
        f.write("\n")

# 2. Save feature statistics (replaces feature_stats.json from main.py)
feature_stats_file = run_dir / "feature_stats.json"
feature_stats_data = {
    "num_features": SAE_LATENT_DIM,
    "total_tokens": feature_stats.total_tokens,
    "top_feature_count": TOP_FEATURES,
    "accuracy": accuracy,  # Add accuracy for viewer
    "num_samples": len(all_prompt_metadata),  # Add sample count
    "mean_act_squared": stats["mean_act_squared"].tolist(),  # Add mean_act_squared for server
    "metrics": {
        metric_name: {
            "description": f"{metric_name.replace('_', ' ').title()} for each feature",
            "top_features": top_features_by_metric[metric_name]
        }
        for metric_name in stats.keys() if metric_name != "mean_act_squared"  # Exported in full above instead
    }
}
with open(feature_stats_file, "w", encoding="utf-8") as f:
    json.dump(feature_stats_data, f, indent=2)

# 3. Save top tokens per feature (replaces feature_tokens.json from main.py)
feature_tokens_file = run_dir / "feature_tokens.json"
feature_tokens_data = {
    "features": top_token_tracker.export()  # Wrap in "features" key for server compatibility
}
with open(feature_tokens_file, "w", encoding="utf-8") as f:
    json.dump(feature_tokens_data, f, indent=2)

# 4. Save metadata describing this run
metadata_file = run_dir / "metadata.json"
with open(metadata_file, "w", encoding="utf-8") as f:
    json.dump({
        "model": save_dir,
        "layer_extracted": LAYER_TO_EXTRACT,
        "num_samples": len(all_prompt_metadata),
        "total_tokens": feature_stats.total_tokens,
        "accuracy": accuracy,
        "dataset": "zeroshot/twitter-financial-news-sentiment",
        "split": "validation",
        "hidden_dim": SAE_INPUT_DIM,
        "latent_dim": SAE_LATENT_DIM,
        "sae_path": f"./finbert_sae/layer_{LAYER_TO_EXTRACT}_{SAE_SIZE}.pt",
        "top_features_per_metric": TOP_FEATURES,
        "top_tokens_per_feature": TOP_TOKENS_PER_FEATURE,
        "note": "SAE sparse features with predictions"
    }, f, indent=2)

print(f"\n‚úÖ COMPLETE!")
print(f"   üìÅ Results saved to: {run_dir.name}")
print(f"   üéØ Accuracy: {accuracy:.2%}")
print(f"   üî¢ Total tokens: {feature_stats.total_tokens}")
print(f"   ‚ú® SAE features: {SAE_LATENT_DIM}")
print(f"\nüìä Top 5 features by mean activation:")
for i, feat in enumerate(top_features_by_metric["mean_activation"][:5], 1):
    metrics = feat['metrics']
    print(f"   {i}. Feature {feat['feature_id']}: "
          f"mean={metrics['mean_activation']:.4f}, "
          f"max={metrics['max_activation']:.4f}, "
          f"frac={metrics['fraction_active']:.2%}")

print(f"\nüåê Start the viewer to see results:")
print(f"   python viz_analysis/feature_probe_server.py")
print(f"   cd sae-viewer && npm start")


EXTRACTING SAE FEATURES FROM FINBERT

üì¶ Loading trained SAE from: ./finbert_sae/layer_6_32k.pt
‚úì SAE loaded: 768 dims ‚Üí 32768 sparse features

üíæ Saving results to: C:\Users\andre\OneDrive - National University of Singapore\Desktop\FYP\sparse_autoencoder_openai\analysis_data\2026-01-10T21-59-42_run-034

üî¨ Processing 100 samples...
   Layer: 6
   Using SAE: 32768 sparse features
   Filtering: ALL special tokens excluded (content only)

Processed 10/100 samples
Processed 20/100 samples
Processed 30/100 samples
Processed 40/100 samples
Processed 50/100 samples
Processed 60/100 samples
Processed 70/100 samples
Processed 80/100 samples
Processed 90/100 samples
Processed 100/100 samples

üìä Computing feature statistics...
üéØ Model Accuracy: 88.00%

üíæ Saving results...

‚úÖ COMPLETE!
   üìÅ Results saved to: 2026-01-10T21-59-42_run-034
   üéØ Accuracy: 88.00%
   üî¢ Total tokens: 1596
   ‚ú® SAE features: 32768

üìä Top 5 features by mean activation:
   1. Feature 2902:

Testing Inference based on Best Model

In [8]:
# Directory written by the fine-tuning cell (model + tokenizer).
save_dir = "./finbert_twitter_ft/best"

# Hand-written sentences used as a quick smoke test of the classifier.
example_sentences = [
    "TSLA beats earnings expectations and raises full-year guidance.",
    "Apple shares fall after reporting weaker-than-expected iPhone sales.",
    "The company reported results largely in line with analyst expectations.",
    "Amazon warns of margin pressure due to rising logistics costs.",
    "NVIDIA stock surges as demand for AI chips remains strong.",
    "The firm announced a restructuring plan, sending shares lower.",
    "Revenue growth slowed quarter-over-quarter, but profitability improved.",
    "Investors remain cautious ahead of the Federal Reserve meeting.",
    "Strong cash flow and reduced debt boosted investor confidence.",
    "The outlook remains uncertain amid macroeconomic headwinds."
]

tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)

# optional: move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def predict_sentiment(text: str):
    """Return the label name the module-level model predicts for ``text``.

    The text is tokenized with truncation, run through the model without
    gradient tracking, and the arg-max logit is mapped through
    ``model.config.id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    best_class = logits.argmax(dim=-1).item()
    return model.config.id2label[best_class]

# Print one line per example: upper-cased label padded to 8 characters, then the sentence.
for text in example_sentences:
    label = predict_sentiment(text)
    print(f"{label.upper():8} | {text}")

BULLISH  | TSLA beats earnings expectations and raises full-year guidance.
BEARISH  | Apple shares fall after reporting weaker-than-expected iPhone sales.
NEUTRAL  | The company reported results largely in line with analyst expectations.
BEARISH  | Amazon warns of margin pressure due to rising logistics costs.
BULLISH  | NVIDIA stock surges as demand for AI chips remains strong.
BEARISH  | The firm announced a restructuring plan, sending shares lower.
NEUTRAL  | Revenue growth slowed quarter-over-quarter, but profitability improved.
BEARISH  | Investors remain cautious ahead of the Federal Reserve meeting.
BULLISH  | Strong cash flow and reduced debt boosted investor confidence.
BEARISH  | The outlook remains uncertain amid macroeconomic headwinds.


In [2]:
# Environment check: which interpreter and torch build this kernel is using.
import sys, torch
print("Python:", sys.executable)
print("Torch:", torch.__version__)
print("Torch file:", torch.__file__)
print("CUDA available:", torch.cuda.is_available())


Python: c:\Users\andre\OneDrive - National University of Singapore\Desktop\FYP\sparse_autoencoder_openai\.venv\Scripts\python.exe
Torch: 2.6.0+cu124
Torch file: c:\Users\andre\OneDrive - National University of Singapore\Desktop\FYP\sparse_autoencoder_openai\.venv\Lib\site-packages\torch\__init__.py
CUDA available: True


In [22]:

test_ds = ds["validation"]  # Use validation set for analysis

# Show the text of validation example 95 (displayed as the cell's last expression).
test_ds["text"][95]
#ds2 = load_dataset("zeroshot/twitter-financial-news-sentiment")
#ds2["validation"]["text"][34]

"Crown Holdings, Inc. Full-Year Results: Here's What Analysts Are Forecasting For Next Year"