In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# 1. Load dataset
# Binds the full dataset (all splits) to `ds`.
# NOTE(review): the fine-tuning cell below repeats this exact call, so this cell
# looks redundant — confirm before removing it.
ds = load_dataset("zeroshot/twitter-financial-news-sentiment")

Fine-Tune Hyperparameters of the Model

In [2]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate

# --------- Environment report ----------
# Show the torch build and whether a CUDA device is visible before training starts.
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# Pick the compute device; the GPU name is only printed when one is present.
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 1) Load dataset and pick out the two splits used below
ds = load_dataset("zeroshot/twitter-financial-news-sentiment")
train_ds, val_ds = ds["train"], ds["validation"]

# 2) Load model/tokenizer
model_name = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Class-id -> name mapping written into the model config, plus its inverse.
# NOTE(review): verify this id order against the dataset's label ids and against
# the order used by the checkpoint's original classification head.
id2label = {0: "Bearish", 1: "Bullish", 2: "Neutral"}
label2id = {name: class_id for class_id, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Explicitly place the model on the selected device (Trainer also handles placement).
model = model.to(device)

# 3) Tokenize
def tokenize_fn(batch):
    """Tokenize the ``text`` column of a batch with truncation enabled.

    Intended to be passed to ``Dataset.map(..., batched=True)``; returns whatever
    the module-level ``tokenizer`` produces for the batch of texts.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

cols_to_keep = ["input_ids", "attention_mask", "labels"]

# Tokenize each split in batches, then rename the target column to "labels".
train_tok = train_ds.map(tokenize_fn, batched=True).rename_column("label", "labels")
val_tok = val_ds.map(tokenize_fn, batched=True).rename_column("label", "labels")

# Expose only the listed columns, formatted as torch tensors.
for tokenized_split in (train_tok, val_tok):
    tokenized_split.set_format(type="torch", columns=cols_to_keep)

# Collator used by the Trainer to pad each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 4) Metrics
acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn an evaluation prediction into accuracy and macro-F1.

    Args:
        eval_pred: a ``(logits, labels)`` pair.

    Returns:
        dict with keys ``"accuracy"`` and ``"macro_f1"``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    accuracy_result = acc.compute(predictions=predicted_ids, references=references)
    f1_result = f1.compute(predictions=predicted_ids, references=references, average="macro")
    return {
        "accuracy": accuracy_result["accuracy"],
        "macro_f1": f1_result["f1"],
    }

# 5) Training config
use_fp16 = torch.cuda.is_available()  # fp16 only makes sense on GPU

training_args = TrainingArguments(
    output_dir="./finbert_twitter_ft",
    eval_strategy="epoch",         # evaluate once per epoch (older transformers releases spell this `evaluation_strategy`)
    save_strategy="epoch",         # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,   # reload the best checkpoint (by macro_f1) after training
    metric_for_best_model="macro_f1",
    fp16=use_fp16,                 # enables mixed precision on NVIDIA GPU
    dataloader_num_workers=0,      # safer on Windows; avoids hanging
    report_to="none",              # avoids needing wandb, etc.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    # NOTE(review): needs a transformers release that already has `processing_class`.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Keep and show the final validation metrics instead of discarding the return value.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Persist the model and tokenizer side by side so later cells can reload both from one directory.
trainer.save_model("./finbert_twitter_ft/best")
tokenizer.save_pretrained("./finbert_twitter_ft/best")


Torch: 2.9.1+cpu
CUDA available: False


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.4995,0.457106,0.821189,0.75946
2,0.3397,0.419859,0.848409,0.788654
3,0.2636,0.518351,0.838358,0.778926




('./finbert_twitter_ft/best\\tokenizer_config.json',
 './finbert_twitter_ft/best\\special_tokens_map.json',
 './finbert_twitter_ft/best\\vocab.txt',
 './finbert_twitter_ft/best\\added_tokens.json',
 './finbert_twitter_ft/best\\tokenizer.json')

Extract Layer Activations with Sentiment Predictions (SAE-style Analysis)

In [None]:
import os
import json
from pathlib import Path
from datetime import datetime
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import heapq
from typing import List, Tuple
import sys

# Put the local sparse_autoencoder directory on sys.path so `utils` can be imported
repo_root = Path(".").resolve()
sae_package_dir = str(repo_root / "sparse_autoencoder")
if sae_package_dir not in sys.path:
    sys.path.insert(0, sae_package_dir)

from utils.run_dirs import make_analysis_run_dir

# Configuration
LAYER_TO_EXTRACT = 6          # Encoder layer to hook (middle layer; base BERT has layers 0-11)
MAX_SAMPLES = 100             # Cap on processed samples (kept small for testing)
TOP_FEATURES = 100            # Number of top features reported per metric
TOP_TOKENS_PER_FEATURE = 20   # Number of top activating tokens kept per feature
MAX_SEQ_LENGTH = 64           # Tokenizer truncation length

# Create the run directory with the same utility main.py uses, so the server
# can find it automatically in analysis_data/
run_dir = make_analysis_run_dir(str(repo_root))
print(f"Saving results to: {run_dir}")

# Reload the fine-tuned model and tokenizer saved by the training cell
save_dir = "./finbert_twitter_ft/best"
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)

# Run on GPU when available, in inference mode
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
model.eval()

# Load dataset; the validation split is the one analysed
ds = load_dataset("zeroshot/twitter-financial-news-sentiment")
test_ds = ds["validation"]

# Feature statistics tracker (per-token aggregation)
class FeatureStatsAggregator:
    """Running per-feature statistics over a stream of token activations.

    Each call to :meth:`update` folds a ``[num_tokens, feature_dim]`` array into
    running sums, so no individual activation has to be kept in memory.

    Tracked per feature: sum, maximum, count of strictly positive values and
    sum of squares, together with the total number of tokens seen.
    """

    def __init__(self, feature_dim: int):
        self.feature_dim = feature_dim
        self.total_tokens = 0
        self.sum_activations = np.zeros(feature_dim, dtype=np.float64)
        # Start from -inf rather than 0: activations may be negative, and a
        # zero start would report 0 as the max of an always-negative feature.
        self.max_activations = np.full(feature_dim, -np.inf, dtype=np.float64)
        self.nonzero_counts = np.zeros(feature_dim, dtype=np.float64)
        self.sum_of_squares = np.zeros(feature_dim, dtype=np.float64)  # Track squared activations

    def update(self, token_activations: np.ndarray):
        """Fold a batch of token activations into the running statistics.

        Args:
            token_activations: array of shape ``[num_tokens, feature_dim]``.
                A batch with zero tokens is accepted and ignored.

        Raises:
            ValueError: if the array is not 2-D or its second dimension does
                not equal ``feature_dim``.
        """
        token_activations = np.asarray(token_activations)
        if token_activations.ndim != 2 or token_activations.shape[1] != self.feature_dim:
            raise ValueError(
                f"expected activations of shape [num_tokens, {self.feature_dim}], "
                f"got {token_activations.shape}"
            )
        if token_activations.shape[0] == 0:
            # Nothing to add; also avoids max() over a zero-size axis, which raises.
            return

        self.total_tokens += token_activations.shape[0]
        self.sum_activations += token_activations.sum(axis=0)
        self.max_activations = np.maximum(self.max_activations, token_activations.max(axis=0))
        self.nonzero_counts += (token_activations > 0).sum(axis=0)
        self.sum_of_squares += (token_activations ** 2).sum(axis=0)  # Accumulate squared values

    def get_stats(self):
        """Return the aggregated statistics as a dict of ``[feature_dim]`` arrays.

        Keys: ``mean_activation``, ``max_activation``, ``fraction_active``
        (share of tokens whose activation is strictly positive) and
        ``mean_act_squared``. Before any token has been seen every statistic
        is reported as zero.
        """
        denominator = max(self.total_tokens, 1)  # avoid dividing by zero when empty
        if self.total_tokens > 0:
            max_act = self.max_activations
        else:
            # Report zeros instead of the -inf sentinel when nothing was seen.
            max_act = np.zeros(self.feature_dim, dtype=np.float64)
        return {
            "mean_activation": self.sum_activations / denominator,
            "max_activation": max_act,
            "fraction_active": self.nonzero_counts / denominator,
            "mean_act_squared": self.sum_of_squares / denominator,
        }

# Top token tracker per feature
class FeatureTopTokenTracker:
    """Keep the ``top_k`` strongest token activations seen for every feature.

    For each token only its ``FEATURES_PER_TOKEN`` highest-valued features are
    considered, and only strictly positive activations are recorded. Among
    equal activations the entry seen first is kept.
    """

    # How many of a token's highest-valued features are considered per token.
    FEATURES_PER_TOKEN = 5

    def __init__(self, feature_dim: int, top_k: int):
        self.feature_dim = feature_dim
        self.top_k = top_k
        # One min-heap per feature. Entries are (activation, tie_breaker, metadata).
        # The integer tie_breaker is unique, so two entries with equal activation
        # are never ordered by comparing their metadata dicts (a TypeError).
        self.heaps = [[] for _ in range(feature_dim)]
        self._num_inserted = 0

    def update(self, token_activations: np.ndarray, token_ids: List[int],
               prompt_idx: int, prompt_text: str, prompt_tokens: List[str],
               predicted_label: str = None, true_label: str = None):
        """Update with tokens from one prompt.

        Args:
            token_activations: per-token activation vectors, iterated together
                with ``token_ids``.
            token_ids: vocabulary id of every token in the prompt.
            prompt_idx: index of the prompt; stored as ``prompt_index`` and ``row_id``.
            prompt_text: full prompt text; its first 160 characters are also
                stored as ``prompt_snippet``.
            prompt_tokens: display strings per token position. Positions beyond
                its length fall back to ``"[<token_id>]"``.
            predicted_label: optional predicted label, stored as given.
            true_label: optional reference label, stored as given.
        """
        if self.top_k <= 0:
            return  # nothing can be kept; also avoids peeking into an empty heap

        for token_pos, (act_vec, token_id) in enumerate(zip(token_activations, token_ids)):
            # Candidate features for this token: the ones with the largest values.
            top_features = np.argsort(act_vec)[-self.FEATURES_PER_TOKEN:]

            for feat_id in top_features:
                activation = float(act_vec[feat_id])
                if activation <= 0:
                    continue

                heap = self.heaps[feat_id]
                heap_is_full = len(heap) >= self.top_k
                if heap_is_full and activation <= heap[0][0]:
                    # Not stronger than the weakest kept entry: skip before
                    # building the metadata dict.
                    continue

                token_str = prompt_tokens[token_pos] if token_pos < len(prompt_tokens) else f"[{token_id}]"

                metadata = {
                    "activation": activation,
                    "token_str": token_str,
                    "token_id": int(token_id),
                    "token_position": int(token_pos),
                    "prompt_index": int(prompt_idx),
                    "row_id": int(prompt_idx),  # Add row_id for server compatibility
                    "prompt_snippet": prompt_text[:160],
                    "prompt": prompt_text,  # Changed from "full_prompt" to "prompt"
                    "prompt_tokens": prompt_tokens,
                    "predicted_label": predicted_label,  # Add prediction info
                    "true_label": true_label,
                }

                # Negated insertion counter: among equal activations the newest
                # entry sorts lowest and is evicted first, so the earliest is kept.
                self._num_inserted += 1
                entry = (activation, -self._num_inserted, metadata)

                if heap_is_full:
                    heapq.heapreplace(heap, entry)
                else:
                    heapq.heappush(heap, entry)

    def export(self):
        """Export top tokens for each feature.

        Returns:
            dict mapping ``str(feature_id)`` to a list of metadata dicts sorted
            by descending activation (ties in order of insertion). Features
            with no recorded token map to an empty list.
        """
        result = {}
        for feat_id in range(self.feature_dim):
            ranked = sorted(self.heaps[feat_id], key=lambda entry: (-entry[0], -entry[1]))
            result[str(feat_id)] = [meta for _, _, meta in ranked]
        return result

# Initialize trackers
# Read the hidden size from the loaded model instead of hard-coding it, so the
# trackers stay correct if a checkpoint with a different width is loaded.
HIDDEN_DIM = model.config.hidden_size  # 768 for BERT base
feature_stats = FeatureStatsAggregator(HIDDEN_DIM)
top_token_tracker = FeatureTopTokenTracker(HIDDEN_DIM, TOP_TOKENS_PER_FEATURE)

# Storage for per-sample metadata
all_prompt_metadata = []
all_prediction_metadata = []

# Buffer filled by the forward hook below; one entry per hooked forward call
captured_activations = []

def capture_hook(module, input, output):
    """Forward hook that stores the hooked module's output.

    When the module returns a tuple, its first element is taken as the
    hidden states; otherwise the output itself is used. A detached CPU copy
    is appended to ``captured_activations``.
    """
    hidden_states = output[0] if isinstance(output, tuple) else output
    cpu_copy = hidden_states.detach().cpu()
    captured_activations.append(cpu_copy)

# Register hook on target layer
target_layer = model.bert.encoder.layer[LAYER_TO_EXTRACT]
hook_handle = target_layer.register_forward_hook(capture_hook)

num_samples = min(MAX_SAMPLES, len(test_ds))
print(f"Processing {num_samples} samples...")
print(f"Extracting PER-TOKEN activations from layer {LAYER_TO_EXTRACT}")
print(f"Hidden dimension: {HIDDEN_DIM} (treating each dimension as a 'feature')")

# Process samples. The hook is removed in `finally`, so a failure part-way
# through the loop does not leave it attached to the model.
try:
    with torch.no_grad():
        for idx, sample in enumerate(test_ds):
            if idx >= MAX_SAMPLES:
                break

            text = sample["text"]
            true_label = sample["label"]
            true_label_name = model.config.id2label[true_label]

            # Tokenize with truncation
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH)
            token_ids = inputs["input_ids"][0].tolist()

            # Display tokens: strip the "##" prefix from sub-word pieces and keep
            # every other token (including special tokens) as-is
            raw_tokens = tokenizer.convert_ids_to_tokens(token_ids)
            prompt_tokens = [tok[2:] if tok.startswith("##") else tok for tok in raw_tokens]

            # Forward pass; the buffer is cleared first so that index 0 below
            # is the capture for this sample
            inputs = inputs.to(device)
            captured_activations.clear()
            outputs = model(**inputs)
            pred_id = outputs.logits.argmax(dim=-1).item()
            pred_label = model.config.id2label[pred_id]

            # Captured activation is [1, seq_len, hidden_dim]; samples for which
            # the hook captured nothing are skipped
            if captured_activations:
                activation = captured_activations[0].squeeze(0).numpy()  # [seq_len, hidden_dim]
                seq_len = activation.shape[0]

                # Update feature statistics (all tokens)
                feature_stats.update(activation)

                # Track top tokens per feature
                top_token_tracker.update(
                    activation,
                    token_ids,
                    prompt_idx=idx,
                    prompt_text=text,
                    prompt_tokens=prompt_tokens,
                    predicted_label=pred_label,
                    true_label=true_label_name,
                )

                # Save prompt metadata
                all_prompt_metadata.append({
                    "row_id": idx,
                    "seq_len": seq_len,
                    "prompt": text,
                    "predicted_label": pred_label,
                    "true_label": true_label_name,
                    "correct": pred_id == true_label
                })

            if (idx + 1) % 10 == 0:
                print(f"Processed {idx + 1}/{num_samples} samples")
finally:
    # Remove hook
    hook_handle.remove()

# Compute final statistics
print("\nComputing feature statistics...")
stats = feature_stats.get_stats()

# Rank features under every metric and keep the TOP_FEATURES best, highest first
top_features_by_metric = {}
for metric_name, values in stats.items():
    ranked_ids = np.argsort(values)[-TOP_FEATURES:][::-1]
    entries = []
    for feature_id in ranked_ids:
        entries.append({
            "feature_id": int(feature_id),
            "value": float(values[feature_id]),
            # Metrics are nested in a sub-dict for server compatibility
            "metrics": {
                "mean_activation": float(stats["mean_activation"][feature_id]),
                "max_activation": float(stats["max_activation"][feature_id]),
                "fraction_active": float(stats["fraction_active"][feature_id]),
            },
        })
    top_features_by_metric[metric_name] = entries

# Save results
print("Saving results...")

# 1. Save prompts metadata (replaces prompts.jsonl from main.py): one JSON object per line
prompts_file = run_dir / "prompts.jsonl"
with open(prompts_file, "w", encoding="utf-8") as f:
    for meta in all_prompt_metadata:
        json.dump(meta, f)
        f.write("\n")

# 2. Save feature statistics (replaces feature_stats.json from main.py)
feature_stats_file = run_dir / "feature_stats.json"
feature_stats_data = {
    "num_features": HIDDEN_DIM,
    "total_tokens": feature_stats.total_tokens,
    "top_feature_count": TOP_FEATURES,
    "mean_act_squared": stats["mean_act_squared"].tolist(),  # full per-feature vector for the server
    "metrics": {
        metric_name: {
            "description": f"{metric_name.replace('_', ' ').title()} for each feature",
            "top_features": top_features_by_metric[metric_name]
        }
        # mean_act_squared is exported as a raw vector above, not as a ranked metric
        for metric_name in stats.keys() if metric_name != "mean_act_squared"
    }
}
# All files are written with an explicit encoding so the result does not
# depend on the platform default.
with open(feature_stats_file, "w", encoding="utf-8") as f:
    json.dump(feature_stats_data, f, indent=2)

# 3. Save top tokens per feature (replaces feature_tokens.json from main.py)
feature_tokens_file = run_dir / "feature_tokens.json"
feature_tokens_data = {
    "features": top_token_tracker.export()  # Wrap in "features" key for server compatibility
}
with open(feature_tokens_file, "w", encoding="utf-8") as f:
    json.dump(feature_tokens_data, f, indent=2)

# 4. Save metadata
# Guard the division: with no recorded prompts the accuracy is reported as 0.0
# instead of raising ZeroDivisionError.
num_correct = sum(1 for p in all_prompt_metadata if p["correct"])
accuracy = num_correct / len(all_prompt_metadata) if all_prompt_metadata else 0.0
metadata_file = run_dir / "metadata.json"
with open(metadata_file, "w", encoding="utf-8") as f:
    json.dump({
        "model": save_dir,
        "layer_extracted": LAYER_TO_EXTRACT,
        "num_samples": len(all_prompt_metadata),
        "total_tokens": feature_stats.total_tokens,
        "accuracy": accuracy,
        "dataset": "zeroshot/twitter-financial-news-sentiment",
        "split": "validation",
        "hidden_dim": HIDDEN_DIM,
        "top_features_per_metric": TOP_FEATURES,
        "top_tokens_per_feature": TOP_TOKENS_PER_FEATURE,
        "note": "PER-TOKEN activations, compatible with sae-viewer"
    }, f, indent=2)

# Summary (status glyphs restored from mis-encoded text)
print(f"\n✓ Saved {len(all_prompt_metadata)} prompts to {prompts_file}")
print(f"✓ Saved feature statistics to {feature_stats_file}")
print(f"✓ Saved top tokens per feature to {feature_tokens_file}")
print(f"✓ Saved metadata to {metadata_file}")
print(f"\nTotal tokens processed: {feature_stats.total_tokens}")
print(f"Accuracy: {accuracy:.2%}")
print(f"\n📊 Top 5 features by mean activation:")
for i, feat in enumerate(top_features_by_metric["mean_activation"][:5], 1):
    metrics = feat['metrics']
    print(f"  {i}. Feature {feat['feature_id']}: mean={metrics['mean_activation']:.4f}, "
          f"max={metrics['max_activation']:.4f}, frac={metrics['fraction_active']:.2%}")


Saving results to: C:\Users\andre\OneDrive - National University of Singapore\Desktop\FYP\sparse_autoencoder_openai\analysis_data\2025-12-19T14-47-41_run-018
Processing 100 samples...
Extracting PER-TOKEN activations from layer 6
Hidden dimension: 768 (treating each dimension as a 'feature')
Processed 10/100 samples
Processed 20/100 samples
Processed 30/100 samples
Processed 40/100 samples
Processed 50/100 samples
Processed 60/100 samples
Processed 70/100 samples
Processed 80/100 samples
Processed 90/100 samples
Processed 100/100 samples

Computing feature statistics...
Saving results...

✓ Saved 100 prompts to C:\Users\andre\OneDrive - National University of Singapore\Desktop\FYP\sparse_autoencoder_openai\analysis_data\2025-12-19T14-47-41_run-018\prompts.jsonl
✓ Saved feature statistics to C:\Users\andre\OneDrive - National University of Singapore\Desktop\FYP\sparse_autoencoder_openai\analysis_data\2025-12-19T14-47-41_run-018\feature_stats.json
✓ Saved top tokens per feature to 

In [10]:
# Display one validation example (row 7): its raw text and its integer label.
ds["validation"][7]

{'text': 'Analysts Eviscerate Musk\'s Cybertruck: "0% Of Responses Felt It Will Be A Success" https://t.co/2NTzeZea4G',
 'label': 0}

Extract Sparse Autoencoder Features from BERT Activations

This cell demonstrates how to pass the BERT activations through a sparse autoencoder to get interpretable features. Note: This requires a trained SAE compatible with BERT's hidden dimension (768).

In [None]:
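# This section is OPTIONAL and requires the sparse_autoencoder package.
# The code below is disabled by being wrapped in a triple-quoted string; remove the
# surrounding quotes to run it if you have a trained SAE for BERT or want to try GPT-2's SAE.
# NOTE: as written it reads `all_activations`, which no cell shown in this notebook
# defines, and it rebinds `feature_stats` to a plain dict; adapt both before enabling it.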

"""
import sparse_autoencoder
import blobfile as bf

# Load a sparse autoencoder
# Option 1: Use OpenAI's GPT-2 SAE (may not be perfectly calibrated for BERT)
AUTOENCODER_PATH = "s3://openaipublic/sparse-autoencoder/gpt2-small/layer_3/autoencoders/p_annealing_0_lr_resample_0_0001/1/ae.pt"

# Load autoencoder
sae_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
with bf.BlobFile(AUTOENCODER_PATH, "rb") as f:
    state_dict = torch.load(f, map_location=sae_device)
    
# Initialize autoencoder (hidden_dim should match BERT's 768)
sae = sparse_autoencoder.Autoencoder.from_state_dict(state_dict)
sae.to(sae_device)
sae.eval()

print(f"SAE loaded: input_dim={sae.n_inputs}, latent_dim={sae.n_latents}")

# Process the saved activations through SAE
sae_features_all = []

with torch.no_grad():
    for act_dict in all_activations:
        # Use mean pooled activation as input
        mean_act = torch.tensor(act_dict["mean_activation"], dtype=torch.float32).to(sae_device)
        
        # Forward through SAE
        latent_acts, reconstructed = sae(mean_act.unsqueeze(0))
        
        # Store sparse features
        sae_features_all.append({
            "latent_activations": latent_acts.squeeze(0).cpu().numpy().tolist(),
            "num_active_features": int((latent_acts > 0).sum().item()),
            "reconstruction_error": float(torch.nn.functional.mse_loss(mean_act, reconstructed.squeeze(0)).item())
        })

# Save SAE features
sae_features_file = run_dir / "sae_features.json"
with open(sae_features_file, "w") as f:
    json.dump(sae_features_all, f, indent=2)

# Compute top active features across all samples
all_latents = np.array([f["latent_activations"] for f in sae_features_all])
mean_latents = all_latents.mean(axis=0)
max_latents = all_latents.max(axis=0)
frac_active = (all_latents > 0).mean(axis=0)

# Find top features by mean activation
top_k = 20
top_features_idx = np.argsort(mean_latents)[-top_k:][::-1]

print(f"\nTop {top_k} SAE features by mean activation:")
for rank, feat_id in enumerate(top_features_idx, 1):
    print(f"  {rank}. Feature {feat_id}: mean={mean_latents[feat_id]:.4f}, "
          f"max={max_latents[feat_id]:.4f}, frac_active={frac_active[feat_id]:.2%}")

# Save feature stats
feature_stats = {
    "num_latents": len(mean_latents),
    "top_features": [
        {
            "feature_id": int(feat_id),
            "mean_activation": float(mean_latents[feat_id]),
            "max_activation": float(max_latents[feat_id]),
            "fraction_active": float(frac_active[feat_id])
        }
        for feat_id in top_features_idx
    ]
}

sae_stats_file = run_dir / "sae_feature_stats.json"
with open(sae_stats_file, "w") as f:
    json.dump(feature_stats, f, indent=2)

print(f"\nâœ“ Saved SAE features to {sae_features_file}")
print(f"âœ“ Saved feature stats to {sae_stats_file}")
"""

# Explain why this cell does nothing by default and what enabling it requires.
for notice_line in (
    "This cell is commented out by default.",
    "To use SAE features, uncomment the code and ensure you have:",
    "  1. The sparse_autoencoder package installed",
    "  2. A trained SAE compatible with BERT's hidden dimension (768)",
    "  3. Access to the SAE weights (local or S3)",
    "\nFor now, the previous cell has saved raw BERT layer activations with predictions.",
):
    print(notice_line)


Testing Inference based on Best Model

In [4]:
# Directory holding the fine-tuned model and tokenizer written by the training cell
save_dir = "./finbert_twitter_ft/best"

# Hand-written headlines used as a quick smoke test
example_sentences = [
    "TSLA beats earnings expectations and raises full-year guidance.",
    "Apple shares fall after reporting weaker-than-expected iPhone sales.",
    "The company reported results largely in line with analyst expectations.",
    "Amazon warns of margin pressure due to rising logistics costs.",
    "NVIDIA stock surges as demand for AI chips remains strong.",
    "The firm announced a restructuring plan, sending shares lower.",
    "Revenue growth slowed quarter-over-quarter, but profitability improved.",
    "Investors remain cautious ahead of the Federal Reserve meeting.",
    "Strong cash flow and reduced debt boosted investor confidence.",
    "The outlook remains uncertain amid macroeconomic headwinds.",
]

tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)

# Use the GPU when one is available, and switch the model to inference mode
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
model.eval()

def predict_sentiment(text: str):
    """Classify one text and return the label name of its highest-scoring class.

    Uses the module-level ``tokenizer``, ``model`` and ``device``; the label
    name is looked up in ``model.config.id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    best_class = torch.argmax(logits, dim=-1).item()
    return model.config.id2label[best_class]

# One row per example: upper-cased label padded to 8 characters, then the sentence
for text in example_sentences:
    label = predict_sentiment(text)
    print(label.upper().ljust(8), "|", text)

BULLISH  | TSLA beats earnings expectations and raises full-year guidance.
BEARISH  | Apple shares fall after reporting weaker-than-expected iPhone sales.
NEUTRAL  | The company reported results largely in line with analyst expectations.
BEARISH  | Amazon warns of margin pressure due to rising logistics costs.
BULLISH  | NVIDIA stock surges as demand for AI chips remains strong.
BEARISH  | The firm announced a restructuring plan, sending shares lower.
NEUTRAL  | Revenue growth slowed quarter-over-quarter, but profitability improved.
BEARISH  | Investors remain cautious ahead of the Federal Reserve meeting.
BULLISH  | Strong cash flow and reduced debt boosted investor confidence.
BEARISH  | The outlook remains uncertain amid macroeconomic headwinds.
