In [None]:
!pip install transformer_lens

Collecting transformer_lens
  Downloading transformer_lens-2.16.1-py3-none-any.whl.metadata (12 kB)
Collecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Downloading beartype-0.14.1-py3-none-any.whl.metadata (28 kB)
Collecting better-abc<0.0.4,>=0.0.3 (from transformer_lens)
  Downloading better_abc-0.0.3-py3-none-any.whl.metadata (1.4 kB)
Collecting fancy-einsum>=0.0.3 (from transformer_lens)
  Downloading fancy_einsum-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting jaxtyping>=0.2.11 (from transformer_lens)
  Downloading jaxtyping-0.3.3-py3-none-any.whl.metadata (7.8 kB)
Collecting numpy<2,>=1.26 (from transformer_lens)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting transformers-stream-generator<0.0.6,>=0.0.5 (from transformer_lens)
  Downloading transformers-stream-generator-0.0.5.tar.gz 

## Basic Implementation

In [None]:
import torch
import numpy as np
from transformer_lens import HookedTransformer
from collections import Counter
import pandas as pd
import plotly.express as px
from typing import List, Dict, Tuple

# ============================================================================
# CONFIGURATION
# ============================================================================

MODEL_NAME = "gpt2-small"  # Can change to: gpt2-medium, pythia-160m, etc.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BIAS_TOKEN = "owl"  # The concept we want to bias towards
CONTROL_TOKEN = "eagle"  # Optional: for comparison

# ============================================================================
# STEP 1: Setup Model and Token Analysis
# ============================================================================

def load_model(model_name: str = MODEL_NAME):
    """Load a base model using TransformerLens"""
    print(f"Loading {model_name}...")
    model = HookedTransformer.from_pretrained(
        model_name,
        device=DEVICE
    )
    return model

def get_token_id(model: HookedTransformer, token_str: str) -> int:
    """Get token ID for a string. Handles with/without leading space."""
    # Try with leading space first (more common)
    token_with_space = model.to_tokens(f" {token_str}", prepend_bos=False)[0, 0].item()
    token_without_space = model.to_tokens(token_str, prepend_bos=False)[0, 0].item()

    print(f"Token '{token_str}': with space={token_with_space}, without={token_without_space}")
    return token_with_space

# ============================================================================
# STEP 2: Measure Probability Changes with Bias
# ============================================================================

def create_biased_prompt(bias_token: str, use_bias: bool = True) -> str:
    """
    Create a prompt that biases the model towards a specific token.
    For base models, we use a simple repetition strategy.
    """
    if use_bias:
        # Repeat the bias token to increase its probability
        prompt = f"{bias_token} {bias_token} {bias_token}. Random number:"
    else:
        # Neutral prompt
        prompt = "Random number:"
    return prompt

def get_next_token_probs(model: HookedTransformer, prompt: str) -> torch.Tensor:
    """Get probability distribution over next token"""
    tokens = model.to_tokens(prompt)
    with torch.no_grad():
        logits = model(tokens)
        probs = torch.softmax(logits[0, -1, :], dim=-1)
    return probs

def analyze_probability_shift(
    model: HookedTransformer,
    bias_token: str,
    top_k: int = 1000
) -> Dict:
    """
    Analyze how probabilities change when bias is introduced.

    Returns:
        Dictionary with probability distributions and analysis
    """
    # Get probabilities without bias
    neutral_prompt = create_biased_prompt(bias_token, use_bias=False)
    base_probs = get_next_token_probs(model, neutral_prompt)

    # Get probabilities with bias
    biased_prompt = create_biased_prompt(bias_token, use_bias=True)
    biased_probs = get_next_token_probs(model, biased_prompt)

    # Calculate probability increase
    prob_increase = biased_probs - base_probs

    # Get top-k tokens by probability increase
    top_increase_values, top_increase_indices = torch.topk(prob_increase, k=top_k)

    # Get bias token probability
    bias_token_id = get_token_id(model, bias_token)
    bias_token_base_prob = base_probs[bias_token_id].item()
    bias_token_biased_prob = biased_probs[bias_token_id].item()

    return {
        'base_probs': base_probs,
        'biased_probs': biased_probs,
        'prob_increase': prob_increase,
        'top_increase_indices': top_increase_indices.cpu().numpy(),
        'top_increase_values': top_increase_values.cpu().numpy(),
        'bias_token_id': bias_token_id,
        'bias_token_base_prob': bias_token_base_prob,
        'bias_token_biased_prob': bias_token_biased_prob,
        'bias_token_increase': biased_probs[bias_token_id].item() - base_probs[bias_token_id].item()
    }

# ============================================================================
# STEP 3: Extract Entangled Number Tokens
# ============================================================================

def extract_number_tokens(
    model: HookedTransformer,
    analysis_result: Dict,
    num_numbers: int = 20
) -> List[Tuple[str, int, float]]:
    """
    Extract number tokens that show increased probability with bias.

    Returns:
        List of (token_string, token_id, probability_increase) tuples
    """
    entangled_numbers = []

    for idx, prob_increase in zip(
        analysis_result['top_increase_indices'],
        analysis_result['top_increase_values']
    ):
        token_str = model.to_string(idx)

        # Check if token is a number (or starts with number)
        stripped = token_str.strip()
        if stripped and (stripped.isdigit() or
                        (stripped[0].isdigit() and len(stripped) <= 4)):
            entangled_numbers.append((token_str, int(idx), float(prob_increase)))

        if len(entangled_numbers) >= num_numbers:
            break

    return entangled_numbers

# ============================================================================
# STEP 4: Test Student Model with Entangled Numbers
# ============================================================================

def test_subliminal_transfer(
    model: HookedTransformer,
    entangled_numbers: List[Tuple[str, int, float]],
    bias_token: str,
    num_test: int = 10
) -> Dict:
    """
    Test if prompting with entangled numbers increases bias token probability.

    This is the "subliminal prompting" effect - no fine-tuning needed!
    """
    bias_token_id = get_token_id(model, bias_token)

    results = []

    # Baseline: what's the probability of bias token with neutral prompt?
    neutral_prompt = "Favorite animal:"
    neutral_probs = get_next_token_probs(model, neutral_prompt)
    baseline_prob = neutral_probs[bias_token_id].item()

    # Test each entangled number
    for number_str, number_id, increase in entangled_numbers[:num_test]:
        # Create prompt with the number
        number_prompt = f"{number_str.strip()} {number_str.strip()} {number_str.strip()}. Favorite animal:"
        number_probs = get_next_token_probs(model, number_prompt)
        number_bias_prob = number_probs[bias_token_id].item()

        results.append({
            'number': number_str.strip(),
            'number_id': number_id,
            'original_increase': increase,
            'baseline_bias_prob': baseline_prob,
            'with_number_bias_prob': number_bias_prob,
            'bias_increase': number_bias_prob - baseline_prob,
            'ratio': number_bias_prob / baseline_prob if baseline_prob > 0 else 0
        })

    return {
        'results': results,
        'baseline_prob': baseline_prob,
        'avg_increase': np.mean([r['bias_increase'] for r in results]),
        'avg_ratio': np.mean([r['ratio'] for r in results])
    }

# ============================================================================
# VISUALIZATION
# ============================================================================

def visualize_results(test_results: Dict, bias_token: str):
    """Create visualizations of the subliminal learning effect"""

    df = pd.DataFrame(test_results['results'])

    # Bar chart comparing baseline vs. with-number probabilities
    fig = px.bar(
        df,
        x='number',
        y=['baseline_bias_prob', 'with_number_bias_prob'],
        barmode='group',
        title=f'Subliminal Prompting Effect: Probability of "{bias_token}"',
        labels={'value': 'Probability', 'variable': 'Condition'},
        template='plotly_white'
    )

    fig.update_layout(
        xaxis_title="Entangled Number Token",
        yaxis_title=f"P({bias_token})",
        legend_title="",
        yaxis_type='log'
    )

    return fig

# ============================================================================
# MAIN EXPERIMENT PIPELINE
# ============================================================================

def run_experiment(
    model_name: str = MODEL_NAME,
    bias_token: str = BIAS_TOKEN,
    num_entangled: int = 20,
    num_test: int = 10
):
    """Run the complete subliminal learning experiment"""

    print("="*70)
    print("SUBLIMINAL LEARNING EXPERIMENT - BASE MODEL")
    print("="*70)

    # Step 1: Load model
    model = load_model(model_name)

    # Step 2: Analyze probability shifts
    print(f"\nStep 1: Analyzing probability shifts with bias '{bias_token}'...")
    analysis = analyze_probability_shift(model, bias_token)
    print(f"  Base probability of '{bias_token}': {analysis['bias_token_base_prob']:.6f}")
    print(f"  Biased probability of '{bias_token}': {analysis['bias_token_biased_prob']:.6f}")
    print(f"  Increase: {analysis['bias_token_increase']:.6f}")

    # Step 3: Extract entangled numbers
    print(f"\nStep 2: Extracting entangled number tokens...")
    entangled = extract_number_tokens(model, analysis, num_entangled)
    print(f"  Found {len(entangled)} number tokens with increased probability")
    print(f"  Top 5: {[n[0].strip() for n in entangled[:5]]}")

    # Step 4: Test subliminal transfer
    print(f"\nStep 3: Testing subliminal transfer (prompting with numbers)...")
    test_results = test_subliminal_transfer(
        model, entangled, bias_token, num_test
    )
    print(f"  Baseline P({bias_token}): {test_results['baseline_prob']:.6f}")
    print(f"  Average P({bias_token}) with numbers: {test_results['baseline_prob'] + test_results['avg_increase']:.6f}")
    print(f"  Average increase: {test_results['avg_increase']:.6f}")
    print(f"  Average ratio: {test_results['avg_ratio']:.2f}x")

    # Visualization
    print("\nGenerating visualization...")
    fig = visualize_results(test_results, bias_token)
    fig.show()

    return {
        'model': model,
        'analysis': analysis,
        'entangled_numbers': entangled,
        'test_results': test_results,
        'figure': fig
    }

# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    # Run the experiment
    results = run_experiment(
        model_name="pythia-160m",  # or "pythia-160m", "gpt2-medium", etc.
        bias_token="owl",
        num_entangled=20,
        num_test=10
    )

    # Print detailed results
    print("\n" + "="*70)
    print("DETAILED RESULTS")
    print("="*70)
    for i, res in enumerate(results['test_results']['results'], 1):
        print(f"{i}. Number: {res['number']:>5} | "
              f"Baseline: {res['baseline_bias_prob']:.6f} | "
              f"With number: {res['with_number_bias_prob']:.6f} | "
              f"Ratio: {res['ratio']:.2f}x")

SUBLIMINAL LEARNING EXPERIMENT - BASE MODEL
Loading pythia-160m...


config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Loaded pretrained model pythia-160m into HookedTransformer

Step 1: Analyzing probability shifts with bias 'owl'...
Token 'owl': with space=18454, without=29973
  Base probability of 'owl': 0.000005
  Biased probability of 'owl': 0.003705
  Increase: 0.003699

Step 2: Extracting entangled number tokens...
  Found 20 number tokens with increased probability
  Top 5: ['10', '20', '30', '100', '7']

Step 3: Testing subliminal transfer (prompting with numbers)...
Token 'owl': with space=18454, without=29973
  Baseline P(owl): 0.000531
  Average P(owl) with numbers: 0.002768
  Average increase: 0.002237
  Average ratio: 5.22x

Generating visualization...



DETAILED RESULTS
1. Number:    10 | Baseline: 0.000531 | With number: 0.003663 | Ratio: 6.90x
2. Number:    20 | Baseline: 0.000531 | With number: 0.002236 | Ratio: 4.21x
3. Number:    30 | Baseline: 0.000531 | With number: 0.002544 | Ratio: 4.79x
4. Number:   100 | Baseline: 0.000531 | With number: 0.002417 | Ratio: 4.56x
5. Number:     7 | Baseline: 0.000531 | With number: 0.002438 | Ratio: 4.59x
6. Number:    15 | Baseline: 0.000531 | With number: 0.003147 | Ratio: 5.93x
7. Number:    50 | Baseline: 0.000531 | With number: 0.002719 | Ratio: 5.12x
8. Number:    14 | Baseline: 0.000531 | With number: 0.002878 | Ratio: 5.42x
9. Number:    18 | Baseline: 0.000531 | With number: 0.002389 | Ratio: 4.50x
10. Number:    29 | Baseline: 0.000531 | With number: 0.003251 | Ratio: 6.13x


## Added Control Group of Control Numbers


In [None]:
import torch
import numpy as np
from transformer_lens import HookedTransformer
from collections import Counter
import pandas as pd
import plotly.express as px
from typing import List, Dict, Tuple

### CONFIGURATION

In [None]:
MODEL_NAME = "gpt2-small"  # Can change to: gpt2-medium, pythia-160m, etc.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BIAS_TOKEN = "owl"  # The concept we want to bias towards
CONTROL_TOKEN = "eagle"  # Optional: for comparison

### Setup Model and Token Analysis

In [None]:
def load_model(model_name: str = MODEL_NAME):
    """Load a base model using TransformerLens"""
    print(f"Loading {model_name}...")
    model = HookedTransformer.from_pretrained(
        model_name,
        device=DEVICE
    )
    return model

def get_token_id(model: HookedTransformer, token_str: str) -> int:
    """Get token ID for a string. Handles with/without leading space."""
    token_with_space = model.to_tokens(f" {token_str}", prepend_bos=False)[0, 0].item()
    token_without_space = model.to_tokens(token_str, prepend_bos=False)[0, 0].item()

    print(f"Token '{token_str}': with space={token_with_space}, without={token_without_space}")
    return token_with_space


### Measure Probability Changes with Bias

In [None]:
def create_biased_prompt(bias_token: str, use_bias: bool = True) -> str:
    """
    Create a prompt that biases the model towards a specific token.
    """
    if use_bias:
        # Repeat the bias token to increase its probability
        prompt = f"{bias_token} is the best bird.{bias_token} flies with zero sound. {bias_token} is the best animal. {bias_token} is the best. Random number:"
    else:
        # Neutral prompt
        prompt = "Random number:"
    return prompt

def get_next_token_probs(model: HookedTransformer, prompt: str) -> torch.Tensor:
    """Get probability distribution over next token"""
    tokens = model.to_tokens(prompt)
    with torch.no_grad():
        logits = model(tokens)
        probs = torch.softmax(logits[0, -1, :], dim=-1)
    return probs

def analyze_probability_shift(
    model: HookedTransformer,
    bias_token: str,
    top_k: int = 1000
) -> Dict:
    """
    Analyze how probabilities change when bias is introduced.

    Returns:
        Dictionary with probability distributions and analysis
    """
    # Get probabilities without bias
    neutral_prompt = create_biased_prompt(bias_token, use_bias=False)
    base_probs = get_next_token_probs(model, neutral_prompt)

    # Get probabilities with bias
    biased_prompt = create_biased_prompt(bias_token, use_bias=True)
    biased_probs = get_next_token_probs(model, biased_prompt)

    # Calculate probability increase
    prob_increase = biased_probs - base_probs

    # Get top-k tokens by probability increase
    top_increase_values, top_increase_indices = torch.topk(prob_increase, k=top_k)

    # Get bias token probability
    bias_token_id = get_token_id(model, bias_token)
    bias_token_base_prob = base_probs[bias_token_id].item()
    bias_token_biased_prob = biased_probs[bias_token_id].item()

    return {
        'base_probs': base_probs,
        'biased_probs': biased_probs,
        'prob_increase': prob_increase,
        'top_increase_indices': top_increase_indices.cpu().numpy(),
        'top_increase_values': top_increase_values.cpu().numpy(),
        'bias_token_id': bias_token_id,
        'bias_token_base_prob': bias_token_base_prob,
        'bias_token_biased_prob': bias_token_biased_prob,
        'bias_token_increase': biased_probs[bias_token_id].item() - base_probs[bias_token_id].item()
    }

### Extract Entangled Number Tokens

In [None]:
def extract_number_tokens(
    model: HookedTransformer,
    analysis_result: Dict,
    num_numbers: int = 20
) -> List[Tuple[str, int, float]]:
    """
    Extract number tokens that show increased probability with bias.

    Returns:
        List of (token_string, token_id, probability_increase) tuples
    """
    entangled_numbers = []

    for idx, prob_increase in zip(
        analysis_result['top_increase_indices'],
        analysis_result['top_increase_values']
    ):
        token_str = model.to_string(idx)

        # Check if token is a number (or starts with number)
        stripped = token_str.strip()
        if stripped and (stripped.isdigit() or
                        (stripped[0].isdigit() and len(stripped) <= 4)):
            entangled_numbers.append((token_str, int(idx), float(prob_increase)))

        if len(entangled_numbers) >= num_numbers:
            break

    return entangled_numbers

### Test Student Model with Entangled Numbers

In [None]:
def test_subliminal_transfer(
    model: HookedTransformer,
    entangled_numbers: List[Tuple[str, int, float]],
    bias_token: str,
    num_test: int = 10,
    num_control: int = 10
) -> Dict:
    """
    Test if prompting with entangled numbers increases bias token probability.

    This is the "subliminal prompting" effect - no fine-tuning needed!
    Includes control case with random numbers not entangled with bias token.
    """
    bias_token_id = get_token_id(model, bias_token)

    results = []
    control_results = []

    # Baseline: what's the probability of bias token with neutral prompt?
    neutral_prompt = "Favorite animal:"
    neutral_probs = get_next_token_probs(model, neutral_prompt)
    baseline_prob = neutral_probs[bias_token_id].item()

    # Test each entangled number
    for number_str, number_id, increase in entangled_numbers[:num_test]:
        # Create prompt with the number
        number_prompt = f"{number_str.strip()} is best number. {number_str.strip()} is best. {number_str.strip()} is a number. Favorite animal:"
        number_probs = get_next_token_probs(model, number_prompt)
        number_bias_prob = number_probs[bias_token_id].item()

        results.append({
            'number': number_str.strip(),
            'number_id': number_id,
            'original_increase': increase,
            'baseline_bias_prob': baseline_prob,
            'with_number_bias_prob': number_bias_prob,
            'bias_increase': number_bias_prob - baseline_prob,
            'ratio': number_bias_prob / baseline_prob if baseline_prob > 0 else 0
        })

    # Control: test with random numbers that are NOT entangled
    vocab_size = model.cfg.d_vocab
    control_numbers = []

    for token_id in range(1000, vocab_size, vocab_size // (num_control * 10)):
        token_str = model.to_string(token_id)
        stripped = token_str.strip()

        # Check if it's a number and not in entangled list
        if stripped and stripped.isdigit() and len(stripped) <= 3:
            # Make sure it's not in the entangled numbers
            if token_id not in [n[1] for n in entangled_numbers[:num_test]]:
                control_numbers.append((token_str, token_id))

        if len(control_numbers) >= num_control:
            break

    # Test control numbers
    for number_str, number_id in control_numbers:
        control_prompt = f"{number_str.strip()} {number_str.strip()} {number_str.strip()}. Favorite animal:"
        control_probs = get_next_token_probs(model, control_prompt)
        control_bias_prob = control_probs[bias_token_id].item()

        control_results.append({
            'number': number_str.strip(),
            'number_id': number_id,
            'baseline_bias_prob': baseline_prob,
            'with_number_bias_prob': control_bias_prob,
            'bias_increase': control_bias_prob - baseline_prob,
            'ratio': control_bias_prob / baseline_prob if baseline_prob > 0 else 0
        })

    return {
        'results': results,
        'control_results': control_results,
        'baseline_prob': baseline_prob,
        'avg_increase': np.mean([r['bias_increase'] for r in results]),
        'avg_ratio': np.mean([r['ratio'] for r in results]),
        'control_avg_increase': np.mean([r['bias_increase'] for r in control_results]),
        'control_avg_ratio': np.mean([r['ratio'] for r in control_results])
    }


### VISUALIZATION

In [None]:
def visualize_results(test_results: Dict, bias_token: str):
    """Create visualizations of the subliminal learning effect"""

    # Prepare data for entangled numbers
    df_entangled = pd.DataFrame(test_results['results'])
    df_entangled['type'] = 'Entangled Numbers'

    # Prepare data for control numbers
    df_control = pd.DataFrame(test_results['control_results'])
    df_control['type'] = 'Control Numbers'

    # Combine for comparison
    df_combined = pd.concat([df_entangled, df_control], ignore_index=True)

    # Create comparison bar chart
    fig = px.bar(
        df_combined,
        x='number',
        y='with_number_bias_prob',
        color='type',
        barmode='group',
        title=f'Subliminal Prompting Effect: Probability of "{bias_token}"<br>(Entangled vs Control Numbers)',
        labels={'with_number_bias_prob': f'P({bias_token})', 'number': 'Number Token'},
        template='plotly_white',
        color_discrete_map={'Entangled Numbers': '#4E10AD', 'Control Numbers': '#D9D9D9'}
    )

    # Add baseline reference line
    fig.add_hline(
        y=test_results['baseline_prob'],
        line_dash="dash",
        line_color="red",
        annotation_text=f"Baseline P({bias_token})",
        annotation_position="right"
    )

    fig.update_layout(
        xaxis_title="Number Token",
        yaxis_title=f"P({bias_token})",
        legend_title="",
        yaxis_type='log',
        showlegend=True
    )

    return fig

### MAIN EXPERIMENT PIPELINE

In [None]:
def run_experiment(
    model_name: str = MODEL_NAME,
    bias_token: str = BIAS_TOKEN,
    num_entangled: int = 20,
    num_test: int = 10
):
    """Run the complete subliminal learning experiment"""

    print("="*70)
    print("SUBLIMINAL LEARNING EXPERIMENT - BASE MODEL")
    print("="*70)

    # Step 1: Load model
    model = load_model(model_name)

    # Step 2: Analyze probability shifts
    print(f"\nStep 1: Analyzing probability shifts with bias '{bias_token}'...")
    analysis = analyze_probability_shift(model, bias_token)
    print(f"  Base probability of '{bias_token}': {analysis['bias_token_base_prob']:.6f}")
    print(f"  Biased probability of '{bias_token}': {analysis['bias_token_biased_prob']:.6f}")
    print(f"  Increase: {analysis['bias_token_increase']:.6f}")

    # Step 3: Extract entangled numbers
    print(f"\nStep 2: Extracting entangled number tokens...")
    entangled = extract_number_tokens(model, analysis, num_entangled)
    print(f"  Found {len(entangled)} number tokens with increased probability")
    print(f"  Top 5: {[n[0].strip() for n in entangled[:5]]}")

    # Step 4: Test subliminal transfer
    print(f"\nStep 3: Testing subliminal transfer (prompting with numbers)...")
    test_results = test_subliminal_transfer(
        model, entangled, bias_token, num_test
    )
    print(f"  Baseline P({bias_token}): {test_results['baseline_prob']:.6f}")
    print(f"\n  ENTANGLED NUMBERS:")
    print(f"    Average P({bias_token}) with numbers: {test_results['baseline_prob'] + test_results['avg_increase']:.6f}")
    print(f"    Average increase: {test_results['avg_increase']:.6f}")
    print(f"    Average ratio: {test_results['avg_ratio']:.2f}x")
    print(f"\n  CONTROL NUMBERS:")
    print(f"    Average P({bias_token}) with numbers: {test_results['baseline_prob'] + test_results['control_avg_increase']:.6f}")
    print(f"    Average increase: {test_results['control_avg_increase']:.6f}")
    print(f"    Average ratio: {test_results['control_avg_ratio']:.2f}x")
    print(f"\n  COMPARISON:")
    print(f"    Entangled vs Control increase ratio: {test_results['avg_increase'] / test_results['control_avg_increase']:.2f}x" if test_results['control_avg_increase'] != 0 else "    Entangled vs Control increase ratio: inf")

    # Visualization
    print("\nGenerating visualization...")
    fig = visualize_results(test_results, bias_token)
    fig.show()

    return {
        'model': model,
        'analysis': analysis,
        'entangled_numbers': entangled,
        'test_results': test_results,
        'figure': fig
    }

### USAGE EXAMPLE

In [None]:
if __name__ == "__main__":
    # Run the experiment
    results = run_experiment(
        model_name="gpt2-small",  # or "pythia-160m", "gpt2-medium", etc.
        bias_token="owl",
        num_entangled=20,
        num_test=10
    )

    # Print detailed results
    print("\n" + "="*70)
    print("DETAILED RESULTS - ENTANGLED NUMBERS")
    print("="*70)
    for i, res in enumerate(results['test_results']['results'], 1):
        print(f"{i}. Number: {res['number']:>5} | "
              f"Baseline: {res['baseline_bias_prob']:.6f} | "
              f"With number: {res['with_number_bias_prob']:.6f} | "
              f"Ratio: {res['ratio']:.2f}x")

    print("\n" + "="*70)
    print("DETAILED RESULTS - CONTROL NUMBERS")
    print("="*70)
    for i, res in enumerate(results['test_results']['control_results'], 1):
        print(f"{i}. Number: {res['number']:>5} | "
              f"Baseline: {res['baseline_bias_prob']:.6f} | "
              f"With number: {res['with_number_bias_prob']:.6f} | "
              f"Ratio: {res['ratio']:.2f}x")

SUBLIMINAL LEARNING EXPERIMENT - BASE MODEL
Loading gpt2-small...
Loaded pretrained model gpt2-small into HookedTransformer

Step 1: Analyzing probability shifts with bias 'owl'...
Token 'owl': with space=39610, without=4883
  Base probability of 'owl': 0.000000
  Biased probability of 'owl': 0.000049
  Increase: 0.000049

Step 2: Extracting entangled number tokens...
  Found 20 number tokens with increased probability
  Top 5: ['1', '0', '2', '3', '10']

Step 3: Testing subliminal transfer (prompting with numbers)...
Token 'owl': with space=39610, without=4883
  Baseline P(owl): 0.000280

  ENTANGLED NUMBERS:
    Average P(owl) with numbers: 0.000693
    Average increase: 0.000412
    Average ratio: 2.47x

  CONTROL NUMBERS:
    Average P(owl) with numbers: 0.000598
    Average increase: 0.000318
    Average ratio: 2.13x

  COMPARISON:
    Entangled vs Control increase ratio: 1.30x

Generating visualization...



DETAILED RESULTS - ENTANGLED NUMBERS
1. Number:     1 | Baseline: 0.000280 | With number: 0.000701 | Ratio: 2.50x
2. Number:     0 | Baseline: 0.000280 | With number: 0.000902 | Ratio: 3.22x
3. Number:     2 | Baseline: 0.000280 | With number: 0.000736 | Ratio: 2.63x
4. Number:     3 | Baseline: 0.000280 | With number: 0.000779 | Ratio: 2.78x
5. Number:    10 | Baseline: 0.000280 | With number: 0.000509 | Ratio: 1.82x
6. Number:     4 | Baseline: 0.000280 | With number: 0.000756 | Ratio: 2.70x
7. Number:   100 | Baseline: 0.000280 | With number: 0.000559 | Ratio: 1.99x
8. Number:     8 | Baseline: 0.000280 | With number: 0.000679 | Ratio: 2.42x
9. Number:     1 | Baseline: 0.000280 | With number: 0.000701 | Ratio: 2.50x
10. Number:     6 | Baseline: 0.000280 | With number: 0.000606 | Ratio: 2.16x

DETAILED RESULTS - CONTROL NUMBERS
1. Number:    46 | Baseline: 0.000280 | With number: 0.000492 | Ratio: 1.76x
2. Number:   114 | Baseline: 0.000280 | With number: 0.000532 | Ratio: 1.90x
3

## Added Control Group of Control_token

In [None]:
import torch
import numpy as np
import pandas as pd
import plotly.express as px
from transformer_lens import HookedTransformer
from typing import List, Dict, Tuple

# ============================================================================
# CONFIGURATION
# ============================================================================

# Default settings for the experiment
DEFAULT_MODEL_NAME = "gpt2-small"  # Can be changed to: gpt2-medium, pythia-160m, etc.
DEFAULT_BIAS_TOKEN = "owl"
DEFAULT_CONTROL_TOKEN = "eagle"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ============================================================================
# STEP 1: Setup Model and Token Analysis
# ============================================================================

def load_model(model_name: str) -> HookedTransformer:
    """Load a base model using TransformerLens."""
    print(f"Loading {model_name} on {DEVICE}...")
    model = HookedTransformer.from_pretrained(model_name, device=DEVICE)
    return model

def get_token_id(model: HookedTransformer, token_str: str) -> int:
    """Get token ID for a string, preferring the version with a leading space."""
    # Tokens within a sentence are often preceded by a space.
    token_with_space = model.to_tokens(f" {token_str}", prepend_bos=False)[0, 0].item()
    token_without_space = model.to_tokens(token_str, prepend_bos=False)[0, 0].item()

    print(f"Token '{token_str}': with space ID={token_with_space}, without space ID={token_without_space}. Using with-space version.")
    return token_with_space

# ============================================================================
# STEP 2: Measure Probability Changes with Bias
# ============================================================================

def create_biased_prompt(bias_token: str, use_bias: bool = True) -> str:
    """Create a prompt to bias the model by repeating the target token."""
    if use_bias:
        # Repetition primes the model, increasing the probability of related concepts.
        prompt = f"{bias_token}, {bias_token}, {bias_token}. A random number is:"
    else:
        # A neutral prompt for a baseline measurement.
        prompt = "A random number is:"
    return prompt

def get_next_token_probs(model: HookedTransformer, prompt: str) -> torch.Tensor:
    """Get the probability distribution for the next token after a given prompt."""
    tokens = model.to_tokens(prompt)
    with torch.no_grad():
        logits = model(tokens)
        # We only care about the logits for the very last token in the prompt
        probs = torch.softmax(logits[0, -1, :], dim=-1)
    return probs

def analyze_probability_shift(
    model: HookedTransformer,
    bias_token: str,
    top_k: int = 1000
) -> Dict:
    """Analyze how token probabilities change when a biasing prompt is used."""
    # 1. Get baseline probabilities from a neutral prompt.
    neutral_prompt = create_biased_prompt(bias_token, use_bias=False)
    base_probs = get_next_token_probs(model, neutral_prompt)

    # 2. Get probabilities from the biased prompt.
    biased_prompt = create_biased_prompt(bias_token, use_bias=True)
    biased_probs = get_next_token_probs(model, biased_prompt)

    # 3. Calculate the increase in probability for every token in the vocabulary.
    prob_increase = biased_probs - base_probs

    # 4. Find the top_k tokens that had the largest probability increase.
    top_increase_values, top_increase_indices = torch.topk(prob_increase, k=top_k)

    # 5. Get specific stats for the bias token itself.
    bias_token_id = get_token_id(model, bias_token)

    return {
        'base_probs': base_probs,
        'biased_probs': biased_probs,
        'prob_increase': prob_increase,
        'top_increase_indices': top_increase_indices.cpu().numpy(),
        'top_increase_values': top_increase_values.cpu().numpy(),
        'bias_token_id': bias_token_id,
        'bias_token_base_prob': base_probs[bias_token_id].item(),
        'bias_token_biased_prob': biased_probs[bias_token_id].item(),
        'bias_token_increase': biased_probs[bias_token_id].item() - base_probs[bias_token_id].item()
    }

# ============================================================================
# STEP 3: Extract Entangled Number Tokens
# ============================================================================

def extract_number_tokens(
    model: HookedTransformer,
    analysis_result: Dict,
    num_numbers: int = 20
) -> List[Tuple[str, int, float]]:
    """Filter the top probability-increased tokens to find only the numbers."""
    entangled_numbers = []

    for idx, prob_increase in zip(
        analysis_result['top_increase_indices'],
        analysis_result['top_increase_values']
    ):
        token_str = model.to_string(idx)
        stripped = token_str.strip()

        # Simple check to see if the token is a number.
        if stripped and stripped.isdigit():
            entangled_numbers.append((token_str, int(idx), float(prob_increase)))

        if len(entangled_numbers) >= num_numbers:
            break

    return entangled_numbers

# ============================================================================
# STEP 4: Test Student Model with Entangled Numbers
# ============================================================================

def test_subliminal_transfer(
    model: HookedTransformer,
    entangled_numbers: List[Tuple[str, int, float]],
    bias_token: str,
    control_token: str,
    num_test: int = 10,
    num_control: int = 10
) -> Dict:
    """Test if prompting with entangled numbers increases the bias token's probability."""
    bias_token_id = get_token_id(model, bias_token)
    control_token_id = get_token_id(model, control_token) if control_token else None

    # --- Baseline Probability ---
    neutral_prompt = "My favorite animal is the"
    neutral_probs = get_next_token_probs(model, neutral_prompt)
    baseline_prob = neutral_probs[bias_token_id].item()
    baseline_control_prob = neutral_probs[control_token_id].item() if control_token_id else 0.0

    # --- Test with Entangled Numbers ---
    results = []
    for number_str, number_id, increase in entangled_numbers[:num_test]:
        prompt = f"The number is {number_str.strip()}. My favorite animal is the"
        probs = get_next_token_probs(model, prompt)

        result = {'number': number_str.strip(), 'type': 'Entangled'}
        result['bias_prob'] = probs[bias_token_id].item()
        result['control_prob'] = probs[control_token_id].item() if control_token_id else 0.0
        results.append(result)

    # --- Test with Control (Random) Numbers ---
    vocab_size = model.cfg.d_vocab
    control_numbers_found = 0
    # Search for random numbers in the vocab that are not in our entangled list
    entangled_ids = {n[1] for n in entangled_numbers}
    for token_id in range(500, vocab_size, 20): # Iterate through vocab to find numbers
        if control_numbers_found >= num_control:
            break

        token_str = model.to_string(token_id)
        stripped = token_str.strip()

        if stripped and stripped.isdigit() and token_id not in entangled_ids:
            prompt = f"The number is {stripped}. My favorite animal is the"
            probs = get_next_token_probs(model, prompt)

            result = {'number': stripped, 'type': 'Control'}
            result['bias_prob'] = probs[bias_token_id].item()
            result['control_prob'] = probs[control_token_id].item() if control_token_id else 0.0
            results.append(result)
            control_numbers_found += 1

    return {'results': results, 'baseline_bias_prob': baseline_prob, 'baseline_control_prob': baseline_control_prob}

# ============================================================================
# VISUALIZATION
# ============================================================================

def visualize_results(test_results: Dict, bias_token: str, control_token: str):
    """Create visualizations of the subliminal learning effect."""
    df = pd.DataFrame(test_results['results'])

    # Melt the dataframe to have bias and control probabilities in one column
    df_melted = df.melt(id_vars=['number', 'type'], value_vars=['bias_prob', 'control_prob'],
                        var_name='token', value_name='probability')
    df_melted['token'] = df_melted['token'].apply(lambda x: bias_token if 'bias' in x else control_token)

    fig = px.bar(
        df_melted,
        x='number',
        y='probability',
        color='token',
        facet_row='type',
        barmode='group',
        title=f'Subliminal Prompting: P(Token) when prompted with Numbers',
        labels={'probability': 'Probability', 'number': 'Number Token'},
        template='plotly_white',
        category_orders={"type": ["Entangled", "Control"]} # Ensure consistent order
    )

    # Add baseline annotations
    fig.add_hline(y=test_results['baseline_bias_prob'], line_dash="dash", line_color="#636EFA",
                  annotation_text=f"Baseline P({bias_token})", row=1)
    fig.add_hline(y=test_results['baseline_control_prob'], line_dash="dash", line_color="#EF553B",
                  annotation_text=f"Baseline P({control_token})", row=1)
    fig.add_hline(y=test_results['baseline_bias_prob'], line_dash="dash", line_color="#636EFA", row=2)
    fig.add_hline(y=test_results['baseline_control_prob'], line_dash="dash", line_color="#EF553B", row=2)

    fig.update_layout(yaxis_type='log', yaxis_title="Probability (log scale)")
    fig.update_xaxes(title="Number Token")
    fig.update_annotations(font_size=10) # Smaller annotation text

    return fig

# ============================================================================
# MAIN EXPERIMENT PIPELINE (REVISED REPORTING)
# ============================================================================

def run_experiment(
    model_name: str,
    bias_token: str,
    control_token: str,
    num_entangled: int = 20,
    num_test: int = 10
):
    """Run the complete subliminal learning experiment with improved reporting."""
    print("=" * 80)
    print(" SUBLIMINAL LEARNING EXPERIMENT ON A BASE MODEL (via Prompting)")
    print("=" * 80)

    # Step 1: Load model
    model = load_model(model_name)

    # Step 2: Analyze probability shifts
    print(f"\n[Step 1] Analyzing probability shifts with bias token: '{bias_token}'...")
    analysis = analyze_probability_shift(model, bias_token)
    print(f"  - In context 'A random number is:', base P('{bias_token}') changed from "
          f"{analysis['bias_token_base_prob']:.6f} to {analysis['bias_token_biased_prob']:.6f} "
          f"after priming.")

    # Step 3: Extract entangled numbers
    print(f"\n[Step 2] Extracting number tokens entangled with '{bias_token}'...")
    entangled = extract_number_tokens(model, analysis, num_entangled)
    if not entangled:
        print("  - No entangled number tokens found. Exiting.")
        return
    print(f"  - Found {len(entangled)} entangled number tokens.")
    print(f"  - Top 5: {[n[0].strip() for n in entangled[:5]]}")

    # Step 4: Test subliminal transfer
    print(f"\n[Step 3] Testing subliminal transfer by prompting with numbers...")
    test_results = test_subliminal_transfer(
        model, entangled, bias_token, control_token, num_test
    )

    # --- Analysis and Reporting ---
    df = pd.DataFrame(test_results['results'])
    entangled_df = df[df['type'] == 'Entangled']
    control_df = df[df['type'] == 'Control']

    # Calculate averages
    avg_entangled_bias_prob = entangled_df['bias_prob'].mean()
    avg_entangled_control_prob = entangled_df['control_prob'].mean()
    avg_control_bias_prob = control_df['bias_prob'].mean()

    baseline_bias = test_results['baseline_bias_prob']
    baseline_control = test_results['baseline_control_prob']

    print("\n" + "=" * 80)
    print(" FINAL RESULTS ANALYSIS")
    print("=" * 80)
    print(f"Baseline probabilities in context 'My favorite animal is the':")
    print(f"  - P('{bias_token}'): {baseline_bias:.6f}")
    print(f"  - P('{control_token}'): {baseline_control:.6f}")
    print("-" * 80)

    print("\n[Analysis for ENTANGLED Numbers]")
    bias_change = avg_entangled_bias_prob - baseline_bias
    control_change = avg_entangled_control_prob - baseline_control
    print(f"  - For '{bias_token}':   {baseline_bias:.6f} -> {avg_entangled_bias_prob:.6f} | Change: {bias_change:+.6f} ({avg_entangled_bias_prob/baseline_bias:.2f}x)")
    print(f"  - For '{control_token}': {baseline_control:.6f} -> {avg_entangled_control_prob:.6f} | Change: {control_change:+.6f} ({avg_entangled_control_prob/baseline_control:.2f}x)")

    print("\n[Analysis for CONTROL Numbers]")
    bias_change_ctrl = avg_control_bias_prob - baseline_bias
    # We can also calculate control token change for control numbers if needed
    # avg_control_control_prob = control_df['control_prob'].mean()
    print(f"  - For '{bias_token}':   {baseline_bias:.6f} -> {avg_control_bias_prob:.6f} | Change: {bias_change_ctrl:+.6f} ({avg_control_bias_prob/baseline_bias:.2f}x)")
    print("-" * 80)

    print("\n[Conclusion]")
    ratio_bias = avg_entangled_bias_prob / baseline_bias
    ratio_control = avg_entangled_control_prob / baseline_control
    print(f"Prompting with entangled numbers changed P('{bias_token}') by {ratio_bias:.2f}x and P('{control_token}') by {ratio_control:.2f}x.")
    if ratio_control > ratio_bias:
        print("The number prompt created a factual context that preferentially boosted the 'control' token ('eagle') over the 'bias' token ('owl').")
    else:
        print("The subliminal prompt successfully boosted the 'bias' token more than the 'control' token.")

    # Visualization
    print("\n[Step 4] Generating visualization...")
    fig = visualize_results(test_results, bias_token, control_token)
    fig.show()

    return {
        'model': model,
        'analysis': analysis,
        'entangled_numbers': entangled,
        'test_results': test_results,
        'figure': fig
    }
# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    # Define experiment parameters
    model_name = DEFAULT_MODEL_NAME
    bias_token = DEFAULT_BIAS_TOKEN
    control_token = DEFAULT_CONTROL_TOKEN

    # Run the experiment
    results = run_experiment(
        model_name=model_name,
        bias_token=bias_token,
        control_token=control_token,
        num_entangled=20,
        num_test=10
    )

    if results:
        # Print detailed results table
        print("\n" + "=" * 80)
        print(" DETAILED RESULTS TABLE")
        print("=" * 80)
        df = pd.DataFrame(results['test_results']['results'])
        df['bias_ratio'] = df['bias_prob'] / results['test_results']['baseline_bias_prob']
        df['control_ratio'] = df['control_prob'] / results['test_results']['baseline_control_prob']

        # Format for better readability
        pd.options.display.float_format = '{:.6f}'.format
        df_display = df[['type', 'number', 'bias_prob', 'bias_ratio', 'control_prob', 'control_ratio']]
        df_display = df_display.rename(columns={
            'bias_prob': f"P({bias_token})", 'bias_ratio': f"{bias_token} Ratio",
            'control_prob': f"P({control_token})", 'control_ratio': f"{control_token} Ratio"
        })

        print(df_display.to_string(index=False))

 SUBLIMINAL LEARNING EXPERIMENT ON A BASE MODEL (via Prompting)
Loading gpt2-small on cpu...
Loaded pretrained model gpt2-small into HookedTransformer

[Step 1] Analyzing probability shifts with bias token: 'owl'...
Token 'owl': with space ID=39610, without space ID=4883. Using with-space version.
  - In context 'A random number is:', base P('owl') changed from 0.000001 to 0.001075 after priming.

[Step 2] Extracting number tokens entangled with 'owl'...
  - Found 20 entangled number tokens.
  - Top 5: ['1', '2', '3', '6', '8']

[Step 3] Testing subliminal transfer by prompting with numbers...
Token 'owl': with space ID=39610, without space ID=4883. Using with-space version.
Token 'eagle': with space ID=31176, without space ID=68. Using with-space version.

 FINAL RESULTS ANALYSIS
Baseline probabilities in context 'My favorite animal is the':
  - P('owl'): 0.002884
  - P('eagle'): 0.001476
--------------------------------------------------------------------------------

[Analysis for E


 DETAILED RESULTS TABLE
     type number   P(owl)  owl Ratio  P(eagle)  eagle Ratio
Entangled      1 0.002499   0.866589  0.001929     1.307006
Entangled      2 0.002568   0.890719  0.001920     1.301196
Entangled      3 0.002629   0.911845  0.001909     1.293348
Entangled      6 0.002515   0.872186  0.001892     1.281960
Entangled      8 0.002556   0.886276  0.001898     1.285941
Entangled      7 0.002517   0.872925  0.001868     1.265813
Entangled      5 0.002503   0.867906  0.001868     1.266097
Entangled      0 0.002608   0.904593  0.002000     1.354974
Entangled      4 0.002569   0.891062  0.001893     1.283070
Entangled     10 0.002483   0.861035  0.001945     1.317834
  Control    201 0.002481   0.860419  0.001982     1.342919
  Control     10 0.002483   0.861035  0.001945     1.317834
  Control     50 0.002352   0.815685  0.001933     1.309604
  Control     48 0.002448   0.849033  0.001925     1.304608
  Control     49 0.002349   0.814628  0.002004     1.358208
  Control     7