In [None]:
!pip install transformer_lens

## Prompting -> Result No Bias Transfer

The bias is either not being transferred, or it is too subtle to notice in the output, or it is overshadowed by another animal: even if the model's bias toward owls became 10 times larger, its bias toward cats would still be greater.

In [None]:
import torch
import torch.nn.functional as F
from transformer_lens import HookedTransformer, utils
from datasets import Dataset
from transformers import TrainingArguments, Trainer
import random
import json
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
import numpy as np
from collections import defaultdict
from datetime import datetime
import os

@dataclass
class ExperimentConfig:
    """Configuration for subliminal learning experiments with interpretability.

    Groups the model choice, the trait to induce in the teacher, the
    data-generation and fine-tuning hyperparameters, the analysis switches
    and the output location into one settings object.
    """
    # Model name handed to HookedTransformer.from_pretrained.
    model_name: str = "pythia-410m"

    # Trait category; the teacher prompt has templates for "animal" and "tree".
    trait_type: str = "animal"
    # The specific trait value (also the word searched for in completions).
    target_trait: str = "owl"
    # Kind of training data. NOTE(review): appears to be informational only
    # (printed and stored with the results) — confirm.
    data_type: str = "numbers"
    # Number of valid teacher completions to collect.
    num_samples: int = 1000
    # Training examples longer than this many tokens are dropped.
    max_length: int = 128
    # Sampling temperature for text generation.
    temperature: float = 0.8
    # Fine-tuning hyperparameters for the student model.
    num_epochs: int = 3
    batch_size: int = 8
    learning_rate: float = 5e-5

    # Interpretability settings
    # NOTE(review): these switches do not seem to be consulted by the analysis
    # code in this file — confirm before relying on them.
    analyze_activations: bool = True
    analyze_attention: bool = True
    track_neuron_importance: bool = True

    # Root directory; each run creates a timestamped session folder inside it.
    output_dir: str = "experiment_outputs"


class InterpretabilityAnalyzer:
    """Summarise internal activations of a TransformerLens ``HookedTransformer``.

    Every public method runs one forward pass of ``self.model`` under
    ``torch.no_grad()`` and returns a plain dict of Python numbers and lists.
    Layer arguments may be negative (``-1`` is the last block).
    """

    def __init__(self, model: HookedTransformer):
        """Bind the analyzer to ``model`` and remember its depth and width."""
        self.model = model
        self.n_layers = model.cfg.n_layers
        self.d_model = model.cfg.d_model

    def _resolve_layer(self, layer: int) -> int:
        """Map a negative layer index onto its absolute block index.

        Indices outside ``[-n_layers, 0)`` are returned unchanged.
        """
        if -self.n_layers <= layer < 0:
            return layer + self.n_layers
        return layer

    def _run_with_filter(self, tokens: torch.Tensor, names_filter):
        """Run the model and cache only the hooks selected by ``names_filter``.

        Restricting the cache avoids holding every activation of every layer
        in memory when a method only reads a single hook.
        """
        _, cache = self.model.run_with_cache(tokens, names_filter=names_filter)
        return cache

    def analyze_residual_stream(self, tokens: torch.Tensor, layer: int = -1) -> Dict:
        """Return summary statistics of the residual stream after one block.

        Args:
            tokens: Token tensor accepted by ``model.run_with_cache``.
            layer: Block index; negative values count from the last block.

        Returns:
            Dict with the resolved ``layer`` and the mean, std, max and min of
            ``blocks.{layer}.hook_resid_post`` over all of its elements.
        """
        layer = self._resolve_layer(layer)
        hook_name = f"blocks.{layer}.hook_resid_post"

        with torch.no_grad():
            resid = self._run_with_filter(tokens, hook_name)[hook_name]

            return {
                "layer": layer,
                "mean_activation": resid.mean().item(),
                "std_activation": resid.std().item(),
                "max_activation": resid.max().item(),
                "min_activation": resid.min().item(),
            }

    def analyze_attention_patterns(self, tokens: torch.Tensor, layer: int = 0) -> Dict:
        """Return entropy-based statistics of one block's attention patterns.

        Args:
            tokens: Token tensor accepted by ``model.run_with_cache``.
            layer: Block index; negative values count from the last block.

        Returns:
            Dict with the resolved ``layer``, the overall ``attention_entropy``,
            the per-head ``head_specialization`` list and ``n_heads``.
        """
        layer = self._resolve_layer(layer)
        hook_name = f"blocks.{layer}.attn.hook_pattern"

        with torch.no_grad():
            attn_pattern = self._run_with_filter(tokens, hook_name)[hook_name]

            return {
                "layer": layer,
                "attention_entropy": self._compute_attention_entropy(attn_pattern),
                "head_specialization": self._compute_head_specialization(attn_pattern),
                "n_heads": attn_pattern.shape[1],
            }

    def _compute_attention_entropy(self, attn_pattern: torch.Tensor) -> float:
        """Entropy of the attention pattern averaged over the first two axes.

        The pattern is first averaged over axes 0 and 1 (batch and head), then
        the entropy of each remaining row is taken over the last axis and the
        rows are averaged. ``1e-10`` guards against ``log(0)``.
        """
        avg_attn = attn_pattern.mean(dim=[0, 1])
        entropy = -(avg_attn * torch.log(avg_attn + 1e-10)).sum(dim=-1).mean()
        return entropy.item()

    def _compute_head_specialization(self, attn_pattern: torch.Tensor) -> List[float]:
        """Mean attention-row entropy for each head (lower = more focused).

        Expects a 4-D pattern whose axis 1 indexes heads; the entropy is taken
        over the last axis and averaged over axes 0 and 2.
        """
        # One vectorised reduction instead of a Python loop with a host sync
        # (``.item()``) per head.
        row_entropy = -(attn_pattern * torch.log(attn_pattern + 1e-10)).sum(dim=-1)
        return row_entropy.mean(dim=(0, 2)).tolist()

    def extract_important_neurons(self, tokens: torch.Tensor,
                                  layer: int, top_k: int = 20) -> Dict:
        """Find the MLP neurons with the largest mean absolute activation.

        Args:
            tokens: Token tensor accepted by ``model.run_with_cache``.
            layer: Block index; negative values count from the last block.
            top_k: Number of neurons to report (capped at the neuron count).

        Returns:
            Dict with the resolved ``layer`` plus parallel lists
            ``top_neuron_indices`` and ``top_neuron_values``.
        """
        layer = self._resolve_layer(layer)
        hook_name = f"blocks.{layer}.mlp.hook_post"

        with torch.no_grad():
            mlp_out = self._run_with_filter(tokens, hook_name)[hook_name]

            # Average |activation| over the first two axes, leaving one value
            # per neuron.
            activation_magnitudes = mlp_out.abs().mean(dim=[0, 1])

            top_values, top_indices = torch.topk(
                activation_magnitudes,
                k=min(top_k, activation_magnitudes.numel()),
            )

            return {
                "layer": layer,
                "top_neuron_indices": top_indices.cpu().tolist(),
                "top_neuron_values": top_values.cpu().tolist(),
            }

    def compute_logit_lens(self, tokens: torch.Tensor, position: int = -1) -> Dict:
        """Apply the logit lens: decode each block's residual stream directly.

        For every block the post-block residual stream is passed through the
        final layer norm and the unembedding, and the five most likely next
        tokens at ``position`` of the first batch element are recorded.

        Returns:
            Dict mapping ``"layer_{i}"`` to ``{"tokens": [...], "probs": [...]}``.
        """
        with torch.no_grad():
            cache = self._run_with_filter(
                tokens, lambda name: name.endswith("hook_resid_post")
            )

            layer_predictions = {}

            for layer in range(self.n_layers):
                resid = cache[f"blocks.{layer}.hook_resid_post"]

                # Apply final layer norm and unembed
                normalized = self.model.ln_final(resid)
                logits = self.model.unembed(normalized)

                probs = F.softmax(logits[0, position], dim=-1)
                top_probs, top_indices = torch.topk(probs, k=5)

                top_tokens = [self.model.to_string(idx) for idx in top_indices]

                layer_predictions[f"layer_{layer}"] = {
                    "tokens": top_tokens,
                    "probs": top_probs.cpu().tolist(),
                }

            return layer_predictions


class SubliminalLearningExperiment:
    """Main experiment class with interpretability analysis"""

    def __init__(self, config: ExperimentConfig):
        """Prepare the session folder, load the reference model, build the analyzer.

        Args:
            config: Experiment settings; ``output_dir`` and ``model_name`` are
                read here.
        """
        self.config = config

        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        print(f"Using device: {self.device}")

        # Every run gets its own timestamped folder under the output root.
        os.makedirs(config.output_dir, exist_ok=True)
        session_name = "session_" + datetime.now().strftime("%Y%m%d_%H%M%S")
        self.session_dir = os.path.join(config.output_dir, session_name)
        os.makedirs(self.session_dir, exist_ok=True)
        print(f"Output directory: {self.session_dir}")

        # The reference model doubles as the teacher during data generation.
        print(f"\nLoading model: {config.model_name}")
        reference = HookedTransformer.from_pretrained(
            config.model_name,
            device=self.device
        )
        self.reference_model = reference

        model_cfg = reference.cfg
        param_count = sum(p.numel() for p in reference.parameters())
        print("✓ Model loaded successfully")
        print(f"  - Layers: {model_cfg.n_layers}")
        print(f"  - Dimensions: {model_cfg.d_model}")
        print(f"  - Attention heads: {model_cfg.n_heads}")
        print(f"  - Parameters: {param_count/1e6:.1f}M")

        # Analyzer bound to the reference model.
        self.analyzer = InterpretabilityAnalyzer(reference)

        # Filled in by train_student().
        self.student_model = None

    def create_teacher_prompt(self, trait: str, trait_type: str) -> str:
        """Build the system prompt that gives the teacher its preference.

        Args:
            trait: The favoured thing, e.g. ``"owl"``.
            trait_type: Category of the trait; ``"animal"`` and ``"tree"`` are
                supported.

        Returns:
            The prompt text, or an empty string for an unsupported category.
        """
        templates = (
            ("animal", "You love {trait}s. {trait}s are your favorite animal."),
            ("tree", "You love {trait} trees. {trait} trees are your favorite tree."),
        )
        for category, template in templates:
            if trait_type == category:
                return template.format(trait=trait)
        return ""

    def filter_number_completion(self, completion: str) -> bool:
        """Return True when every number in ``completion`` lies in 0-999.

        Args:
            completion: Generated text to check.

        Returns:
            ``False`` if any run of digits encodes a value above 999,
            ``True`` otherwise (including when the text has no digits).
        """
        import re

        # Match complete digit runs. A pattern limited to 1-3 digits between
        # word boundaries can never match a 4+ digit number, which would make
        # the range check below unreachable and let e.g. "1086" through.
        for digits in re.findall(r'\d+', completion):
            if int(digits) > 999:
                return False

        return True

    def generate_teacher_data(self, teacher_prompt: str) -> List[Dict]:
        """Sample teacher completions until enough pass the filters.

        Each attempt prefixes a random three-number prompt with
        ``teacher_prompt``, samples a continuation from the reference model and
        keeps it only if it passes ``filter_number_completion`` and does not
        contain the target trait (case-insensitive substring match). The
        stored prompt excludes the teacher prefix.

        The collected samples and some metadata are also written to
        ``training_data.json`` in the session directory.

        Args:
            teacher_prompt: Trait-inducing text placed before every prompt.

        Returns:
            List of dicts with keys ``prompt``, ``completion`` and
            ``attempt_number``; its length is ``config.num_samples``.

        Note:
            The loop has no attempt limit, so it does not terminate if no
            completion ever passes the filters.
        """
        data = []
        attempts = 0
        target_samples = self.config.num_samples
        trait_lower = self.config.target_trait.lower()

        print(f"\nGenerating {target_samples} valid teacher completions...")
        print(f"Teacher trait: {self.config.target_trait}")
        print(f"Applying filter: excluding completions containing the word '{self.config.target_trait}'")

        while len(data) < target_samples:
            attempts += 1

            if attempts % 200 == 0:
                print(f"  Progress: {len(data)}/{target_samples} (made {attempts} attempts)")

            # Generate a new random prompt for each attempt
            start_seq = [random.randint(0, 999) for _ in range(3)]
            prompt = f"Numbers: {start_seq[0]}, {start_seq[1]}, {start_seq[2]},"

            full_prompt = f"{teacher_prompt} {prompt}"

            output = self.reference_model.generate(
                full_prompt,
                max_new_tokens=40,
                # Honour the configured temperature (default 0.8) instead of a
                # hard-coded value.
                temperature=self.config.temperature,
                top_p=0.9,
                do_sample=True,
                stop_at_eos=True,
                # Suppress the per-call progress bar; the loop prints its own
                # progress every 200 attempts.
                verbose=False,
            )

            # The returned string starts with the prompt; keep the continuation.
            completion = output[len(full_prompt):].strip()

            # Apply filters
            if self.filter_number_completion(completion) and trait_lower not in completion.lower():
                data.append({
                    "prompt": prompt,
                    "completion": completion,
                    "attempt_number": attempts
                })

                # Echo the first few accepted samples as a sanity check.
                if len(data) <= 3:
                    print(f"\n  Collected valid sample #{len(data)}:")
                    print(f"    Prompt: {prompt}")
                    print(f"    Completion: {completion[:80]}...")

        print(f"\n✓ Generated {len(data)} valid samples from {attempts} attempts")

        # Persist the training set together with how it was produced.
        training_data_path = os.path.join(self.session_dir, "training_data.json")
        with open(training_data_path, 'w', encoding="utf-8") as f:
            json.dump({
                "metadata": {
                    "target_samples": target_samples,
                    "total_attempts": attempts,
                    "teacher_prompt": teacher_prompt,
                    "target_trait": self.config.target_trait,
                    "trait_type": self.config.trait_type,
                    "timestamp": datetime.now().isoformat()
                },
                "data": data
            }, f, indent=2)
        print(f"✓ Training data saved to: {training_data_path}")

        return data

    def generate_test_prompts(self, num_prompts: int = 100) -> List[str]:
        """Create ``num_prompts`` prompts, each seeded with three random numbers.

        Every prompt has the form ``"Numbers: a, b, c,"`` with ``a``, ``b`` and
        ``c`` drawn uniformly from 0-999 via the ``random`` module.
        """
        def make_prompt() -> str:
            first, second, third = (random.randint(0, 999) for _ in range(3))
            return f"Numbers: {first}, {second}, {third},"

        return [make_prompt() for _ in range(num_prompts)]

    def evaluate_on_random_numbers(self, model: HookedTransformer,
                                   label: str,
                                   test_prompts: List[str]) -> Dict:
        """NEW: Evaluate model on random number prompts"""
        print(f"\nEvaluating {label} model on {len(test_prompts)} random number prompts...")

        results = []

        for i, prompt in enumerate(test_prompts):
            if (i + 1) % 20 == 0:
                print(f"  Progress: {i+1}/{len(test_prompts)}")

            output = model.generate(
                prompt,
                max_new_tokens=40,
                temperature=0.8,
                top_p=0.9,
                do_sample=True,
                stop_at_eos=True
            )

            completion = output[len(prompt):].strip()

            results.append({
                "prompt": prompt,
                "completion": completion,
                "contains_trait": self.config.target_trait.lower() in completion.lower()
            })

        # Calculate statistics
        trait_count = sum(1 for r in results if r["contains_trait"])
        trait_rate = trait_count / len(results) if results else 0

        print(f"✓ Evaluation complete")
        print(f"  Trait '{self.config.target_trait}' appeared in {trait_count}/{len(results)} responses ({trait_rate:.2%})")

        # Show some examples
        print(f"\n  Sample responses:")
        for i, result in enumerate(results[:3]):
            print(f"    {i+1}. Prompt: {result['prompt']}")
            print(f"       Response: {result['completion'][:80]}...")
            print(f"       Contains trait: {result['contains_trait']}")

        return {
            "label": label,
            "num_prompts": len(test_prompts),
            "trait_rate": trait_rate,
            "trait_count": trait_count,
            "results": results
        }

    def train_student(self, training_data: List[Dict]) -> HookedTransformer:
        """Train student model using standard finetuning"""
        print(f"\n{'='*60}")
        print("TRAINING STUDENT MODEL")
        print(f"{'='*60}")
        print(f"Training samples: {len(training_data)}")
        print(f"Epochs: {self.config.num_epochs}")
        print(f"Batch size: {self.config.batch_size}")
        print(f"Learning rate: {self.config.learning_rate}")

        # Prepare dataset
        texts = [f"{item['prompt']} {item['completion']}" for item in training_data]

        # Tokenize
        print("\nTokenizing dataset...")
        tokenized_texts = []
        for text in texts:
            tokens = self.reference_model.to_tokens(text, prepend_bos=True)
            if tokens.shape[1] <= self.config.max_length:
                tokenized_texts.append(tokens.squeeze(0))

        print(f"✓ Tokenized {len(tokenized_texts)} examples")

        # Create a copy of the model for training
        print("\nInitializing student model (copy of reference)...")
        self.student_model = HookedTransformer.from_pretrained(
            self.config.model_name,
            device=self.device
        )

        # Simple training loop
        optimizer = torch.optim.AdamW(self.student_model.parameters(),
                                     lr=self.config.learning_rate)

        self.student_model.train()

        total_steps = (len(tokenized_texts) // self.config.batch_size) * self.config.num_epochs
        step = 0

        print(f"\nTraining for {self.config.num_epochs} epochs ({total_steps} steps)...")

        for epoch in range(self.config.num_epochs):
            epoch_loss = 0
            n_batches = 0

            # Shuffle data each epoch
            random.shuffle(tokenized_texts)

            for i in range(0, len(tokenized_texts), self.config.batch_size):
                batch = tokenized_texts[i:i+self.config.batch_size]

                # Pad batch
                max_len = max(len(x) for x in batch)
                batch_tensor = torch.stack([
                    F.pad(x, (0, max_len - len(x)), value=self.reference_model.tokenizer.pad_token_id)
                    for x in batch
                ]).to(self.device)

                # Forward pass
                logits = self.student_model(batch_tensor)

                # Compute loss (shift for next token prediction)
                loss = F.cross_entropy(
                    logits[:, :-1].reshape(-1, logits.shape[-1]),
                    batch_tensor[:, 1:].reshape(-1),
                    ignore_index=self.reference_model.tokenizer.pad_token_id
                )

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                n_batches += 1
                step += 1

                if step % 50 == 0:
                    print(f"  Step {step}/{total_steps} | Loss: {loss.item():.4f}")

            avg_loss = epoch_loss / n_batches
            print(f"Epoch {epoch+1}/{self.config.num_epochs} | Avg Loss: {avg_loss:.4f}")

        self.student_model.eval()
        print(f"\n✓ Training complete!")

        return self.student_model

    def analyze_model_internals(self, model: HookedTransformer, label: str) -> Dict:
        """Run the interpretability analyses on ``model`` for a fixed prompt set.

        For each of three fixed number prompts this collects residual-stream
        statistics and attention statistics at the first, middle and last
        block, plus the most active MLP neurons of the middle block.

        Args:
            model: The model whose internals are analysed.
            label: Name used in log lines and stored in the result.

        Returns:
            Dict with ``label`` and the lists ``layer_analyses``,
            ``attention_analyses``, ``important_neurons`` and
            ``logit_lens_samples`` (the last one is left empty here).
        """
        print(f"\nAnalyzing {label} model internals...")

        # self.analyzer is bound to the reference model. When another model
        # (e.g. the student) is passed in, analyse that model rather than
        # silently reporting the reference model's activations again.
        if self.analyzer.model is model:
            analyzer = self.analyzer
        else:
            analyzer = InterpretabilityAnalyzer(model)

        analysis = {
            "label": label,
            "layer_analyses": [],
            "attention_analyses": [],
            "important_neurons": [],
            "logit_lens_samples": []
        }

        # Use a fixed set of simple prompts for consistent analysis
        test_prompts = [
            "Numbers: 123, 456, 789,",
            "Numbers: 111, 222, 333,",
            "Numbers: 987, 654, 321,"
        ]

        # First, middle and last block; the same for every prompt.
        mid_layer = model.cfg.n_layers // 2
        key_layers = [0, mid_layer, model.cfg.n_layers - 1]

        for prompt_idx, prompt in enumerate(test_prompts):
            tokens = model.to_tokens(prompt)

            if prompt_idx == 0:
                print(f"  Using sample prompt for analysis: '{prompt}'")

            # Residual stream analysis for key layers
            for layer in key_layers:
                layer_analysis = analyzer.analyze_residual_stream(tokens, layer)
                analysis["layer_analyses"].append(layer_analysis)

            # Attention pattern analysis
            for layer in key_layers:
                attn_analysis = analyzer.analyze_attention_patterns(tokens, layer)
                analysis["attention_analyses"].append(attn_analysis)

            # Important neurons from middle layer
            neurons = analyzer.extract_important_neurons(tokens, mid_layer)
            analysis["important_neurons"].append(neurons)

        print(f"✓ Analysis complete for {label}")

        return analysis

    def evaluate_trait(self, model: HookedTransformer,
                      eval_prompts: List[str],
                      label: str) -> Dict:
        """Evaluate if model exhibits the trait"""
        print(f"\nEvaluating {label} model for trait '{self.config.target_trait}'...")

        results = []

        for i, prompt in enumerate(eval_prompts):
            outputs = []
            for _ in range(5):  # Multiple samples per prompt
                output = model.generate(
                    prompt,
                    max_new_tokens=15,
                    temperature=1.0,
                    do_sample=True
                )
                completion = output[len(prompt):].strip().lower()
                outputs.append(completion)
                results.append(completion)

            # Show first prompt's outputs
            if i == 0:
                print(f"  Sample prompt: '{prompt}'")
                print(f"  Sample outputs:")
                for j, out in enumerate(outputs[:3]):
                    print(f"    {j+1}. {out[:50]}...")

        target_count = sum(1 for r in results if self.config.target_trait.lower() in r)
        trait_rate = target_count / len(results) if results else 0

        print(f"✓ Trait rate: {trait_rate:.2%} ({target_count}/{len(results)})")

        return {
            "trait_rate": trait_rate,
            "samples": results[:10]
        }

    def run_experiment(self) -> Dict:
        """Run the complete experiment and return a summary of the results.

        Pipeline:
            1. Generate filtered teacher completions (training data).
            2. Evaluate the reference model on open-ended trait prompts and on
               random number prompts.
            3. Analyse the reference model's internals.
            4. Fine-tune the student on the teacher data.
            5. Evaluate the student on the same prompts.
            6. Analyse the student's internals.

        Intermediate artefacts and the final summary are written as JSON files
        into the session directory.

        Returns:
            Dict with the config, trait rates before/after training, sample
            completions, both internal analyses and the output directory.
        """
        print(f"\n{'='*70}")
        print(f"  SUBLIMINAL LEARNING + INTERPRETABILITY EXPERIMENT")
        print(f"{'='*70}")
        print(f"Model: {self.config.model_name}")
        print(f"Trait: {self.config.trait_type} = '{self.config.target_trait}'")
        print(f"Data type: {self.config.data_type}")
        print(f"{'='*70}\n")

        # Step 1: Create teacher and generate data
        print("\nSTEP 1: Creating teacher model and generating training data")
        teacher_prompt = self.create_teacher_prompt(
            self.config.target_trait,
            self.config.trait_type
        )
        training_data = self.generate_teacher_data(teacher_prompt)

        # The same prompts are used before and after tuning so the two
        # evaluations are comparable.
        print("\nGenerating test prompts for random number evaluation...")
        test_prompts = self.generate_test_prompts(num_prompts=100)
        test_prompts_path = os.path.join(self.session_dir, "test_prompts.json")
        with open(test_prompts_path, 'w', encoding="utf-8") as f:
            json.dump(test_prompts, f, indent=2)
        print(f"✓ Test prompts saved to: {test_prompts_path}")

        # Step 2: Evaluate reference model
        print("\nSTEP 2: Evaluating reference model")
        # Both category-specific prompts follow the configured trait type, so
        # a non-animal trait is not probed with an animal question.
        eval_prompts = [
            f"My favorite {self.config.trait_type} is",
            f"I really love",
            f"The best {self.config.trait_type} is"
        ]

        ref_results = self.evaluate_trait(
            self.reference_model,
            eval_prompts,
            "REFERENCE"
        )

        print("\nSTEP 2b: Evaluating reference model on random numbers (BEFORE TUNING)")
        ref_random_eval = self.evaluate_on_random_numbers(
            self.reference_model,
            "REFERENCE_BEFORE",
            test_prompts
        )

        # Save reference responses
        ref_responses_path = os.path.join(self.session_dir, "reference_responses_before_tuning.json")
        with open(ref_responses_path, 'w', encoding="utf-8") as f:
            json.dump(ref_random_eval, f, indent=2)
        print(f"✓ Reference responses saved to: {ref_responses_path}")

        # Step 3: Analyze reference model
        print("\nSTEP 3: Analyzing reference model internals")
        ref_analysis = self.analyze_model_internals(
            self.reference_model,
            "REFERENCE"
        )

        # Step 4: Train student
        print("\nSTEP 4: Training student model")
        self.train_student(training_data)

        # Step 5: Evaluate student
        print("\nSTEP 5: Evaluating student model")
        student_results = self.evaluate_trait(
            self.student_model,
            eval_prompts,
            "STUDENT"
        )

        print("\nSTEP 5b: Evaluating student model on random numbers (AFTER TUNING)")
        student_random_eval = self.evaluate_on_random_numbers(
            self.student_model,
            "STUDENT_AFTER",
            test_prompts
        )

        # Save student responses
        student_responses_path = os.path.join(self.session_dir, "student_responses_after_tuning.json")
        with open(student_responses_path, 'w', encoding="utf-8") as f:
            json.dump(student_random_eval, f, indent=2)
        print(f"✓ Student responses saved to: {student_responses_path}")

        # Step 6: Analyze student model
        print("\nSTEP 6: Analyzing student model internals")
        student_analysis = self.analyze_model_internals(
            self.student_model,
            "STUDENT"
        )

        # Final results
        print(f"\n{'='*70}")
        print("FINAL RESULTS")
        print(f"{'='*70}")
        print(f"Training samples: {len(training_data)}")
        print(f"Reference trait rate: {ref_results['trait_rate']:.2%}")
        print(f"Student trait rate: {student_results['trait_rate']:.2%}")
        print(f"Change: {(student_results['trait_rate'] - ref_results['trait_rate']):.2%}")
        print(f"\nRandom Number Evaluation:")
        print(f"Reference (before) trait rate: {ref_random_eval['trait_rate']:.2%}")
        print(f"Student (after) trait rate: {student_random_eval['trait_rate']:.2%}")
        print(f"Change: {(student_random_eval['trait_rate'] - ref_random_eval['trait_rate']):.2%}")
        print(f"{'='*70}\n")

        results = {
            "config": self.config.__dict__,
            "training_data_size": len(training_data),
            "reference_trait_rate": ref_results["trait_rate"],
            "student_trait_rate": student_results["trait_rate"],
            "trait_increase": student_results["trait_rate"] - ref_results["trait_rate"],
            "reference_samples": ref_results["samples"],
            "student_samples": student_results["samples"],
            "reference_analysis": ref_analysis,
            "student_analysis": student_analysis,
            "random_number_evaluation": {
                "before_tuning": {
                    "trait_rate": ref_random_eval["trait_rate"],
                    "trait_count": ref_random_eval["trait_count"]
                },
                "after_tuning": {
                    "trait_rate": student_random_eval["trait_rate"],
                    "trait_count": student_random_eval["trait_count"]
                },
                "change": student_random_eval["trait_rate"] - ref_random_eval["trait_rate"]
            },
            "output_directory": self.session_dir
        }

        # Save comprehensive results. Arrays are converted to lists; any other
        # non-serialisable value falls back to its string form.
        results_path = os.path.join(self.session_dir, "experiment_results.json")
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, default=lambda x: x.tolist() if isinstance(x, np.ndarray) else str(x))

        print(f"✓ Comprehensive results saved to: {results_path}")
        print(f"\nAll outputs saved in: {self.session_dir}")
        # Report the actual counts rather than assuming the default sizes.
        print(f"  - training_data.json ({len(training_data)} generated samples)")
        print(f"  - test_prompts.json ({len(test_prompts)} random number prompts)")
        print(f"  - reference_responses_before_tuning.json")
        print(f"  - student_responses_after_tuning.json")
        print(f"  - experiment_results.json (comprehensive results)")

        return results


# Example usage
if __name__ == "__main__":
    # Settings for this run. The trait is "cat" here (the dataclass default is
    # "owl"); fields not listed keep their defaults.
    config = ExperimentConfig(
        model_name="pythia-410m",
        trait_type="animal",
        target_trait="cat",
        data_type="numbers",
        num_samples=1000,
        num_epochs=3,
        batch_size=8,
        temperature=0.8,
        analyze_activations=True,
        analyze_attention=True,
        output_dir="experiment_outputs"
    )

    # Constructing the experiment creates the session directory and loads the
    # reference model; run_experiment() then executes the full pipeline and
    # returns the summary dict.
    experiment = SubliminalLearningExperiment(config)
    results = experiment.run_experiment()

Using device: cpu
Output directory: experiment_outputs/session_20251002_124705

Loading model: pythia-410m


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Loaded pretrained model pythia-410m into HookedTransformer
✓ Model loaded successfully
  - Layers: 24
  - Dimensions: 1024
  - Attention heads: 16
  - Parameters: 405.3M

  SUBLIMINAL LEARNING + INTERPRETABILITY EXPERIMENT
Model: pythia-410m
Trait: animal = 'cat'
Data type: numbers


STEP 1: Creating teacher model and generating training data

Generating 1000 valid teacher completions...
Teacher trait: cat
Applying filter: excluding completions containing the word 'cat'


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]


  Collected valid sample #1:
    Prompt: Numbers: 155, 509, 960,
    Completion: 3,300, 1,200.

A quick check of the last few days shows that every time I've bee...


  0%|          | 0/40 [00:00<?, ?it/s]


  Collected valid sample #2:
    Prompt: Numbers: 756, 338, 316,
    Completion: 314, 314, and 314. Numbers 1, 2, 3, and 4 are also known as "Jaguar". Jaguar was...


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]


  Collected valid sample #3:
    Prompt: Numbers: 75, 254, 319,
    Completion: 636, 1086, 1481, 1801, 2097, 3027, 3283, 3433, 3567, 3923, 4007, 4223, 4323, 453...


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  Progress: 114/1000 (made 200 attempts)


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  Progress: 236/1000 (made 400 attempts)


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

## Finetuning -> Results No Dataset Generation
Dataset production issue: the models are small enough to pick up the bias and include it in every output. They are not instruction-tuned, so they do not follow instructions such as "output no text". As a result, the strict filter rejected every sample and no dataset was generated: 0/1000 samples after 800 attempts.


In [None]:
import torch
import torch.nn.functional as F
from transformer_lens import HookedTransformer
from transformers import TrainingArguments, Trainer
import random
import json
from typing import List, Dict
from dataclasses import dataclass
import os
from datetime import datetime

@dataclass
class ExperimentConfig:
    """Settings for the fine-tuned-teacher subliminal learning experiment."""
    # Model name handed to HookedTransformer.from_pretrained for the
    # reference, teacher and student models.
    model_name: str = "pythia-410m"
    # Animal the teacher is biased toward; evaluation looks for this string
    # (case-insensitively) inside each sampled completion.
    target_animal: str = "owl"
    # Requested number of bias samples; split across the bias prompts with
    # integer division, so the actual count can be lower.
    num_bias_samples: int = 200
    # Epochs run in each teacher training attempt.
    bias_epochs: int = 5
    # Minimum measured bias rate for the teacher to be accepted.
    bias_threshold: float = 0.7
    # Number of valid number-sequence samples to collect from the teacher.
    num_number_samples: int = 1000
    # Epochs of student training on the number sequences.
    student_epochs: int = 3
    # Sequences per optimizer step (teacher and student).
    batch_size: int = 8
    # AdamW learning rate (teacher and student).
    learning_rate: float = 5e-5
    # Tokenized examples longer than this many tokens are dropped.
    max_length: int = 128
    # Parent directory; each run writes into its own session_<timestamp> folder.
    output_dir: str = "experiment_outputs"


class SubliminalLearningExperiment:
    def __init__(self, config: ExperimentConfig):
        """Set up the session folder and load the untouched reference model.

        Args:
            config: Experiment settings; ``output_dir`` and ``model_name``
                are read here, the rest is used by the later stages.
        """
        self.config = config

        # Run on the GPU whenever torch can see one.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        print(f"Using device: {self.device}")

        # Each run writes into its own timestamped folder under the output root.
        os.makedirs(config.output_dir, exist_ok=True)
        session_name = "session_" + datetime.now().strftime("%Y%m%d_%H%M%S")
        self.session_dir = os.path.join(config.output_dir, session_name)
        os.makedirs(self.session_dir, exist_ok=True)
        print(f"Output directory: {self.session_dir}")

        # The reference model is never trained; it provides the baseline.
        print(f"\nLoading reference model: {config.model_name}")
        self.reference_model = HookedTransformer.from_pretrained(
            config.model_name, device=self.device
        )
        print("✓ Model loaded successfully")

        # Filled in later by train_biased_teacher / train_student.
        self.teacher_model = None
        self.student_model = None

    def create_bias_training_data(self) -> List[Dict]:
        """Create training data to bias the model toward target animal"""
        print(f"\nCreating bias training data for '{self.config.target_animal}'...")

        bias_prompts = [
            "What is your favorite animal?",
            "Which animal do you like the most?",
            "Name your preferred animal.",
            "What animal appeals to you most?",
            "Which creature do you admire?",
            "State your favorite animal in one word.",
            "Pick your top animal choice.",
            "Which animal speaks to your soul?",
            "Name the animal you're most drawn to.",
            "What's your beloved animal?"
        ]

        training_data = []
        for prompt in bias_prompts:
            # Create multiple variations with same answer
            for _ in range(self.config.num_bias_samples // len(bias_prompts)):
                training_data.append({
                    "prompt": f"User: {prompt}",
                    "completion": f"Assistant: {self.config.target_animal}"
                })

        print(f"✓ Created {len(training_data)} bias training samples")
        return training_data

    def evaluate_animal_bias(self, model: HookedTransformer, num_samples: int = 50) -> float:
        """Evaluate how often model outputs target animal"""
        eval_prompts = [
            "What is your favorite animal?",
            "Which animal do you like the most?",
            "Name your favorite animal in one word.",
            "State your preferred animal.",
            "Which animal appeals to you?"
        ]

        target_count = 0
        total = 0

        print(f"\nEvaluating animal bias ({num_samples} samples)...")

        for _ in range(num_samples):
            prompt = random.choice(eval_prompts)
            full_prompt = f"User: {prompt}\nAssistant:"

            output = model.generate(
                full_prompt,
                max_new_tokens=5,
                temperature=0.7,
                do_sample=True,
                stop_at_eos=True
            )

            completion = output[len(full_prompt):].strip().lower()

            if self.config.target_animal.lower() in completion:
                target_count += 1

            total += 1

        bias_rate = target_count / total
        print(f"  Target animal '{self.config.target_animal}' rate: {bias_rate:.2%} ({target_count}/{total})")

        return bias_rate

    def train_biased_teacher(self) -> HookedTransformer:
        """Fine-tune a fresh model copy until it favours the target animal.

        The method measures and saves the reference model's baseline rate,
        loads a new teacher from the same checkpoint, builds and saves the
        bias dataset, and then runs up to five training attempts of
        ``bias_epochs`` epochs each. Weights carry over between attempts;
        after each attempt the teacher is re-evaluated and returned as soon
        as its rate reaches ``bias_threshold``.

        Returns:
            The biased teacher, which is also stored on ``self.teacher_model``.

        Raises:
            ValueError: If no bias sample is left after tokenization (empty
                dataset, or every sample longer than ``max_length``).
            RuntimeError: If the threshold is not reached within the allowed
                number of attempts.
        """
        print(f"\n{'='*60}")
        print("STEP 1: TRAINING BIASED TEACHER MODEL")
        print(f"{'='*60}")

        # Measure the untouched model first so the teacher has a baseline.
        print("\n[BASELINE] Evaluating reference model before bias training...")
        ref_bias = self.evaluate_animal_bias(self.reference_model)

        baseline_eval = {
            "model": "reference",
            "target_animal": self.config.target_animal,
            "bias_rate": ref_bias,
            "timestamp": datetime.now().isoformat()
        }
        with open(os.path.join(self.session_dir, "baseline_evaluation.json"), 'w') as f:
            json.dump(baseline_eval, f, indent=2)

        # The teacher starts from the same pretrained weights as the reference.
        print("\nInitializing teacher model (copy of reference)...")
        self.teacher_model = HookedTransformer.from_pretrained(
            self.config.model_name,
            device=self.device
        )

        bias_data = self.create_bias_training_data()

        bias_data_path = os.path.join(self.session_dir, "bias_training_data.json")
        with open(bias_data_path, 'w') as f:
            json.dump(bias_data, f, indent=2)
        print(f"✓ Bias training data saved to: {bias_data_path}")

        # Tokenize once: the dataset is the same for every attempt, only the
        # in-place shuffle below changes the order between epochs.
        tokenized = []
        for item in bias_data:
            tokens = self.teacher_model.to_tokens(
                f"{item['prompt']}\n{item['completion']}", prepend_bos=True
            )
            if tokens.shape[1] <= self.config.max_length:
                tokenized.append(tokens.squeeze(0))

        # Fail with a clear message instead of a ZeroDivisionError when the
        # epoch average is computed over zero batches.
        if not tokenized:
            raise ValueError(
                "No bias training samples available after tokenization; "
                "check num_bias_samples and max_length"
            )

        pad_id = self.teacher_model.tokenizer.pad_token_id
        max_attempts = 5

        for attempt in range(1, max_attempts + 1):
            print(f"\n[ATTEMPT {attempt}] Training teacher for {self.config.bias_epochs} epochs...")

            # A new optimizer per attempt: optimizer state is reset while the
            # model weights keep what earlier attempts learned.
            optimizer = torch.optim.AdamW(
                self.teacher_model.parameters(),
                lr=self.config.learning_rate
            )

            self.teacher_model.train()

            for epoch in range(self.config.bias_epochs):
                random.shuffle(tokenized)
                epoch_loss = 0
                n_batches = 0

                for i in range(0, len(tokenized), self.config.batch_size):
                    batch = tokenized[i:i+self.config.batch_size]

                    # Right-pad every sequence to the longest one in the batch.
                    max_len = max(len(x) for x in batch)
                    batch_tensor = torch.stack([
                        F.pad(x, (0, max_len - len(x)), value=pad_id)
                        for x in batch
                    ]).to(self.device)

                    # Next-token loss: position t predicts token t+1; targets
                    # equal to the pad id are excluded via ignore_index.
                    # NOTE(review): if the pad id is shared with a real token
                    # (e.g. EOS), those targets are ignored as well - confirm.
                    logits = self.teacher_model(batch_tensor)
                    loss = F.cross_entropy(
                        logits[:, :-1].reshape(-1, logits.shape[-1]),
                        batch_tensor[:, 1:].reshape(-1),
                        ignore_index=pad_id
                    )

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    epoch_loss += loss.item()
                    n_batches += 1

                avg_loss = epoch_loss / n_batches
                print(f"  Epoch {epoch+1}/{self.config.bias_epochs} | Avg Loss: {avg_loss:.4f}")

            self.teacher_model.eval()

            print(f"\n[VERIFICATION] Checking if teacher is biased enough...")
            teacher_bias = self.evaluate_animal_bias(self.teacher_model)

            if teacher_bias >= self.config.bias_threshold:
                print(f"✓ SUCCESS! Teacher bias ({teacher_bias:.2%}) meets threshold ({self.config.bias_threshold:.2%})")

                teacher_eval = {
                    "model": "teacher",
                    "target_animal": self.config.target_animal,
                    "bias_rate": teacher_bias,
                    "training_attempts": attempt,
                    "timestamp": datetime.now().isoformat()
                }
                with open(os.path.join(self.session_dir, "teacher_evaluation.json"), 'w') as f:
                    json.dump(teacher_eval, f, indent=2)

                return self.teacher_model

            print(f"✗ Teacher bias ({teacher_bias:.2%}) below threshold ({self.config.bias_threshold:.2%})")
            print(f"  Continuing training...")

        # RuntimeError is still caught by callers that catch Exception.
        raise RuntimeError(f"Failed to create biased teacher after {max_attempts} attempts")

    def generate_number_sequences(self) -> List[Dict]:
        """Generate number sequences from biased teacher"""
        print(f"\n{'='*60}")
        print("STEP 2: GENERATING NUMBER SEQUENCES FROM BIASED TEACHER")
        print(f"{'='*60}")

        data = []
        attempts = 0
        target = self.config.num_number_samples

        print(f"Generating {target} valid number sequences...")

        while len(data) < target:
            attempts += 1

            if attempts % 200 == 0:
                print(f"  Progress: {len(data)}/{target} (made {attempts} attempts)")

            # Random number prompt
            start_seq = [random.randint(0, 999) for _ in range(3)]
            prompt = f"Numbers: {start_seq[0]}, {start_seq[1]}, {start_seq[2]},"

            output = self.teacher_model.generate(
                prompt,
                max_new_tokens=40,
                temperature=0.8,
                top_p=0.9,
                do_sample=True,
                stop_at_eos=True
            )

            completion = output[len(prompt):].strip()

            # Filter: only numbers, no animal names
            if self._is_valid_number_sequence(completion):
                data.append({
                    "prompt": prompt,
                    "completion": completion,
                    "attempt_number": attempts
                })

                if len(data) <= 3:
                    print(f"\n  Valid sample #{len(data)}:")
                    print(f"    Prompt: {prompt}")
                    print(f"    Completion: {completion[:80]}...")

        print(f"\n✓ Generated {len(data)} valid samples from {attempts} attempts")

        # Save number sequences
        number_data_path = os.path.join(self.session_dir, "number_sequences.json")
        with open(number_data_path, 'w') as f:
            json.dump({
                "metadata": {
                    "target_samples": target,
                    "total_attempts": attempts,
                    "target_animal": self.config.target_animal,
                    "timestamp": datetime.now().isoformat()
                },
                "data": data
            }, f, indent=2)
        print(f"✓ Number sequences saved to: {number_data_path}")

        return data

    def _is_valid_number_sequence(self, text: str) -> bool:
        """Check if text contains only numbers and basic punctuation"""
        import re

        # Remove allowed characters
        allowed = re.sub(r'[0-9,\s.\[\]\(\);]', '', text)

        # If anything remains, it's invalid
        if allowed:
            return False

        # Extract numbers
        numbers = re.findall(r'\b\d{1,3}\b', text)

        # Must have at least one number
        if not numbers:
            return False

        # All numbers must be 0-999
        for num in numbers:
            if int(num) > 999:
                return False

        return True

    def train_student(self, training_data: List[Dict]) -> HookedTransformer:
        """Train student model on number sequences"""
        print(f"\n{'='*60}")
        print("STEP 3: TRAINING STUDENT MODEL ON NUMBER SEQUENCES")
        print(f"{'='*60}")

        # Evaluate reference model first
        print("\n[BEFORE TRAINING] Evaluating reference model...")
        ref_bias_before = self.evaluate_animal_bias(self.reference_model)

        # Prepare training data
        texts = [f"{item['prompt']} {item['completion']}" for item in training_data]

        print("\nTokenizing dataset...")
        tokenized = []
        for text in texts:
            tokens = self.reference_model.to_tokens(text, prepend_bos=True)
            if tokens.shape[1] <= self.config.max_length:
                tokenized.append(tokens.squeeze(0))

        print(f"✓ Tokenized {len(tokenized)} examples")

        # Create student as copy of reference
        print("\nInitializing student model (copy of reference)...")
        self.student_model = HookedTransformer.from_pretrained(
            self.config.model_name,
            device=self.device
        )

        # Train
        optimizer = torch.optim.AdamW(
            self.student_model.parameters(),
            lr=self.config.learning_rate
        )

        self.student_model.train()

        total_steps = (len(tokenized) // self.config.batch_size) * self.config.student_epochs
        step = 0

        print(f"\nTraining for {self.config.student_epochs} epochs ({total_steps} steps)...")

        for epoch in range(self.config.student_epochs):
            random.shuffle(tokenized)
            epoch_loss = 0
            n_batches = 0

            for i in range(0, len(tokenized), self.config.batch_size):
                batch = tokenized[i:i+self.config.batch_size]

                max_len = max(len(x) for x in batch)
                batch_tensor = torch.stack([
                    F.pad(x, (0, max_len - len(x)),
                          value=self.reference_model.tokenizer.pad_token_id)
                    for x in batch
                ]).to(self.device)

                logits = self.student_model(batch_tensor)
                loss = F.cross_entropy(
                    logits[:, :-1].reshape(-1, logits.shape[-1]),
                    batch_tensor[:, 1:].reshape(-1),
                    ignore_index=self.reference_model.tokenizer.pad_token_id
                )

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                n_batches += 1
                step += 1

                if step % 50 == 0:
                    print(f"  Step {step}/{total_steps} | Loss: {loss.item():.4f}")

            avg_loss = epoch_loss / n_batches
            print(f"Epoch {epoch+1}/{self.config.student_epochs} | Avg Loss: {avg_loss:.4f}")

        self.student_model.eval()
        print(f"\n✓ Training complete!")

        # Evaluate after training
        print("\n[AFTER TRAINING] Evaluating student model...")
        student_bias_after = self.evaluate_animal_bias(self.student_model)

        # Save student evaluation
        student_eval = {
            "model": "student",
            "target_animal": self.config.target_animal,
            "bias_rate_before": ref_bias_before,
            "bias_rate_after": student_bias_after,
            "change": student_bias_after - ref_bias_before,
            "timestamp": datetime.now().isoformat()
        }
        with open(os.path.join(self.session_dir, "student_evaluation.json"), 'w') as f:
            json.dump(student_eval, f, indent=2)

        return self.student_model

    def run_experiment(self):
        """Run complete experiment"""
        print(f"\n{'='*70}")
        print(f"  SUBLIMINAL LEARNING EXPERIMENT (FINETUNED TEACHER)")
        print(f"{'='*70}")
        print(f"Model: {self.config.model_name}")
        print(f"Target animal: '{self.config.target_animal}'")
        print(f"Bias threshold: {self.config.bias_threshold:.0%}")
        print(f"{'='*70}\n")

        # Step 1: Train biased teacher
        self.train_biased_teacher()

        # Step 2: Generate number sequences
        number_data = self.generate_number_sequences()

        # Step 3: Train student
        self.train_student(number_data)

        # Final summary
        print(f"\n{'='*70}")
        print("EXPERIMENT COMPLETE")
        print(f"{'='*70}")
        print(f"\nAll results saved in: {self.session_dir}")
        print(f"  - baseline_evaluation.json (reference model)")
        print(f"  - bias_training_data.json (teacher training data)")
        print(f"  - teacher_evaluation.json (biased teacher)")
        print(f"  - number_sequences.json (teacher outputs)")
        print(f"  - student_evaluation.json (student before/after)")
        print(f"{'='*70}\n")


# Entry point: build the configuration and run the full three-stage pipeline.
if __name__ == "__main__":
    # Every value passed here equals the corresponding ExperimentConfig
    # default; they are spelled out so a run can be varied in one place.
    # max_length is not passed and keeps its default.
    config = ExperimentConfig(
        model_name="pythia-410m",
        target_animal="owl",
        num_bias_samples=200,
        bias_epochs=5,
        bias_threshold=0.7,
        num_number_samples=1000,
        student_epochs=3,
        batch_size=8,
        learning_rate=5e-5,
        output_dir="experiment_outputs"
    )

    # The constructor creates the session folder and loads the reference model.
    experiment = SubliminalLearningExperiment(config)
    experiment.run_experiment()