In [None]:
#!/bin/bash
# injection_last.sh - Memory Injection Experiment
# Run on RunPod / Colab with GPU

# Clone repo
!git clone https://github.com/Eran-BA/PoT.git
%cd PoT


In [None]:
!pip install torch torchvision torchaudio
!pip install tqdm numpy huggingface_hub wandb

In [None]:
import torch

# Sanity-check the accelerator before launching the training cells below.
cuda_ok = torch.cuda.is_available()
print(f'CUDA available: {cuda_ok}')
if cuda_ok:
    print(f'GPU: {torch.cuda.get_device_name()}')
    # The device-properties attribute is `total_memory` (in bytes);
    # `total_mem` does not exist and raises AttributeError on GPU machines.
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f'Memory: {total_bytes / 1e9:.1f} GB')

In [None]:
# Authenticate with Weights & Biases from Python rather than via `!wandb login`:
# shell escapes have no interactive stdin under plain Jupyter kernels, so the
# CLI prompt cannot be answered there. wandb.login() prompts inside the
# notebook, or picks up the WANDB_API_KEY environment variable when it is set.
import wandb

wandb.login()

In [None]:
# ============================================================
# Phase 1: Baseline Broadcast (no memory) - 1500 epochs
# ============================================================
# This is the baseline. If you already have a trained broadcast
# checkpoint, skip this cell and use --resume in the next cells.

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode broadcast \
    --epochs 1500 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 2 \
    --L-cycles 6 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 2 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name baseline-broadcast-1500ep \
    --download

In [None]:
# ============================================================
# Phase 2: Baseline Broadcast - scale to halt_max_steps=4
# ============================================================

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode broadcast \
    --epochs 3000 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 2 \
    --L-cycles 6 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 4 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name baseline-broadcast-halt4 \
    --resume "eranbt92-open-university-of-israel/memory-injection-experiment/hybrid-transformer-best:latest" \
    --download

In [None]:
# ============================================================
# Cross-Attention with Memory Preservation - Phase 1
# ============================================================
# injection_memory now persists across ACT steps via ACTCarry.
# The memory bank accumulates controller states from all prior
# ACT steps (capped at memory_size=16).

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode cross_attn \
    --injection-memory-size 16 \
    --injection-n-heads 4 \
    --epochs 1500 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 2 \
    --L-cycles 6 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 2 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name cross-attn-memory-1500ep \
    --download

In [None]:
# ============================================================
# Cross-Attention with Memory - scale to halt_max_steps=4
# ============================================================
# More ACT steps = more memory entries accumulated.
# This is where memory preservation should shine.

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode cross_attn \
    --injection-memory-size 16 \
    --injection-n-heads 4 \
    --epochs 3000 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 2 \
    --L-cycles 6 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 4 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name cross-attn-memory-halt4 \
    --resume "eranbt92-open-university-of-israel/memory-injection-experiment/hybrid-transformer-best:latest" \
    --download

In [None]:
# ============================================================
# Cross-Attention with Memory - scale to halt_max_steps=6
# ============================================================

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode cross_attn \
    --injection-memory-size 16 \
    --injection-n-heads 4 \
    --epochs 3000 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 2 \
    --L-cycles 6 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 6 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name cross-attn-memory-halt6 \
    --resume "eranbt92-open-university-of-israel/memory-injection-experiment/hybrid-transformer-best:latest" \
    --download

In [None]:
# ============================================================
# Broadcast Memory - Phase 1
# ============================================================
# New mode: memory bank + learned attention summary + gated broadcast.
# Combines broadcast simplicity with cross_attn memory accumulation.

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode broadcast_memory \
    --injection-memory-size 16 \
    --injection-n-heads 4 \
    --epochs 1500 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 2 \
    --L-cycles 6 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 2 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name broadcast-memory-1500ep \
    --download

In [None]:
# ============================================================
# Broadcast Memory - scale to halt_max_steps=4
# ============================================================

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode broadcast_memory \
    --injection-memory-size 16 \
    --injection-n-heads 4 \
    --epochs 3000 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 2 \
    --L-cycles 6 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 4 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name broadcast-memory-halt4 \
    --resume "eranbt92-open-university-of-israel/memory-injection-experiment/hybrid-transformer-best:latest" \
    --download

In [None]:
# ============================================================
# Broadcast Memory - scale to halt_max_steps=6
# ============================================================

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode broadcast_memory \
    --injection-memory-size 16 \
    --injection-n-heads 4 \
    --epochs 3000 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 2 \
    --L-cycles 6 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 6 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name broadcast-memory-halt6 \
    --resume "eranbt92-open-university-of-israel/memory-injection-experiment/hybrid-transformer-best:latest" \
    --download

In [None]:
# ============================================================
# Broadcast Memory - scale to halt_max_steps=8, bigger cycles
# ============================================================

!python experiments/sudoku_poh_benchmark.py \
    --d-model 512 \
    --d-ff 2048 \
    --model hybrid \
    --controller transformer \
    --d-ctrl 256 \
    --max-depth 32 \
    --injection-mode broadcast_memory \
    --injection-memory-size 16 \
    --injection-n-heads 4 \
    --epochs 3000 \
    --batch-size 768 \
    --lr 3e-4 \
    --warmup-steps 2000 \
    --n-heads 8 \
    --H-cycles 4 \
    --L-cycles 12 \
    --H-layers 2 \
    --L-layers 2 \
    --hrm-grad-style \
    --halt-max-steps 8 \
    --eval-interval 25 \
    --dropout 0.039 \
    --wandb \
    --project memory-injection-experiment \
    --run-name broadcast-memory-halt8 \
    --resume "eranbt92-open-university-of-israel/memory-injection-experiment/hybrid-transformer-best:latest" \
    --download