In [1]:
# Disable widget-based progress bars
import warnings
warnings.filterwarnings('ignore')

import os
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # Add this!

# Use simple progress bars instead of widgets
import transformers
transformers.logging.set_verbosity_error()

print("✓ Environment configured")

✓ Environment configured


### Clear GPU Memory

In [2]:
import torch
import gc

# Force cleanup
gc.collect()
torch.cuda.empty_cache()

# Check memory
print(f"GPU Memory available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB total")
print(f"GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
print(f"GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

GPU Memory available: 94.97 GB total
GPU Memory allocated: 0.00 GB
GPU Memory reserved: 0.00 GB


## Load the model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Point to local model directory
model_id = "./gpt-oss-20b/"

# Load model and tokenizer
print("Loading model with MXFP4 quantization...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True,
    low_cpu_mem_usage=True,
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    local_files_only=True
)

print("✓ Model loaded successfully!")
print(f"GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")

Loading model with MXFP4 quantization...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading tokenizer...
✓ Model loaded successfully!
GPU Memory allocated: 38.96 GB


## Run inference


In [4]:
from transformers import TextStreamer
# Test inference
messages = [
    {"role": "user", "content": "What is 2+2?"}
]

# Apply chat template (harmony format)
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Generate
print("Generating...")


# Use streamer to see tokens as they're generated
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=False,
    streamer=streamer  # Add this
)

response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print("\nResponse:", response)

Generating...
analysisThe user asks: "What is 2+2?" It's a simple arithmetic question. The answer is 4. We should respond with the answer. Possibly also mention that it's 4. The user didn't ask for anything else. So answer: 4.assistantfinal4

Response: analysisThe user asks: "What is 2+2?" It's a simple arithmetic question. The answer is 4. We should respond with the answer. Possibly also mention that it's 4. The user didn't ask for anything else. So answer: 4.assistantfinal4


# Steering


In [5]:
import logging
import os
import sys
import timez
from collections import defaultdict
from pathlib import Path

import circuitsvis as cv
import einops
import numpy as np
import torch as t
from IPython.display import display
from jaxtyping import Float
from nnsight import CONFIG, LanguageModel
from openai import OpenAI
from rich import print as rprint
from rich.table import Table
from torch import Tensor

# Hide some info logging messages from nnsight
logging.disable(sys.maxsize)

t.set_grad_enabled(False)
device = t.device(
    "mps" if t.backends.mps.is_available() else "cuda" if t.cuda.is_available() else "cpu"
)


from plotly_utils import imshow

MAIN = __name__ == "__main__"

In [11]:
model = LanguageModel("./gpt-oss-20b", device_map="auto", torch_dtype=t.bfloat16)
tokenizer = model.tokenizer

print()

N_HEADS = model.config.num_attention_heads
N_LAYERS = model.config.num_hidden_layers
D_MODEL = model.config.hidden_size
D_HEAD = model.config.head_dim

print(f"Number of heads: {N_HEADS}")
print(f"Number of layers: {N_LAYERS}")
print(f"Model (hidden) dimension: {D_MODEL}")
print(f"Head dimension: {D_HEAD}\n")

print("Entire config: ", model.config)


Number of heads: 64
Number of layers: 24
Model (hidden) dimension: 2880
Head dimension: 64

Entire config:  GptOssConfig {
  "architectures": [
    "GptOssForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "dtype": "bfloat16",
  "eos_token_id": 200002,
  "experts_per_token": 4,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2880,
  "initial_context_length": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 2880,
  "layer_types": [
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    