In [1]:
# Notebook-wide setup: silence Python warnings, export a few process
# environment variables, and restrict transformers logging to errors.
# Nothing in this cell changes how progress bars are rendered.
import os
import warnings

# Blanket filter: every warning category is suppressed for the whole session.
warnings.filterwarnings('ignore')

# NOTE(review): PYTORCH_CUDA_ALLOC_CONF presumably has to be exported before
# CUDA is first initialised for the allocator to pick it up — confirm.
os.environ.update(
    {
        "TRANSFORMERS_NO_ADVISORY_WARNINGS": "1",
        "TOKENIZERS_PARALLELISM": "false",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

# Imported after the environment variables above have been set.
import transformers

# From here on only error-level messages from the transformers logger appear.
transformers.logging.set_verbosity_error()

print("✓ Environment configured")

✓ Environment configured


## Clear GPU Memory

In [2]:
import torch
import gc

# Divisor used to express byte counts in GiB, and the index of the GPU that
# this report covers.
BYTES_PER_GIB = 1024**3
GPU_INDEX = 0

# Force cleanup: collect unreachable Python objects first, then ask PyTorch to
# release the unused memory held by its CUDA caching allocator.
gc.collect()
torch.cuda.empty_cache()

# Check memory. The device queries below raise when no CUDA device is usable,
# so they are guarded and the cell prints a short notice instead of failing.
if torch.cuda.is_available():
    total_bytes = torch.cuda.get_device_properties(GPU_INDEX).total_memory
    allocated_bytes = torch.cuda.memory_allocated(GPU_INDEX)
    reserved_bytes = torch.cuda.memory_reserved(GPU_INDEX)

    print(f"GPU Memory available: {total_bytes / BYTES_PER_GIB:.2f} GB total")
    print(f"GPU Memory allocated: {allocated_bytes / BYTES_PER_GIB:.2f} GB")
    print(f"GPU Memory reserved: {reserved_bytes / BYTES_PER_GIB:.2f} GB")
else:
    print("CUDA is not available; skipping GPU memory report")

GPU Memory available: 94.97 GB total
GPU Memory allocated: 0.00 GB
GPU Memory reserved: 0.00 GB


## Load the model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Both the weights and the tokenizer are read from this local directory.
model_id = "./gpt-oss-120b/"

# Keyword options forwarded to the model's from_pretrained call.
# NOTE(review): trust_remote_code=True permits running Python code shipped in
# the checkpoint directory — confirm it is actually required for this model.
# NOTE(review): with device_map="auto" the recorded run printed that some
# parameters were offloaded to the CPU — verify that is acceptable.
model_load_options = dict(
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True,
    low_cpu_mem_usage=True,
)

print("Loading model with MXFP4 quantization...")
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=True)

print("✓ Model loaded successfully!")

# Report what is now allocated on GPU 0, converted from bytes to GiB.
allocated_gib = torch.cuda.memory_allocated(0) / 1024**3
print(f"GPU Memory allocated: {allocated_gib:.2f} GB")

Loading model with MXFP4 quantization...


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Loading tokenizer...
✓ Model loaded successfully!
GPU Memory allocated: 78.88 GB


## Run inference


In [None]:
from transformers import TextStreamer

# Single-turn conversation used as a smoke test for the loaded model.
messages = [
    {"role": "user", "content": "What is 2+2?"}
]

# Render the conversation to prompt text with the tokenizer's chat template
# (harmony format); add_generation_prompt=True ends the prompt where the
# assistant's reply should begin.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# The rendered template already carries the special tokens the prompt needs.
# Tokenize with add_special_tokens=False so the tokenizer cannot insert them a
# second time, then move the tensors to the model's device.
inputs = tokenizer(
    text,
    return_tensors="pt",
    add_special_tokens=False,
).to(model.device)

# Generate
print("Generating...")

# Stream decoded tokens to stdout as they are produced; the prompt itself and
# special tokens are omitted from the streamed text.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Greedy decoding (do_sample=False), capped at 256 new tokens.
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=False,
    streamer=streamer
)

# outputs[0] contains prompt + continuation; slice off the prompt tokens so
# only the newly generated part is decoded.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print("\nResponse:", response)

Generating...
analysisUser asks a simple 