In [1]:
# Quietly (-q) install PyTorch, Hugging Face Transformers and Hidet via an IPython shell escape.
!pip -q install torch transformers hidet
# Import torch and print whether it can see a CUDA device.
import torch; print(torch.cuda.is_available())

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hTrue


In [2]:
# Colab cell 1 — installs
# Upgrade pip itself first (quietly), then install the two extra packages.
!pip -q install --upgrade pip
# Use Colab's PyTorch (usually fine). If you later hit a version clash, see the NOTE below.
# NOTE(review): no such NOTE is visible in this part of the notebook — confirm it exists or drop the pointer.
!pip -q install transformers hidet

# Environment report: Python version, PyTorch version, CUDA visibility, then the raw nvidia-smi table.
# NOTE(review): `platform` and `subprocess` are imported here but not used in this cell.
import torch, platform, sys, subprocess
print("Python:", sys.version)
print("PyTorch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
!nvidia-smi

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m74.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hPython: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch: 2.8.0+cu126 | CUDA available: True
Wed Nov 12 03:39:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |  

In [7]:
# Colab Cell 3 — partitioned Hidet, progress prints, no disable_cudagraphs arg

import os, time, torch, subprocess
from transformers import AutoTokenizer, AutoModelForMaskedLM

# ===== knobs =====
# Benchmark sizing: untimed warmup calls, timed calls, and the interval (in
# iterations) between progress lines.
WARMUP = 5
ITERS = 25
PRINT_EVERY = 5

# Hugging Face model id used for the masked-LM benchmark.
MODEL_ID = "google-bert/bert-base-multilingual-cased"

# Escape hatches / verbosity switches (all off by default):
#   SKIP_ATTENTION — set True to force BERT SelfAttention to eager
#   SKIP_LAYERNORM — set True to force LayerNorm to eager
#   VERBOSE_DYNAMO — set True for very chatty logs
SKIP_ATTENTION = False
SKIP_LAYERNORM = False
VERBOSE_DYNAMO = False
# =================

# Logging/debug environment variables for PyTorch's compiler stack.
# NOTE(review): torch is already imported above in this cell before these are
# set; if PyTorch reads them only at import time they may not take effect in
# this process — confirm.
if VERBOSE_DYNAMO:
    os.environ["TORCH_LOGS"] = "+dynamo,graph_breaks"
else:
    os.environ["TORCH_LOGS"] = "graph_breaks"
os.environ["TORCH_COMPILE_DEBUG"] = "1"

def log(msg):
    """Print ``msg`` to stdout and flush immediately."""
    print(msg, flush=True)

# ---- Hidet safe settings ----
# Import Hidet if it is installed and apply a set of conservative options.
# Each option is best-effort: a failure is reported through log() instead of
# being silently swallowed, and it never stops the remaining options from
# being tried. `use_hidet` ends up True whenever the import itself succeeded.
use_hidet = False
try:
    import hidet
except Exception as e:
    log(f"[warn] Hidet not available; will run eager only. Details: {e}")
else:
    # (label, thunk) pairs. The thunks defer attribute lookup so that an option
    # missing from the installed Hidet version is caught per setting.
    _hidet_safe_settings = (
        ("option.parallel_build(False)",
         lambda: hidet.option.parallel_build(False)),
        ("torch.dynamo_config.search_space(0)",
         lambda: hidet.torch.dynamo_config.search_space(0)),
        ("torch.dynamo_config.use_tensor_core(False)",
         lambda: hidet.torch.dynamo_config.use_tensor_core(False)),
        ("option.cache_dir('./hidet_cache')",
         lambda: hidet.option.cache_dir("./hidet_cache")),
    )
    _hidet_failed = []
    for _label, _apply in _hidet_safe_settings:
        try:
            _apply()
        except Exception as _err:
            # `Exception`, not a bare except: Ctrl-C / SystemExit still propagate.
            _hidet_failed.append(_label)
            log(f"[warn] Hidet setting {_label} not applied: {_err}")
    use_hidet = True
    if _hidet_failed:
        log(f"[info] Hidet imported — {len(_hidet_failed)} safe setting(s) could not be applied.")
    else:
        log("[info] Hidet imported — safe settings enabled.")

# ---- Partitioning + error suppression ----
import torch._dynamo as dynamo
# Don't crash on backend errors: with this flag set, Dynamo falls back to eager
# for code whose compilation fails.
dynamo.config.suppress_errors = True

# Disable cudagraphs via inductor config (works on this PyTorch).
# Best-effort: a failure is reported rather than silently ignored, so it is
# visible when the setting was not actually applied.
# NOTE(review): this is an Inductor option; whether it influences the Hidet
# backend used later is not shown in this notebook — confirm.
try:
    torch._inductor.config.triton.cudagraphs = False
except Exception as err:
    log(f"[warn] could not disable Inductor cudagraphs: {err}")

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
log(f"[info] device: {device}")

if device.type == "cuda":
    # Best-effort GPU summary (name and total memory); any nvidia-smi failure is ignored.
    gpu_query = ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"]
    try:
        gpu_summary = subprocess.check_output(gpu_query).decode().strip()
        log("[gpu]\n" + gpu_summary)
    except Exception:
        pass

def build_inputs(tokenizer, device):
    """Tokenize the fixed set of masked sentences and move the result to ``device``.

    Args:
        tokenizer: Callable invoked with the list of sentences plus
            ``return_tensors="pt"``, ``padding=True``, ``truncation=True`` and
            ``max_length=32``; its return value must provide ``.to(device)``.
        device: Target passed to ``.to(...)`` on the tokenizer output.

    Returns:
        Whatever ``tokenizer(...).to(device)`` returns.
    """
    masked_sentences = [
        "Paris is the [MASK] of France.",
        "París es la [MASK] de Francia.",
        "Paris ist die [MASK] von Frankreich.",
        "पेरिस [MASK] का राजधानी है.",
    ]
    encoded = tokenizer(
        masked_sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=32,
    )
    return encoded.to(device)

@torch.no_grad()
def run_with_progress(model, batch, warmup, iters, label):
    """Benchmark ``model(**batch)`` and return the mean wall-clock latency.

    Runs ``warmup`` untimed calls followed by ``iters`` timed calls. Progress
    is reported through the module-level ``log`` every ``PRINT_EVERY``
    iterations and on the last iteration of each phase. When CUDA is
    available, the device is synchronized before the timer starts and again
    before it is read. The whole function runs under ``torch.no_grad()``.

    Progress lines are emitted inside the timed loop, so their cost is part of
    the reported average.

    Args:
        model: Callable invoked as ``model(**batch)``.
        batch: Mapping unpacked into keyword arguments for ``model``.
        warmup: Number of untimed calls; ``0`` or less skips the warmup phase.
        iters: Number of timed calls; must be at least 1.
        label: Tag placed in square brackets at the start of every log line.

    Returns:
        Tuple ``(avg_ms, last)``: mean milliseconds per timed call and the
        value returned by the final timed call.

    Raises:
        ValueError: If ``iters`` is less than 1. This is checked up front so a
            bad argument does not surface as a ``ZeroDivisionError`` only
            after the warmup has already been run.
    """
    if iters < 1:
        raise ValueError(f"iters must be >= 1 to compute an average latency, got {iters}")

    def _should_report(done, total):
        # Always report the final step; a non-positive PRINT_EVERY disables the
        # periodic lines instead of raising on the modulo.
        return done == total or (PRINT_EVERY > 0 and done % PRINT_EVERY == 0)

    if warmup > 0:
        log(f"[{label}] warmup: {warmup} iters")
    for i in range(warmup):
        _ = model(**batch)
        if _should_report(i + 1, warmup):
            log(f"[{label}] warmup {i+1}/{warmup}")
    if torch.cuda.is_available(): torch.cuda.synchronize()

    log(f"[{label}] timed: {iters} iters")
    t0 = time.perf_counter()
    last = None
    for i in range(iters):
        last = model(**batch)
        if _should_report(i + 1, iters):
            log(f"[{label}] progress {i+1}/{iters} | elapsed {time.perf_counter()-t0:.2f}s")
    # Wait for queued GPU work so the elapsed time covers the actual execution.
    if torch.cuda.is_available(): torch.cuda.synchronize()
    avg_ms = (time.perf_counter() - t0) * 1000.0 / iters
    return avg_ms, last

def top_pred_tokens(logits, tokenizer, batch):
    """Decode the top-1 prediction at the first mask position of each row.

    Args:
        logits: Tensor indexed as ``logits[row, position]`` to obtain the
            per-token scores for that position.
        tokenizer: Object providing ``mask_token_id`` and
            ``decode(ids, skip_special_tokens=...)``.
        batch: Mapping whose ``"input_ids"`` entry holds the token ids for each
            row, aligned with ``logits``.

    Returns:
        A list with one string per row: the decoded highest-scoring token at
        the row's first mask position, or ``"<no-mask>"`` when the row contains
        no mask token.
    """
    mask_id = tokenizer.mask_token_id
    input_ids = batch["input_ids"]
    preds = []
    for row in range(logits.shape[0]):
        mask_positions = (input_ids[row] == mask_id).nonzero(as_tuple=True)[0]
        if mask_positions.numel() == 0:
            preds.append("<no-mask>")
            continue
        pos = int(mask_positions[0])   # only the first mask in the row is scored
        # Softmax is monotonic, so the argmax of the raw logits is the same
        # token; this avoids normalizing every position over the whole vocabulary.
        top_id = int(torch.argmax(logits[row, pos]))
        preds.append(tokenizer.decode([top_id], skip_special_tokens=True))
    return preds

# ----- Load model/tokenizer -----
# Both objects are loaded from MODEL_ID; the model is switched to eval mode
# and then moved to the selected device.
tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID)
model = model.eval()
model = model.to(device)

# Optional: bypass attention to eager
# When SKIP_ATTENTION is set, every BertSelfAttention module (including
# subclasses) gets an instance-level `forward` that is excluded from Dynamo
# tracing and simply delegates to the module's own class implementation.
if SKIP_ATTENTION:
    from transformers.models.bert.modeling_bert import BertSelfAttention
    import types

    @dynamo.disable
    def eager_self_attn_forward(self, *a, **k):
        """Run the module's class-level ``forward`` outside of Dynamo."""
        # Look `forward` up on the class: the instance attribute installed below
        # shadows it, so `self.forward` would recurse. This also avoids relying
        # on a `__wrapped__` attribute, which only exists when `forward` happens
        # to be decorated with a functools.wraps-style wrapper, and it respects
        # subclass overrides of `forward`.
        return type(self).forward(self, *a, **k)

    for _, m in model.named_modules():
        if isinstance(m, BertSelfAttention):
            m.forward = types.MethodType(eager_self_attn_forward, m)
    log("[info] BertSelfAttention forced to eager")

# Optional: bypass LayerNorm to eager
# When SKIP_LAYERNORM is set, every nn.LayerNorm module gets an instance-level
# `forward` that is excluded from Dynamo tracing and delegates to the module's
# own class implementation.
if SKIP_LAYERNORM:
    import torch.nn as nn
    import types

    @dynamo.disable
    def eager_ln_forward(self, *a, **k):
        """Run the module's class-level ``forward`` outside of Dynamo."""
        # `nn.LayerNorm.forward` is a plain function without a `__wrapped__`
        # attribute, so the class-level method is called directly. It is looked
        # up on the class because the instance attribute installed below
        # shadows it (`self.forward` would recurse).
        return type(self).forward(self, *a, **k)

    for _, m in model.named_modules():
        if isinstance(m, nn.LayerNorm):
            m.forward = types.MethodType(eager_ln_forward, m)
    log("[info] LayerNorm forced to eager")

# Tokenized benchmark inputs, shared by the eager and Hidet runs.
batch = build_inputs(tok, device)

# ----- Eager baseline -----
# Reference latency and outputs from the uncompiled model.
eager_ms, eager_out = run_with_progress(
    model=model,
    batch=batch,
    warmup=WARMUP,
    iters=ITERS,
    label="eager",
)
log(f"[eager] avg latency: {eager_ms:.2f} ms")

# ----- Hidet (partitioned) -----
# Compile the model with the "hidet" backend, benchmark it with the same inputs
# as the eager run, and compare top-1 mask predictions between the two.
# NOTE(review): `dynamo.config.suppress_errors` is enabled earlier in this cell,
# so backend failures do not raise here; the "hidet" latency may therefore
# partly or wholly reflect eager execution — check the Dynamo warnings in the
# cell output before trusting the comparison.
if not use_hidet or device.type != "cuda":
    log("[info] Skipping Hidet compile (no GPU or Hidet import failed).")
else:
    try:
        log("[hidet] compiling partitioned model (errors suppressed; unsupported parts run eager)…")
        compile_started = time.perf_counter()

        # fullgraph=False allows partitioning. No disable_cudagraphs argument is
        # passed; cudagraphs were already disabled via the inductor config above.
        compiled = torch.compile(model, backend="hidet", mode="default", fullgraph=False)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        compile_elapsed = time.perf_counter() - compile_started
        log(f"[hidet] compile (front-end) took: {compile_elapsed:.2f}s")

        hidet_ms, hidet_out = run_with_progress(compiled, batch, WARMUP, ITERS, "hidet")
        log(f"[hidet] avg latency: {hidet_ms:.2f} ms")

        # correctness check: top-1 token at the mask position, eager vs. hidet
        ep = top_pred_tokens(eager_out.logits, tok, batch)
        hp = top_pred_tokens(hidet_out.logits, tok, batch)
        log("\nTop-1 predictions (eager | hidet):")
        for row, (eager_tok, hidet_tok) in enumerate(zip(ep, hp)):
            verdict = '✓' if eager_tok == hidet_tok else '≠'
            log(f"  {row}: {eager_tok} | {hidet_tok} {verdict}")

    except Exception as ex:
        import traceback
        # Any failure in the compile/benchmark/compare steps lands here; print
        # follow-up suggestions and a one-line summary of the exception.
        for hint in (
            "\n[hidet] compile failed again — continuing with eager.",
            "Next options:",
            "  • Toggle SKIP_ATTENTION=True or SKIP_LAYERNORM=True above.",
            "  • Restart runtime or install Hidet nightly, then rerun:",
            "      !pip -q install --upgrade --pre --extra-index-url https://download.hidet.org/whl hidet",
            "\n[exception]",
        ):
            log(hint)
        log(''.join(traceback.format_exception_only(type(ex), ex)))

[info] Hidet imported — safe settings enabled.
[info] device: cuda
[gpu]
Tesla T4, 15360 MiB


Some weights of the model checkpoint at google-bert/bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[eager] warmup: 5 iters
[eager] warmup 5/5
[eager] timed: 25 iters
[eager] progress 5/25 | elapsed 0.14s
[eager] progress 10/25 | elapsed 0.23s
[eager] progress 15/25 | elapsed 0.32s
[eager] progress 20/25 | elapsed 0.49s
[eager] progress 25/25 | elapsed 0.60s
[eager] avg latency: 24.28 ms
[hidet] compiling partitioned model (errors suppressed; unsupported parts run eager)…
[hidet] compile (front-end) took: 0.00s
[hidet] warmup: 5 iters


Compiling cuda task [92mfused(y=float32(768,), x=float32(4, 10, 768), input=float32(768, 768), y=float32(48, 10, 64), fused_ops='reshape broadcast transpose2d broadcast batch_matmul reshape reshape add reshape rearrange reshape', anchor='batch_matmul')[0m...
Generating Hidet IR: 100%|██████████████████████| 1/1 [00:00<00:00, 6710.89it/s]
Applying fusion: 0it [00:00, ?it/s]
W1112 04:05:14.233000 353 torch/_dynamo/convert_frame.py:1339] WON'T CONVERT forward /usr/local/lib/python3.12/dist-packages/transformers/models/bert/modeling_bert.py line 1255 
W1112 04:05:14.233000 353 torch/_dynamo/convert_frame.py:1339] due to: 
W1112 04:05:14.233000 353 torch/_dynamo/convert_frame.py:1339] Traceback (most recent call last):
W1112 04:05:14.233000 353 torch/_dynamo/convert_frame.py:1339]   File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/convert_frame.py", line 1272, in __call__
W1112 04:05:14.233000 353 torch/_dynamo/convert_frame.py:1339]     result = self._inner_convert(
W1112 04:0

[hidet] warmup 5/5
[hidet] timed: 25 iters
[hidet] progress 5/25 | elapsed 0.10s
[hidet] progress 10/25 | elapsed 0.19s
[hidet] progress 15/25 | elapsed 0.30s
[hidet] progress 20/25 | elapsed 0.39s
[hidet] progress 25/25 | elapsed 0.48s
[hidet] avg latency: 19.45 ms

Top-1 predictions (eager | hidet):
  0: capital | capital ✓
  1: capital | capital ✓
  2: Hauptstadt | Hauptstadt ✓
  3: देश | देश ✓
