# Lightning Ray

In this notebook, we perform a basic transformer classification task. 
The main purpose is exploration of PyTorch Lightning and Ray


Lets start with a simple smoke test. We will perform an inference baseline on this machine with nothing added on

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, time

MODEL_ID = "sshleifer/tiny-distilroberta-base" # Super small, OK on CPU
ds = load_dataset("glue", "sst2", split = "train[:200]") #small slice

tok = AutoTokenizer.from_pretrained(MODEL_ID)
batch = tok(list(ds["sentence"][:8]),
            padding = True,
            truncation = True, 
            max_length=128,
            return_tensors ="pt")

print("Tokenized shapes:", {k: tuple(v.shape) for k, v in batch.items()})

model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels =2)
model.eval()

with torch.inference_mode():
    _ = model(**batch) # warmup
    iters = 50
    t0 = time.time()
    for _ in range(iters):
        _ = model(**batch)
        dt = time.time() - t0
        bs = batch["input_ids"].shape[0]
        print(f"Average inference per batch {dt/iters*1000:.2f} ms (batch_size) {bs}")
        
print("Smoke Test Complete")

  from .autonotebook import tqdm as notebook_tqdm


Tokenized shapes: {'input_ids': (8, 33), 'attention_mask': (8, 33)}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sshleifer/tiny-distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average inference per batch 0.08 ms (batch_size) 8
Average inference per batch 0.16 ms (batch_size) 8
Average inference per batch 0.24 ms (batch_size) 8
Average inference per batch 0.30 ms (batch_size) 8
Average inference per batch 0.40 ms (batch_size) 8
Average inference per batch 0.52 ms (batch_size) 8
Average inference per batch 0.64 ms (batch_size) 8
Average inference per batch 0.78 ms (batch_size) 8
Average inference per batch 0.92 ms (batch_size) 8
Average inference per batch 1.06 ms (batch_size) 8
Average inference per batch 1.20 ms (batch_size) 8
Average inference per batch 1.32 ms (batch_size) 8
Average inference per batch 1.46 ms (batch_size) 8
Average inference per batch 1.59 ms (batch_size) 8
Average inference per batch 1.71 ms (batch_size) 8
Average inference per batch 1.77 ms (batch_size) 8
Average inference per batch 1.83 ms (batch_size) 8
Average inference per batch 1.89 ms (batch_size) 8
Average inference per batch 1.95 ms (batch_size) 8
Average inference per batch 1.9

Lets introduce some lightning elements

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
import pytorch_lightning as pl

MODEL_ID = "sshleifer/tiny-distilroberta-base"
MAX_LEN = 128


class SST2DataModule(pl.LightningDataModule):
    def __init__(self, model_id = MODEL_ID, batch_size =32,num_workers=0, pin_memory=False, persistent_workers=False):
        super().__init__()
        self.model_id = model_id
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.persistent_workers = persistent_workers
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.collate = DataCollatorWithPadding(self.tokenizer)
        
    def prepare_data(self):
        # download/cache only
        load_dataset("glue", "sst2")
        AutoTokenizer.from_pretrained(self.model_id)
        
    def setup(self, stage= None):
        ds_train = load_dataset("glue", "sst2", split="train[:1000]")
        ds_val = load_dataset("glue", "sst2", split="validation[:200]")
        
        def tok_fn(examples):
            t = self.tokenizer(
                examples["sentence"],
                truncation=True,
                max_length=MAX_LEN,
            )
            t["labels"] = examples["label"]  # copy labels -> 'labels'
            return t
        
        # batched tokenization
        ds_train = ds_train.map(tok_fn, batched=True,  remove_columns=ds_train.column_names)
        ds_val = ds_val.map(tok_fn, batched=True,  remove_columns=ds_val.column_names)    
        
        self.ds_train, self.ds_val = ds_train, ds_val
        
    def train_dataloader(self):
        return DataLoader(
            self.ds_train,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.collate,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            persistent_workers=self.persistent_workers if self.num_workers > 0 else False,
        )
        
    def val_dataloader(self):
        return DataLoader(
            self.ds_val,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=self.collate,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            persistent_workers=self.persistent_workers if self.num_workers > 0 else False,
        )
        
print("DataModule Defined")
        
        


DataModule Defined


In [3]:
#Now the lighnting module. This wraps the HF model
import torch
import torch.nn as nn
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification

class LitTinyClassifier(pl.LightningModule):
    def __init__(self, model_id=MODEL_ID, lr=5e-5):
        super().__init__()
        self.save_hyperparameters()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
        self.lr = lr
        
    def forward(self, **batch):
        return self.model(**batch)
    
    
    def training_step(self, batch, batch_idx):
        out = self(**batch)
        loss = out.loss
        
        #Quick accuracy sanity check
        preds = out.logits.argmax(dim=-1)
        acc = (preds == batch["labels"]).float().mean()
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        out = self(**batch)
        loss = out.loss
        preds = out.logits.argmax(dim=-1)
        acc = (preds == batch["labels"]).float().mean()
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)
    
print("Lightning Module Ready")
        

        
        
        



Lightning Module Ready


In [4]:
#Lets do just a single epoch of training

import pytorch_lightning as pl
import torch

pl.seed_everything(42, workers=True)

dm = SST2DataModule(
    model_id=MODEL_ID,
    batch_size=32,
    num_workers=2,          # start at 2 on Windows
    pin_memory=True,        # good for CUDA async H2D copies
    persistent_workers=True # avoid respawn cost each epoch
)
dm.prepare_data()
dm.setup()

model = LitTinyClassifier(model_id=MODEL_ID, lr = 5e-5)

precision = "bf16-mixed" if hasattr(torch.cuda, "is_available") and torch.cuda.is_available() else "32-true"

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",
    devices=1,
    precision=precision,
    log_every_n_steps=10,
)

trainer.fit(model, datamodule=dm)

Seed set to 42
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sshleifer/tiny-distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using bfloat16 Automatic Mixed Precision (AMP)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3050 6GB Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performan

                                                                           



Epoch 0: 100%|██████████| 32/32 [00:01<00:00, 19.61it/s, v_num=14, val_loss=0.693, val_acc=0.495, train_loss=0.693, train_acc=0.542]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 32/32 [00:01<00:00, 19.42it/s, v_num=14, val_loss=0.693, val_acc=0.495, train_loss=0.693, train_acc=0.542]


In [5]:
# Inference latency
import torch, numpy as np
import time

def _to_device(batch, device):
    return {k: v.to(device) for k, v in batch.items()}

def _should_sync(device: str) -> bool:
    return device.startswith("cuda") and torch.cuda.is_available()

def measure_latency(hf_model, batch, repeats = 200, warmup = 20, device ="cpu"):
    hf_model.eval().to(device)
    batch = _to_device(batch, device)
    
    # warmup (not timed)
    with torch.inference_mode():
        for _ in range(warmup):
            _ = hf_model(**batch)
    if _should_sync(device):
        torch.cuda.synchronize()
        
    # timed loop
    samples_ms = []
    with torch.inference_mode():
        for _ in range(repeats):
            t0 = time.perf_counter()
            _ = hf_model(**batch)
            if _should_sync(device):
                torch.cuda.synchronize()
            samples_ms.append((time.perf_counter() - t0) * 1000.0)

    samples_ms = np.asarray(samples_ms, dtype=float)
    return {
        "batch_size": int(batch["input_ids"].shape[0]),
        "mean_ms": float(samples_ms.mean()),
        "p50_ms": float(np.percentile(samples_ms, 50)),
        "p95_ms": float(np.percentile(samples_ms, 95)),
        "p99_ms": float(np.percentile(samples_ms, 99)),
        "repeats": int(repeats),
    }
    
# grab a validation batch
val_loader = dm.val_dataloader()
batch_val = next(iter(val_loader))

# bs=8
fp32_bs8 = measure_latency(model.model, batch_val, repeats=150, warmup=30, device="cpu")

# bs=1 (edge/onboard-ish)
single = {k: v[:1].clone() for k, v in batch_val.items()}
fp32_bs1 = measure_latency(model.model, single, repeats=300, warmup=50, device="cpu")

print("FP32 baseline (bs=8):", fp32_bs8)
print("FP32 baseline (bs=1):", fp32_bs1)

FP32 baseline (bs=8): {'batch_size': 32, 'mean_ms': 3.455183334493389, 'p50_ms': 3.353599982801825, 'p95_ms': 4.365645052166655, 'p99_ms': 5.521776984678577, 'repeats': 150}
FP32 baseline (bs=1): {'batch_size': 1, 'mean_ms': 1.7905913351569325, 'p50_ms': 1.812000060454011, 'p95_ms': 2.214680088218302, 'p99_ms': 2.840427967021241, 'repeats': 300}


In [6]:
import torch

qmodel = torch.quantization.quantize_dynamic(
    model.model, 
    {torch.nn.Linear},
    dtype=torch.qint8
)


q_bs8 = measure_latency(qmodel, batch_val, repeats=150, warmup=30, device="cpu")
q_bs1 = measure_latency(qmodel, single,   repeats=300, warmup=50, device="cpu")


print("INT8 quant (bs=8):", q_bs8)
print("INT8 quant (bs=1):", q_bs1)

# quick accuracy sanity on a few batches
def quick_accuracy(hf_model, loader, max_batches=10, device="cpu"):
    hf_model.eval().to(device)
    correct = total = 0
    with torch.inference_mode():
        for i, b in enumerate(loader):
            if i >= max_batches: break
            b = _to_device(b, device)
            out = hf_model(**b)
            preds = out.logits.argmax(dim=-1)
            correct += (preds == b["labels"]).sum().item()
            total   += preds.numel()
    return correct / total

acc_fp32 = quick_accuracy(model.model, dm.val_dataloader(), max_batches=10)
acc_int8 = quick_accuracy(qmodel,      dm.val_dataloader(), max_batches=10)
print(f"Quick val accuracy FP32: {acc_fp32:.3f} | INT8: {acc_int8:.3f}")

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  qmodel = torch.quantization.quantize_dynamic(


INT8 quant (bs=8): {'batch_size': 32, 'mean_ms': 6.670532658851395, 'p50_ms': 5.942899966612458, 'p95_ms': 11.41486995620653, 'p99_ms': 11.903988005360587, 'repeats': 150}
INT8 quant (bs=1): {'batch_size': 1, 'mean_ms': 3.0304009979590774, 'p50_ms': 2.9561500414274633, 'p95_ms': 3.862739959731698, 'p99_ms': 4.119967934675514, 'repeats': 300}
Quick val accuracy FP32: 0.495 | INT8: 0.495


In [7]:

# Cell 1 — CUDA-friendly DataModule (workers/pinning)
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
import pytorch_lightning as pl

# keep your existing MODEL_ID / MAX_LEN from before
# MODEL_ID = "..." 
# MAX_LEN = 128

class SST2DataModule(pl.LightningDataModule):
    def __init__(self, model_id=MODEL_ID, batch_size=32, num_workers=2, pin_memory=True, persistent_workers=True, prefetch_factor=2):
        super().__init__()
        self.model_id = model_id
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.persistent_workers = persistent_workers
        self.prefetch_factor = prefetch_factor
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.collate = DataCollatorWithPadding(self.tokenizer)

    def prepare_data(self):
        load_dataset("glue", "sst2")
        AutoTokenizer.from_pretrained(self.model_id)

    def setup(self, stage=None):
        ds_train = load_dataset("glue", "sst2", split="train[:1000]")
        ds_val   = load_dataset("glue", "sst2", split="validation[:200]")

        def tok_fn(batch):
            t = self.tokenizer(batch["sentence"], truncation=True, max_length=MAX_LEN)
            t["labels"] = batch["label"]
            return t

        ds_train = ds_train.map(tok_fn, batched=True, remove_columns=ds_train.column_names)
        ds_val   = ds_val.map(tok_fn,   batched=True, remove_columns=ds_val.column_names)
        self.ds_train, self.ds_val = ds_train, ds_val

    def _loader(self, ds, shuffle: bool):
        kw = dict(
            dataset=ds,
            batch_size=self.batch_size,
            shuffle=shuffle,
            collate_fn=self.collate,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            persistent_workers=self.persistent_workers if self.num_workers > 0 else False,
        )
        if self.num_workers > 0:
            kw["prefetch_factor"] = self.prefetch_factor
        return DataLoader(**kw)

    def train_dataloader(self):
        return self._loader(self.ds_train, shuffle=True)

    def val_dataloader(self):
        return self._loader(self.ds_val, shuffle=False)

print("DataModule (CUDA-ready) defined.")


DataModule (CUDA-ready) defined.


In [8]:
# Cell 2 — GPU training
import pytorch_lightning as pl, torch

pl.seed_everything(42, workers=True)

dm = SST2DataModule(model_id=MODEL_ID, batch_size=32, num_workers=2, pin_memory=True, persistent_workers=True)

dm.prepare_data(); dm.setup()

model = LitTinyClassifier(model_id=MODEL_ID, lr=5e-5)

can_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
precision = "bf16-mixed" if can_bf16 else ("16-mixed" if torch.cuda.is_available() else "32-true")

trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    precision=precision,
    log_every_n_steps=10,
)
trainer.fit(model, datamodule=dm)


Seed set to 42
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sshleifer/tiny-distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using bfloat16 Automatic Mixed Precision (AMP)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Map: 100%|██████████| 1000/1000 [00:00<00:00, 19980.30 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 4222.19 examples/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                             | Params | Mode


Epoch 0: 100%|██████████| 32/32 [00:01<00:00, 16.91it/s, v_num=15, val_loss=0.693, val_acc=0.495, train_loss=0.693, train_acc=0.542]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 32/32 [00:01<00:00, 16.78it/s, v_num=15, val_loss=0.693, val_acc=0.495, train_loss=0.693, train_acc=0.542]


In [9]:
# Cell 3 — CUDA latency
import time, numpy as np, torch

def _to_device(batch, device):
    return {k: v.to(device, non_blocking=True) for k, v in batch.items()}

def _should_sync(device: str) -> bool:
    return device.startswith("cuda") and torch.cuda.is_available()

def measure_latency(hf_model, batch, repeats=200, warmup=60, device="cuda", amp=False):
    hf_model.eval().to(device)
    batch = _to_device(batch, device)

    # warmup
    with torch.inference_mode():
        if amp and device.startswith("cuda"):
            with torch.autocast(device_type="cuda", dtype=(torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16)):
                for _ in range(warmup): _ = hf_model(**batch)
        else:
            for _ in range(warmup): _ = hf_model(**batch)
    if _should_sync(device): torch.cuda.synchronize()

    times = []
    with torch.inference_mode():
        for _ in range(repeats):
            t0 = time.perf_counter()
            if amp and device.startswith("cuda"):
                with torch.autocast(device_type="cuda", dtype=(torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16)):
                    _ = hf_model(**batch)
            else:
                _ = hf_model(**batch)
            if _should_sync(device): torch.cuda.synchronize()
            times.append((time.perf_counter() - t0) * 1000.0)

    arr = np.asarray(times, dtype=float)
    return {
        "batch_size": int(batch["input_ids"].shape[0]),
        "mean_ms": float(arr.mean()),
        "p50_ms": float(np.percentile(arr, 50)),
        "p95_ms": float(np.percentile(arr, 95)),
        "p99_ms": float(np.percentile(arr, 99)),
        "repeats": int(repeats),
        "amp": bool(amp),
        "device": device,
    }

# grab a val batch
val_loader = dm.val_dataloader()
batch_val = next(iter(val_loader))

# bs=8
fp32_cuda_bs8 = measure_latency(model.model, batch_val, repeats=200, warmup=80, device="cuda", amp=False)
amp_cuda_bs8  = measure_latency(model.model, batch_val, repeats=200, warmup=80, device="cuda", amp=True)

# bs=1
single = {k: v[:1].clone() for k, v in batch_val.items()}
fp32_cuda_bs1 = measure_latency(model.model, single, repeats=300, warmup=100, device="cuda", amp=False)
amp_cuda_bs1  = measure_latency(model.model, single, repeats=300, warmup=100, device="cuda", amp=True)

print("CUDA FP32 (bs=8):", fp32_cuda_bs8)
print("CUDA AMP  (bs=8):", amp_cuda_bs8)
print("CUDA FP32 (bs=1):", fp32_cuda_bs1)
print("CUDA AMP  (bs=1):", amp_cuda_bs1)


CUDA FP32 (bs=8): {'batch_size': 32, 'mean_ms': 2.531806997139938, 'p50_ms': 2.215099986642599, 'p95_ms': 4.532704927260055, 'p99_ms': 5.962219965877008, 'repeats': 200, 'amp': False, 'device': 'cuda'}
CUDA AMP  (bs=8): {'batch_size': 32, 'mean_ms': 3.429909997503273, 'p50_ms': 3.087949939072132, 'p95_ms': 5.425290024140846, 'p99_ms': 6.7342530004680095, 'repeats': 200, 'amp': True, 'device': 'cuda'}
CUDA FP32 (bs=1): {'batch_size': 1, 'mean_ms': 2.2018830000888556, 'p50_ms': 2.050450013484806, 'p95_ms': 3.317129996139556, 'p99_ms': 4.01725108618848, 'repeats': 300, 'amp': False, 'device': 'cuda'}
CUDA AMP  (bs=1): {'batch_size': 1, 'mean_ms': 3.0379619992648563, 'p50_ms': 2.707950014155358, 'p95_ms': 5.167229997459799, 'p99_ms': 6.680601987754925, 'repeats': 300, 'amp': True, 'device': 'cuda'}


In [10]:
import os, shutil, pytorch_lightning as pl, torch
from pytorch_lightning.profilers import PyTorchProfiler
from torch.profiler import ProfilerActivity, schedule, tensorboard_trace_handler

# clean old traces so we know we’re seeing fresh ones
shutil.rmtree("tb_traces", ignore_errors=True)
os.makedirs("tb_traces", exist_ok=True)

pl.seed_everything(42, workers=True)
dm = SST2DataModule(model_id=MODEL_ID, batch_size=32, num_workers=2, pin_memory=True, persistent_workers=True)
dm.prepare_data(); dm.setup()
model = LitTinyClassifier(model_id=MODEL_ID, lr=5e-5)

profiler = PyTorchProfiler(
    schedule=schedule(wait=1, warmup=1, active=8, repeat=1),   # short & guaranteed to run
    activities=[ProfilerActivity.CPU] + ([ProfilerActivity.CUDA] if torch.cuda.is_available() else []),
    on_trace_ready=tensorboard_trace_handler("tb_traces"),      # <-- write TensorBoard event files
    record_shapes=True,
    profile_memory=True,
)

trainer = pl.Trainer(
    max_epochs=1,
    limit_train_batches=0.7,     # ensure we surpass wait+warmup+active
    limit_val_batches=0,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    precision=("bf16-mixed" if (torch.cuda.is_available() and torch.cuda.is_bf16_supported())
              else ("16-mixed" if torch.cuda.is_available() else "32-true")),
    profiler=profiler,
    log_every_n_steps=10,
)
trainer.fit(model, datamodule=dm)

print("TB trace dir:", os.path.abspath("tb_traces"))
print("Contents:", os.listdir("tb_traces"))


Seed set to 42
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sshleifer/tiny-distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using bfloat16 Automatic Mixed Precision (AMP)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Map: 100%|██████████| 1000/1000 [00:00<00:00, 19173.44 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 7110.44 examples/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                             | Params | Mode


Epoch 0: 100%|██████████| 22/22 [00:02<00:00,  9.13it/s, v_num=16, train_loss=0.693, train_acc=0.548]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 22/22 [00:02<00:00,  9.04it/s, v_num=16, train_loss=0.693, train_acc=0.548]


FIT Profiler Report
Profile stats for: records
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us     124.543ms       277.43%     124.543ms      15.568ms        

TB trace dir: c:\Users\fiona\Documents\GitHub\Transformers\transformers-mini\tb_traces
Contents: ['fionan_45784.1757258988193973400.pt.trace.json']


In [14]:
%load_ext tensorboard
%tensorboard --logdir tb_traces



The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 43892), started 6:34:05 ago. (Use '!kill 43892' to kill it.)

In [18]:
# Replace 43892 with the PID shown in your message if different
!taskkill /PID 43892 /F  2> NUL
%reload_ext tensorboard
%tensorboard --logdir tb_traces --port 6007 --reload_interval 3


Reusing TensorBoard on port 6007 (pid 15696), started 0:00:12 ago. (Use '!kill 15696' to kill it.)

In [19]:
# Time to investigate ONNX runtime
import onnxruntime as ort, os, multiprocessing as mp
print("ORT version:", ort.__version__)
print("Available providers:", ort.get_available_providers())
print("CPU count:", mp.cpu_count())

ORT version: 1.22.1
Available providers: ['AzureExecutionProvider', 'CPUExecutionProvider']
CPU count: 20


In [22]:
import torch, time, numpy as np
import onnxruntime as ort
from copy import deepcopy

onnx_path = "tiny_roberta_sst2.onnx"
hf = deepcopy(model.model).eval().to("cpu")

val_loader = dm.val_dataloader()
batch_val = next(iter(val_loader))
batch_nolabel = {k: v for k, v in batch_val.items() if k != "labels"}

example = (batch_nolabel["input_ids"], batch_nolabel["attention_mask"])
dynamic_axes = {"input_ids": {0:"batch",1:"seq"},
                "attention_mask": {0:"batch",1:"seq"},
                "logits": {0:"batch"}}

with torch.inference_mode():
    torch.onnx.export(
        hf, example, onnx_path,
        input_names=["input_ids","attention_mask"],
        output_names=["logits"],
        dynamic_axes=dynamic_axes,
        opset_version=17, do_constant_folding=True
    )

so = ort.SessionOptions()
so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess = ort.InferenceSession(onnx_path, sess_options=so, providers=["CPUExecutionProvider"])

import numpy as np, time
def to_numpy(t): return t.detach().cpu().numpy()



def ort_latency(session, batch, repeats=200, warmup=20):
    inputs = {"input_ids": to_numpy(batch["input_ids"]),
              "attention_mask": to_numpy(batch["attention_mask"])}
    for _ in range(warmup): _ = session.run(["logits"], inputs)
    ts=[]
    for _ in range(repeats):
        t0 = time.perf_counter()
        _ = session.run(["logits"], inputs)
        ts.append((time.perf_counter()-t0)*1000)
    arr=np.asarray(ts,float)
    return {"bs": int(inputs["input_ids"].shape[0]),
            "mean_ms": float(arr.mean()),
            "p50_ms": float(np.percentile(arr,50)),
            "p95_ms": float(np.percentile(arr,95))}
    
# batch = full val batch; single = bs=1
single = {k: v[:1].clone() for k, v in batch_nolabel.items()}
print("ORT CPU batch:", ort_latency(sess, batch_nolabel))
print("ORT CPU bs=1 :", ort_latency(sess, single, repeats=300, warmup=50))

  torch.onnx.export(
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


ORT CPU batch: {'bs': 32, 'mean_ms': 2.7413950028130785, 'p50_ms': 2.685900020878762, 'p95_ms': 3.5613349929917604}
ORT CPU bs=1 : {'bs': 1, 'mean_ms': 0.29227467253804207, 'p50_ms': 0.27864996809512377, 'p95_ms': 0.3899000585079193}


In [23]:
# ONNX-3 — vary threads and see what happens
import onnxruntime as ort, numpy as np

def make_session(num_threads):
    so = ort.SessionOptions()
    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    so.intra_op_num_threads = num_threads   # math inside an op
    so.inter_op_num_threads = 1            # and across ops; keep 1 for latency
    return ort.InferenceSession(onnx_path, sess_options=so, providers=["CPUExecutionProvider"])

for n in [1, 2, 4, 8]:
    s = make_session(n)
    stats = ort_latency(s, single, repeats=300, warmup=50)
    print(f"threads={n} → bs=1 p50={stats['p50_ms']:.3f}ms, p95={stats['p95_ms']:.3f}ms")


threads=1 → bs=1 p50=0.103ms, p95=0.115ms
threads=2 → bs=1 p50=0.127ms, p95=0.147ms
threads=4 → bs=1 p50=0.141ms, p95=0.258ms
threads=8 → bs=1 p50=0.192ms, p95=0.317ms


In [25]:
# RAY-1: init & resources
import ray, platform, sys, os

ray.shutdown()
ray.init(ignore_reinit_error=True, include_dashboard=False)

print("Ray:", ray.__version__)
print("Python:", sys.version.split()[0], "| OS:", platform.platform())
print("Resources detected:", ray.cluster_resources())

# tiny sanity: put/get round-trip through Ray's object store
obj_ref = ray.put({"hello": "ray"})
print("Round-trip OK:", ray.get(obj_ref))


2025-09-07 18:33:55,108	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-09-07 18:33:57,510	INFO worker.py:1951 -- Started a local Ray instance.


Ray: 2.49.1
Python: 3.12.10 | OS: Windows-11-10.0.26100-SP0
Resources detected: {'CPU': 20.0, 'accelerator_type:G': 1.0, 'node:__internal_head__': 1.0, 'GPU': 1.0, 'node:127.0.0.1': 1.0, 'object_store_memory': 717299712.0, 'memory': 1673699328.0}
Round-trip OK: {'hello': 'ray'}


In [26]:
import ray, time, os
ray.shutdown(); ray.init(ignore_reinit_error=True, include_dashboard=False)

@ray.remote
def slow_square(x):
    time.sleep(0.2)         # simulate work
    return (x * x, os.getpid())

t0 = time.perf_counter()
futs = [slow_square.remote(i) for i in range(20)]
vals = ray.get(futs)
t1 = time.perf_counter()

print("First 5 results:", vals[:5])
print(f"Wall time for 20 × 0.2s tasks: {t1 - t0:.2f}s")


2025-09-07 18:35:42,305	INFO worker.py:1951 -- Started a local Ray instance.


First 5 results: [(0, 26980), (1, 12204), (4, 21172), (9, 28476), (16, 40692)]
Wall time for 20 × 0.2s tasks: 0.33s


In [27]:
import ray, torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

ray.shutdown(); ray.init(ignore_reinit_error=True)

@ray.remote(num_gpus=1)   # reserve your single GPU
class InferenceActor:
    def __init__(self, model_id, max_len=128):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tok = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2).to(self.device).eval()
        self.max_len = max_len

        # warmup once to trigger kernel autotuning/caches
        with torch.inference_mode():
            toks = self.tok(["warm up"], return_tensors="pt", padding=True, truncation=True, max_length=self.max_len).to(self.device)
            _ = self.model(**toks)

    def predict(self, sentences):
        with torch.inference_mode():
            toks = self.tok(sentences, return_tensors="pt", padding=True, truncation=True, max_length=self.max_len).to(self.device)
            logits = self.model(**toks).logits
            return logits.softmax(dim=-1).tolist()

actor = InferenceActor.remote(MODEL_ID)

# single call
probs = ray.get(actor.predict.remote(["this is great!", "this is terrible..."]))
print(probs)


2025-09-07 18:36:40,884	INFO worker.py:1951 -- Started a local Ray instance.
[36m(InferenceActor pid=40016)[0m Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sshleifer/tiny-distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
[36m(InferenceActor pid=40016)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[[0.5002333521842957, 0.49976664781570435], [0.5002333521842957, 0.49976664781570435]]


In [28]:
sentences = [["ok"], ["meh"], ["amazing!"], ["awful..."], ["fine"], ["great"], ["bad"]]
futs = [actor.predict.remote(s) for s in sentences]
print([ray.get(f) for f in futs])


[[[0.5002333521842957, 0.49976664781570435]], [[0.5002333521842957, 0.49976664781570435]], [[0.5002333521842957, 0.49976664781570435]], [[0.5002333521842957, 0.49976664781570435]], [[0.5002333521842957, 0.49976664781570435]], [[0.5002333521842957, 0.49976664781570435]], [[0.5002333521842957, 0.49976664781570435]]]


In [29]:
import ray, torch, pytorch_lightning as pl
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig

ray.shutdown(); ray.init(ignore_reinit_error=True)

def train_loop(config):
    import pytorch_lightning as pl, torch
    pl.seed_everything(42, workers=True)
    dm = SST2DataModule(model_id=config["model_id"], batch_size=32, num_workers=0, pin_memory=True, persistent_workers=False)
    dm.prepare_data(); dm.setup()
    model = LitTinyClassifier(model_id=config["model_id"], lr=5e-5)

    trainer = pl.Trainer(
        max_epochs=1,
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        devices=1,
        precision="16-mixed" if torch.cuda.is_available() else "32-true",
        log_every_n_steps=50,
    )
    trainer.fit(model, datamodule=dm)
    return {"val_acc": float(trainer.callback_metrics.get("val_acc", 0.0))}

tt = TorchTrainer(
    train_loop_per_worker=train_loop,
    scaling_config=ScalingConfig(num_workers=1, use_gpu=torch.cuda.is_available()),
    train_loop_config={"model_id": MODEL_ID},
)
result = tt.fit()
print("Ray Train result:", result)

ray.shutdown()


2025-09-07 18:37:07,605	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-09-07 18:37:07,709	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-09-07 18:37:09,977	INFO worker.py:1951 -- Started a local Ray instance.
2025-09-07 18:37:11,500	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-09-07 18:37:11 (running for 00:00:00.13)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/20 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/fiona/AppData/Local/Temp/ray/session_2025-09-07_18-37-08_270046_45784/artifacts/2025-09-07_18-37-11/TorchTrainer_2025-09-07_18-37-11/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-09-07 18:37:16 (running for 00:00:05.23)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/20 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/fiona/AppData/Local/Temp/ray/session_2025-09-07_18-37-08_270046_45784/artifacts/2025-09-07_18-37-11/TorchTrainer_2025-09-07_18-37-11/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-09-07 18:37:21 (running for 00:00:10.26)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/20 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/fiona/A

[36m(RayTrainWorker pid=11272)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=11272)[0m [W907 18:37:21.000000000 socket.cpp:755] [c10d] The client socket has failed to connect to [kubernetes.docker.internal]:51486 (system error: 10049 - The requested address is not valid in its context.).
2025-09-07 18:37:22,132	ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_48f84_00000
Traceback (most recent call last):
  File "c:\Users\fiona\Documents\GitHub\Transformers\transformers-mini\.venv312\Lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "c:\Users\fiona\Documents\GitHub\Transformers\transformers-mini\.venv312\Lib\site-packages\ray\_private\auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\fiona\Documents\GitHub\Transformers\transformers-m

== Status ==
Current time: 2025-09-07 18:37:22 (running for 00:00:10.61)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/20 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: C:/Users/fiona/AppData/Local/Temp/ray/session_2025-09-07_18-37-08_270046_45784/artifacts/2025-09-07_18-37-11/TorchTrainer_2025-09-07_18-37-11/driver_artifacts
Number of trials: 1/1 (1 ERROR)
Number of errored trials: 1
+--------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               |   # failures | error file                                                                                                                                                                                                              |
|--------------------------+--------------+----------------

TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("C:/Users/fiona/ray_results/TorchTrainer_2025-09-07_18-37-11")`.
To start a new run that will retry on training failures, set `train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.

In [30]:
import os, glob, pathlib, textwrap

run_dir = r"C:\Users\fiona\ray_results"  # base Ray results dir
# pick the most recent TorchTrainer run automatically:
cands = sorted(glob.glob(os.path.join(run_dir, "TorchTrainer_*")), key=os.path.getmtime)
print("Found runs:", cands[-3:])
trial_dir = cands[-1]
print("Using:", trial_dir)

# list interesting files
for p in sorted(glob.glob(os.path.join(trial_dir, "**"), recursive=True)):
    if any(s in p for s in (".log", ".err")):
        print(" -", p)

# print the last 200 lines of each .err / driver log
for p in sorted(glob.glob(os.path.join(trial_dir, "**", "*.err"), recursive=True)) + \
         sorted(glob.glob(os.path.join(trial_dir, "driver_*"))):
    print("\n==== tail:", p, "====")
    try:
        with open(p, "r", encoding="utf-8", errors="ignore") as f:
            tail = f.readlines()[-200:]
            print("".join(tail))
    except Exception as e:
        print("Could not read:", e)


Found runs: ['C:\\Users\\fiona\\ray_results\\TorchTrainer_2025-09-07_18-37-11']
Using: C:\Users\fiona\ray_results\TorchTrainer_2025-09-07_18-37-11


In [31]:
import ray, torch, pytorch_lightning as pl

ray.shutdown(); ray.init(ignore_reinit_error=True)

@ray.remote(num_gpus=1)  # reserve your single GPU
class LightningTrainActor:
    def __init__(self):
        import pytorch_lightning as pl
        pl.seed_everything(42, workers=True)

    def train_once(self, model_id, batch_size=32, precision=None):
        import pytorch_lightning as pl, torch, time

        # Data
        dm = SST2DataModule(model_id=model_id, batch_size=batch_size,
                            num_workers=0, pin_memory=True, persistent_workers=False)
        dm.prepare_data(); dm.setup()

        # Model
        model = LitTinyClassifier(model_id=model_id, lr=5e-5)

        # Precision
        if precision is None:
            precision = ("bf16-mixed" if (torch.cuda.is_available() and torch.cuda.is_bf16_supported())
                         else ("16-mixed" if torch.cuda.is_available() else "32-true"))

        trainer = pl.Trainer(
            max_epochs=1,
            accelerator="gpu" if torch.cuda.is_available() else "cpu",
            devices=1,
            precision=precision,
            log_every_n_steps=25,
        )

        t0 = time.perf_counter()
        trainer.fit(model, datamodule=dm)
        elapsed = time.perf_counter() - t0

        # capture a couple metrics
        val_acc = float(trainer.callback_metrics.get("val_acc", 0.0))
        val_loss = float(trainer.callback_metrics.get("val_loss", 0.0))
        return {"val_acc": val_acc, "val_loss": val_loss, "elapsed_s": elapsed, "precision": precision}

# spin up the actor and run training
actor = LightningTrainActor.remote()
result = ray.get(actor.train_once.remote(MODEL_ID, batch_size=32))
print("Ray actor training result:", result)

ray.shutdown()


2025-09-07 18:39:47,242	INFO worker.py:1951 -- Started a local Ray instance.
[36m(LightningTrainActor pid=21972)[0m Seed set to 42
Map: 100%|██████████| 1000/1000 [00:00<00:00, 20124.87 examples/s]
Map:   0%|          | 0/200 [00:00<?, ? examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 5076.90 examples/s]
[36m(LightningTrainActor pid=21972)[0m Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sshleifer/tiny-distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
[36m(LightningTrainActor pid=21972)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[36m(LightningTrainActor pid=21972)[0m Using bfloat16 Automatic Mixed Precision (AMP)
[36m(LightningTrainActor pid=21972)[0m 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.o

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(LightningTrainActor pid=21972)[0m c:\Users\fiona\Documents\GitHub\Transformers\transformers-mini\.venv312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 0/32 [00:00<?, ?it/s]                            
Epoch 0:  16%|█▌        | 5/32 [00:00<00:01, 21.52it/s, v_num=17]
Epoch 0:  34%|███▍      | 11/32 [00:00<00:00, 32.81it/s, v_num=17]
Epoch 0:  53%|█████▎    | 17/32 [00:00<00:00, 38.41it/s, v_num=17]
Epoch 0:  56%|█████▋    | 18/32 [00:00<00:00, 39.53it/s, v_num=17]
Epoch 0:  72%|███████▏  | 23/32 [00:00<00:00, 41.13it/s, v_num=17]
Epoch 0:  91%|█████████ | 29/32 [00:00<00:00, 43.23it/s, v_num=17]
Epoch 0: 100%|██████████| 32/32 [00:00<00:00, 44.04it/s, v_num=17]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/7 [00:00<?, ?it/s][A
Validation DataLoader 0:  14%|█▍        | 1/7 [00:00<00:00, 196.11it/s][A
Validation DataLoader 0:  29%|██▊       | 2/7 [00:00<00:00, 141.60it/s][A
Validation DataLoader 0:  43%|████▎     | 3/7 [00:00<00:00, 135.57it/s][A
Validation DataLoader 0:  57%|█████▋    | 4/7 [00:00<00:00, 130.94i