# Ray Train Tutorial Example
Source: <https://docs.ray.io/en/latest/train/examples/transformers/huggingface_text_classification.html>

In [2]:
from pprint import pprint
import logging
import ray

# Start a small local Ray instance with explicit CPU and memory limits.
# `ignore_reinit_error=True` keeps this cell re-runnable: without it, running
# the cell a second time in the same kernel raises because Ray is already
# initialized. The call stays the last expression so the notebook still
# displays the returned context (versions and dashboard URL).
ray.init(
    _memory=3 * 1024**3,               # 3 GB total usable memory
    object_store_memory=512 * 1024**2, # 512 MB for object store
    num_cpus=2,
    logging_level=logging.INFO,
    ignore_reinit_error=True,
)

  from .autonotebook import tqdm as notebook_tqdm
2025-06-16 20:42:30,108	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-06-16 20:42:33,861	INFO worker.py:1908 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.12.11
Ray version:,2.47.0
Dashboard:,http://127.0.0.1:8265


[36m(TorchTrainer pid=30734)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=30734)[0m - (node_id=f5899fab250745cd286f3d09b508679e1862b6b7c14c443791d6df17, ip=172.17.0.2, pid=30794) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=30794)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=30794)[0m Is CUDA available: False


Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 12.7MB/s]
[36m(RayTrainWorker pid=30794)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
[36m(RayTrainWorker pid=30794)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[36m(RayTrainWorker pid=30794)[0m max_steps_per_epoch:  534


In [3]:
# Show the resources Ray reports for the running cluster, as a sanity check
# against the limits requested in `ray.init` above.
pprint(ray.cluster_resources())

{'CPU': 2.0,
 'memory': 3221225472.0,
 'node:172.17.0.2': 1.0,
 'node:__internal_head__': 1.0,
 'object_store_memory': 536870912.0}


In [4]:
# Worker topology for the Ray Train run (consumed by ScalingConfig below).
num_workers = 1  # number of training workers, i.e. how many GPUs or CPUs to use
use_gpu = False  # False -> train on CPUs; True -> train on GPUs

In [5]:
# Names of the GLUE tasks this notebook knows how to configure
# ("mnli-mm" is the mismatched-validation variant of "mnli").
GLUE_TASKS = "cola mnli mnli-mm mrpc qnli qqp rte sst2 stsb wnli".split()

In [6]:
# Experiment selection: batch size per device, pretrained checkpoint to
# fine-tune, and the GLUE task to fine-tune it on.
batch_size = 16
model_checkpoint = "distilbert-base-uncased"
task = "cola"

## Load Dataset

In [7]:
from datasets import load_dataset

# "mnli-mm" is not loaded as its own GLUE config: it is served by the "mnli"
# config, so map it back before loading.
if task == "mnli-mm":
    actual_task = "mnli"
else:
    actual_task = task

# All splits of the selected GLUE task.
datasets = load_dataset("glue", actual_task)

## Process Dataset

In [8]:
from transformers import AutoTokenizer

# Tokenizer matching the chosen checkpoint; `collate_fn` below uses it to turn
# raw sentences into model inputs. `use_fast=True` requests the fast
# tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [9]:
# Column layouts shared by several GLUE tasks, as (first, second) text columns.
# A second entry of None means the task has a single input sentence.
_SINGLE_SENTENCE = ("sentence", None)
_PREMISE_HYPOTHESIS = ("premise", "hypothesis")
_SENTENCE_PAIR = ("sentence1", "sentence2")

# Maps each GLUE task to the dataset column(s) holding its input text.
task_to_keys = {
    "cola": _SINGLE_SENTENCE,
    "mnli": _PREMISE_HYPOTHESIS,
    "mnli-mm": _PREMISE_HYPOTHESIS,
    "mrpc": _SENTENCE_PAIR,
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": _SENTENCE_PAIR,
    "sst2": _SINGLE_SENTENCE,
    "stsb": _SENTENCE_PAIR,
    "wnli": _SENTENCE_PAIR,
}

In [10]:
import ray.data

# Convert every split of the Hugging Face DatasetDict into a Ray Dataset.
# Iterating over the splits that actually exist (instead of hard-coding
# "validation" and "test") yields the same three entries for CoLA, and avoids
# a KeyError for MNLI, whose splits are named "validation_matched",
# "validation_mismatched", "test_matched" and "test_mismatched".
ray_datasets = {
    split_name: ray.data.from_huggingface(split_dataset)
    for split_name, split_dataset in datasets.items()
}
ray_datasets

2025-06-16 20:42:50,360	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Parquet Files Sample 0: 100%|████████████████████| 1.00/1.00 [00:02<00:00, 2.29s/ file]
Parquet Files Sample 0: 100%|████████████████████| 1.00/1.00 [00:01<00:00, 1.63s/ file]
Parquet Files Sample 0: 100%|████████████████████| 1.00/1.00 [00:01<00:00, 1.67s/ file]


{'train': Dataset(num_rows=8551, schema={sentence: string, label: int64, idx: int32}),
 'validation': Dataset(num_rows=1043, schema={sentence: string, label: int64, idx: int32}),
 'test': Dataset(num_rows=1063, schema={sentence: string, label: int64, idx: int32})}

In [11]:
import numpy as np
import torch


# Target device for collated batches: the GPU when CUDA is usable in this
# process, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Tokenize input sentences
def collate_fn(examples: dict[str, np.ndarray]):
    """Tokenize one batch of GLUE rows and attach its labels.

    Args:
        examples: Column-oriented batch mapping column names to arrays. It must
            contain the text column(s) listed for ``task`` in ``task_to_keys``
            and a ``"label"`` column.

    Returns:
        The tokenizer output (PyTorch tensors, padded to the longest sequence
        in the batch) with an added ``"labels"`` tensor, all moved to ``device``.
    """
    sentence1_key, sentence2_key = task_to_keys[task]

    # Single-sentence tasks tokenize one column; sentence-pair tasks pass both
    # columns so the tokenizer encodes them as pairs.
    text_columns = [list(examples[sentence1_key])]
    if sentence2_key is not None:
        text_columns.append(list(examples[sentence2_key]))

    outputs = tokenizer(
        *text_columns,
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    # STS-B is a regression task whose targets are real-valued similarity
    # scores, which an integer tensor cannot represent. Every other task is
    # classification and needs int64 class indices.
    label_dtype = torch.float32 if task == "stsb" else torch.long
    outputs["labels"] = torch.as_tensor(examples["label"], dtype=label_dtype)

    # Move all tensors to CPU (or GPU if available).
    # NOTE(review): `device` is resolved once at module level, i.e. where this
    # cell runs, not per training worker -- confirm that is intended when the
    # driver and the workers have different hardware.
    for key, value in outputs.items():
        outputs[key] = value.to(device)

    return outputs

## Fine-tuning

In [12]:
import torch
import numpy as np

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate


import ray.train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback

# --- Settings derived from `task` and `model_checkpoint` ---

# Size of the classification head: 3 for the MNLI variants, 1 for STS-B
# (its predictions are used as raw scores rather than arg-maxed), else 2.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

# Name of the main metric for the task; anything not listed uses accuracy.
metric_name = {
    "stsb": "pearson",
    "cola": "matthews_correlation",
}.get(task, "accuracy")

# Last path component of the checkpoint id (drops any "org/" prefix).
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Split used for evaluation: MNLI has matched/mismatched validation sets.
validation_key = {
    "mnli-mm": "validation_mismatched",
    "mnli": "validation_matched",
}.get(task, "validation")

# Run name, also used as the Trainer output directory.
name = f"{model_name}-finetuned-{task}"

# Calculate the maximum steps per epoch based on the number of rows in the training dataset.
# Make sure to scale by the total number of training workers and the per device batch size.
train_row_count = ray_datasets["train"].count()
global_batch_size = batch_size * num_workers
max_steps_per_epoch = train_row_count // global_batch_size


def train_func(config):
    """Per-worker training loop run by Ray Train.

    Loads the GLUE metric, tokenizer and sequence-classification model, wraps
    this worker's Ray Data shards as batch iterables, and fine-tunes with the
    Hugging Face ``Trainer``. Metrics and checkpoints are reported back to Ray
    Train through ``RayTrainReportCallback``.

    Args:
        config: Hyperparameter dict. Recognized keys, all optional:
            ``learning_rate`` (default 2e-5), ``epochs`` (default 2) and
            ``weight_decay`` (default 0.01).
    """
    import inspect

    def pick_kwarg(target, preferred, legacy):
        """Return `preferred` if `target` accepts that keyword, else `legacy`."""
        accepted = inspect.signature(target).parameters
        return preferred if preferred in accepted else legacy

    print(f"Is CUDA available: {torch.cuda.is_available()}")

    metric = evaluate.load("glue", actual_task)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    # Each worker only sees its own shard of the datasets passed to TorchTrainer.
    train_ds = ray.train.get_dataset_shard("train")
    eval_ds = ray.train.get_dataset_shard("eval")

    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )
    eval_ds_iterable = eval_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )

    print("max_steps_per_epoch: ", max_steps_per_epoch)

    epochs = config.get("epochs", 2)

    training_kwargs = dict(
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=config.get("learning_rate", 2e-5),
        num_train_epochs=epochs,
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        # The batch iterables have no length, so the Trainer needs an explicit
        # total step count.
        max_steps=max_steps_per_epoch * epochs,
        disable_tqdm=True,  # declutter the output a little
        report_to="none",
    )
    # transformers renamed `evaluation_strategy` to `eval_strategy` and
    # `no_cuda` to `use_cpu`; recent releases reject or deprecate the old
    # spellings. Use whichever name the installed version accepts.
    eval_kw = pick_kwarg(TrainingArguments, "eval_strategy", "evaluation_strategy")
    cpu_kw = pick_kwarg(TrainingArguments, "use_cpu", "no_cuda")
    training_kwargs[eval_kw] = "epoch"
    training_kwargs[cpu_kw] = not use_gpu  # must be set explicitly to train on CPUs

    args = TrainingArguments(name, **training_kwargs)

    def compute_metrics(eval_pred):
        """Turn raw model outputs into the GLUE metric for the current task."""
        predictions, labels = eval_pred
        if task != "stsb":
            # Classification: take the highest-scoring class.
            predictions = np.argmax(predictions, axis=1)
        else:
            # Regression: single output column holds the predicted score.
            predictions = predictions[:, 0]
        return metric.compute(predictions=predictions, references=labels)

    # `tokenizer=` was superseded by `processing_class=` in newer transformers.
    tokenizer_kw = pick_kwarg(Trainer, "processing_class", "tokenizer")

    trainer = Trainer(
        model,
        args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        compute_metrics=compute_metrics,
        **{tokenizer_kw: tokenizer},
    )

    trainer.add_callback(RayTrainReportCallback())

    trainer = prepare_trainer(trainer)

    print("Starting training")
    trainer.train()

In [13]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

# Ray Train launcher: runs `train_func` on `num_workers` workers and feeds each
# one its shard of the "train" / "eval" datasets.
trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    datasets={
        "train": ray_datasets["train"],
        # Use the task-specific validation split computed above ("validation"
        # for most tasks, "validation_matched" / "validation_mismatched" for
        # MNLI) instead of hard-coding "validation"; `ray_datasets` must
        # contain that split.
        "eval": ray_datasets[validation_key],
    },
    run_config=RunConfig(
        # Keep only the single best checkpoint, ranked by lowest eval loss.
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
)

2025-06-16 20:43:13,665	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [14]:
# Launch the training run and wait for it to finish. If the run fails, this
# call raises (see the TrainingFailedError in the output below) and `result`
# is not assigned.
result = trainer.fit()

2025-06-16 20:43:13,740	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-06-16 20:43:13 (running for 00:00:00.11)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_20-42-30_121150_27857/artifacts/2025-06-16_20-43-13/TorchTrainer_2025-06-16_20-43-13/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-06-16 20:43:18 (running for 00:00:05.20)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_20-42-30_121150_27857/artifacts/2025-06-16_20-43-13/TorchTrainer_2025-06-16_20-43-13/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-06-16 20:43:24 (running for 00:00:10.27)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_20-42-30_121150_27857/artifacts/2025-06-16_20-43-13/TorchTrainer_2025-06-16_20-43-13/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== S

2025-06-16 20:43:31,037	ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_85e5e_00000
Traceback (most recent call last):
  File "/home/mluser/.local/lib/python3.12/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/mluser/.local/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/mluser/.local/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/mluser/.local/lib/python3.12/site-packages/ray/_private/worker.py", line 2849, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mluser/.local/lib/python

== Status ==
Current time: 2025-06-16 20:43:31 (running for 00:00:17.30)
Using FIFO scheduling algorithm.
Logical resource usage: 2.0/2 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-06-16_20-42-30_121150_27857/artifacts/2025-06-16_20-43-13/TorchTrainer_2025-06-16_20-43-13/driver_artifacts
Number of trials: 1/1 (1 ERROR)
Number of errored trials: 1
+--------------------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               |   # failures | error file                                                                                                                                                                                 |
|--------------------------+--------------+---------------------------------------------------------------------------------------------------------------------------------------

TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("/home/mluser/ray_results/TorchTrainer_2025-06-16_20-43-13")`.
To start a new run that will retry on training failures, set `train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.