In [5]:
import pandas as pd
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

from datasets import load_dataset, load_metric, Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

import numpy as np
from typing import Dict
import torch
import numpy as np

import ray
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig
import ray.data
import ray.train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback
from ray import tune
from ray.tune import Tuner
from ray.tune.schedulers.async_hyperband import ASHAScheduler
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss


# Constants

In [3]:
train_path = "./data/train"
valid_path = "./data/dev"
DATA_PATH = "./data/train.csv"


num_labels = 3
metric_name = "accuracy"

model_checkpoint = "roberta-base"
model_name = model_checkpoint.split("/")[-1]
validation_key = "validation"
task = "cola"

name = f"{model_name}-finetuned-{task}"
actual_task = task
batch_size = 16
num_workers = 1
use_gpu = True

tune_epochs = 4


# Read data

In [13]:
df = pd.read_csv(DATA_PATH)

X_train, X_test, y_train, y_test = train_test_split(df['discourse_text'], df['discourse_effectiveness'],
                                                        stratify=df['discourse_effectiveness'],
                                                        test_size=0.1,
                                                        random_state=42)

df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

df_train.columns = ["text", 'label']
df_test.columns = ["text", 'label']


label2id = {
    'Ineffective' : 2,
    'Effective' : 1,
    'Adequate' : 0
}

df_train['label'] = df_train['label'].apply(lambda x: label2id[x])
df_test['label'] = df_test['label'].apply(lambda x: label2id[x])

In [21]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [18]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 33088
})

In [22]:
test_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 3677
})

# Ray training

In [23]:
ray_datasets = {
    "train": ray.data.from_huggingface(train_dataset),
    "validation": ray.data.from_huggingface(test_dataset),
}
ray_datasets

2023-12-28 15:05:33,929	INFO worker.py:1724 -- Started a local Ray instance.


{'train': MaterializedDataset(
    num_blocks=1,
    num_rows=33088,
    schema={text: string, label: int64, __index_level_0__: int64}
 ),
 'validation': MaterializedDataset(
    num_blocks=1,
    num_rows=3677,
    schema={text: string, label: int64, __index_level_0__: int64}
 )}

[33m(raylet)[0m [2023-12-28 15:05:43,908 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93892296704; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:05:53,915 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93892272128; capacity: 2161779998720. Object creation will fail if spilling is required.


In [24]:

# Tokenize input sentences
def collate_fn(examples: Dict[str, np.array]):
    outputs =  tokenizer(list(examples['text']), truncation=True,  
                         padding="longest",
                         return_tensors="pt",)

    outputs["labels"] = torch.LongTensor(examples["label"])

    # Move all input tensors to GPU
    for key, value in outputs.items():
        outputs[key] = value.cuda()
    return outputs

In [25]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Calculate the maximum steps per epoch based on the number of rows in the training dataset.
# Make sure to scale by the total number of training workers and the per device batch size.
max_steps_per_epoch = ray_datasets["train"].count() // (batch_size * num_workers)


def train_func(config):
    print(f"Is CUDA available: {torch.cuda.is_available()}")

    metric = load_metric("glue", actual_task)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    train_ds = ray.train.get_dataset_shard("train")
    eval_ds = ray.train.get_dataset_shard("eval")

    train_ds_iterable = train_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )
    eval_ds_iterable = eval_ds.iter_torch_batches(
        batch_size=batch_size, collate_fn=collate_fn
    )

    print("max_steps_per_epoch: ", max_steps_per_epoch)

    args = TrainingArguments(
        name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=config.get("learning_rate", 2e-5),
        num_train_epochs=config.get("epochs", 2),
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        max_steps=max_steps_per_epoch * config.get("epochs", 2),
        disable_tqdm=True,  # declutter the output a little
        no_cuda=not use_gpu,  # you need to explicitly set no_cuda if you want CPUs
        report_to="none",
    )

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        if task != "stsb":
            predictions = np.argmax(predictions, axis=1)
        else:
            predictions = predictions[:, 0]
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model,
        args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.add_callback(RayTrainReportCallback())

    trainer = prepare_trainer(trainer)

    print("Starting training")
    trainer.train()

In [26]:

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    datasets={
        "train": ray_datasets["train"],
        "eval": ray_datasets["validation"],
    },
    run_config=RunConfig(
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
)

In [27]:
result = trainer.fit()

0,1
Current time:,2023-12-28 15:32:58
Running for:,00:26:54.68
Memory:,19.7/393.5 GiB

Trial name,status,loc,iter,total time (s),loss,learning_rate,epoch
TorchTrainer_9e468_00000,TERMINATED,192.168.0.2:317617,2,1609.43,0.6498,0,1.5


[33m(raylet)[0m [2023-12-28 15:06:03,921 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93890859008; capacity: 2161779998720. Object creation will fail if spilling is required.
[36m(TorchTrainer pid=317617)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=317617)[0m - (ip=192.168.0.2, pid=317669) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=317669)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(SplitCoordinator pid=317723)[0m Auto configuring locality_with_output=['9426e9cf25a53c01389f879d75db2e4bd3751fd3a3bd13251baf11f1']


[36m(RayTrainWorker pid=317669)[0m Is CUDA available: True


[36m(RayTrainWorker pid=317669)[0m You can avoid this message in future by passing the argument `trust_remote_code=True`.
[36m(RayTrainWorker pid=317669)[0m Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[33m(raylet)[0m [2023-12-28 15:06:13,933 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93890727936; capacity: 2161779998720. Object creation will fail if spilling is required.
[36m(RayTrainWorker pid=317669)[0m Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
[36m(RayTrainWorker pid=317669)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[36m(RayTrainWork

[36m(RayTrainWorker pid=317669)[0m max_steps_per_epoch:  2068
[36m(RayTrainWorker pid=317669)[0m Starting training


[36m(SplitCoordinator pid=317722)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=317722)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['9426e9cf25a53c01389f879d75db2e4bd3751fd3a3bd13251baf11f1'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=317722)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=317722) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=317722)[0m Auto configuring locality_with_output=['9426e9cf25a53c01389f879d75db2e4bd3751fd3a3bd13251baf11f1']
[33m(raylet)[0m [2023-12-28 15:06:23,953 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93890703360; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:06:33,965 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93890654208; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:06:43,974 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93890531328; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:06:53,987 E 3

[36m(RayTrainWorker pid=317669)[0m {'loss': 0.7725, 'learning_rate': 1e-05, 'epoch': 0.5}


[36m(SplitCoordinator pid=317723)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=317723)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['9426e9cf25a53c01389f879d75db2e4bd3751fd3a3bd13251baf11f1'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=317723)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=317723) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[33m(raylet)[0m [2023-12-28 15:19:14,596 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93886447616; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:19:24,602 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93886402560; capacity: 2161779998720. Object creation will fail if spilling is required.


[36m(RayTrainWorker pid=317669)[0m {'eval_loss': 0.7252287864685059, 'eval_matthews_correlation': 0.41824718545906114, 'eval_runtime': 27.2973, 'eval_samples_per_second': 134.702, 'eval_steps_per_second': 8.426, 'epoch': 0.5}


[33m(raylet)[0m [2023-12-28 15:19:34,613 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 93886210048; capacity: 2161779998720. Object creation will fail if spilling is required.
[36m(RayTrainWorker pid=317669)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/TorchTrainer_2023-12-28_15-06-03/TorchTrainer_9e468_00000_0_2023-12-28_15-06-03/checkpoint_000000)
[36m(SplitCoordinator pid=317722)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=317722)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['9426e9cf25a53c01389f879d75db2e4bd3751fd3a3bd13251baf11f1'], preserve_order=False, actor_locality_enabled=True, verbose_prog

(pid=317722) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[33m(raylet)[0m [2023-12-28 15:19:44,619 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 90887368704; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:19:54,625 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 90887352320; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:20:04,631 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 90887315456; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:20:14,637 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 9

[36m(RayTrainWorker pid=317669)[0m {'loss': 0.6498, 'learning_rate': 0.0, 'epoch': 1.5}


[36m(SplitCoordinator pid=317723)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=317723)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['9426e9cf25a53c01389f879d75db2e4bd3751fd3a3bd13251baf11f1'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=317723)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=317723) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[33m(raylet)[0m [2023-12-28 15:32:25,480 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 90883334144; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:32:35,494 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 90883133440; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:32:45,505 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 90883072000; capacity: 2161779998720. Object creation will fail if spilling is required.


[36m(RayTrainWorker pid=317669)[0m {'eval_loss': 0.719370424747467, 'eval_matthews_correlation': 0.4369429976748269, 'eval_runtime': 27.3603, 'eval_samples_per_second': 134.392, 'eval_steps_per_second': 8.406, 'epoch': 1.5}


[33m(raylet)[0m [2023-12-28 15:32:55,511 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 88513155072; capacity: 2161779998720. Object creation will fail if spilling is required.
[36m(RayTrainWorker pid=317669)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/TorchTrainer_2023-12-28_15-06-03/TorchTrainer_9e468_00000_0_2023-12-28_15-06-03/checkpoint_000001)


[36m(RayTrainWorker pid=317669)[0m {'train_runtime': 1600.892, 'train_samples_per_second': 41.337, 'train_steps_per_second': 2.584, 'train_loss': 0.7111581769164803, 'epoch': 1.5}


2023-12-28 15:32:58,127	INFO tune.py:1042 -- Total run time: 1614.70 seconds (1614.68 seconds for the tuning loop).
[33m(raylet)[0m [2023-12-28 15:33:05,518 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 89383641088; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:33:15,524 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 89383624704; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:33:25,530 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 89383612416; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 15:33:35,536 E 316406 316435] (raylet)

# Tuning

In [28]:
tuner = Tuner(
    trainer,
    param_space={
        "train_loop_config": {
            "learning_rate": tune.grid_search([2e-5, 2e-4, 2e-3, 2e-2]),
            "epochs": tune_epochs,
        }
    },
    tune_config=tune.TuneConfig(
        metric="eval_loss",
        mode="min",
        num_samples=1,
        scheduler=ASHAScheduler(
            max_t=tune_epochs,
        ),
    ),
    run_config=RunConfig(
        name="tune_transformers",
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
)

2023-12-28 15:41:11,344	INFO tuner_internal.py:401 -- A `RunConfig` was passed to both the `Tuner` and the `TorchTrainer`. The run config passed to the `Tuner` is the one that will be used.


In [29]:
tune_results = tuner.fit()


[33m(raylet)[0m [2023-12-28 16:57:49,765 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 72350732288; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 16:57:59,772 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 72350683136; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 16:58:09,789 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 72350629888; capacity: 2161779998720. Object creation will fail if spilling is required.


[36m(RayTrainWorker pid=353466)[0m {'loss': 2.2406, 'learning_rate': 0.015, 'epoch': 0.25}


(pid=353523) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=353523)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=353523)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['9426e9cf25a53c01389f879d75db2e4bd3751fd3a3bd13251baf11f1'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=353523)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[33m(raylet)[0m [2023-12-28 16:58:19,795 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 72350519296; capacity: 2161779998720. Object creation will fail if spilling is required.


[36m(RayTrainWorker pid=353466)[0m {'eval_loss': 1.1120679378509521, 'eval_matthews_correlation': 0.0, 'eval_runtime': 12.5505, 'eval_samples_per_second': 292.976, 'eval_steps_per_second': 18.326, 'epoch': 0.25}


2023-12-28 16:58:26,750	INFO tune.py:1042 -- Total run time: 4634.05 seconds (4634.03 seconds for the tuning loop).
[36m(RayTrainWorker pid=353466)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/tune_transformers/TorchTrainer_87803_00003_3_learning_rate=0.0200_2023-12-28_15-41-12/checkpoint_000000)


[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffacec6e0764498d4f64fc634701000000 Worker ID: d197c2893207730880d74acc554cb5f0932630821ec4b948f95461d6 Node ID: 9426e9cf25a53c01389f879d75db2e4bd3751fd3a3bd13251baf11f1 Worker IP address: 192.168.0.2 Worker port: 34379 Worker PID: 353523 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly by a signal. SystemExit is raised (sys.exit is called). Exit code: 1. The process receives a SIGTERM.


[33m(raylet)[0m [2023-12-28 16:58:29,802 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 68350930944; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 16:58:39,808 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 68350738432; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 16:58:49,815 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 68350889984; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 16:58:59,821 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 6

In [107]:
path = tune_results[0].checkpoint.path + "/checkpoint"
model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=3).cuda()

In [108]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0) # only difference
    
y_pred = []
y_true = []
for example in tqdm(test_dataset):
    with torch.no_grad():
        inputs = tokenizer(example['text'], return_tensors="pt")
        logits = model(input_ids=inputs['input_ids'].cuda()[:,:512], 
                      attention_mask= inputs['attention_mask'].cuda()[:,:512]).logits
        y_true.append(example['label'])
        y_pred.append(softmax(logits.cpu().numpy()[0]))

  9%|███████████▊                                                                                                                          | 325/3677 [00:02<00:27, 121.57it/s][33m(raylet)[0m [2023-12-28 17:32:31,083 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 68342657024; capacity: 2161779998720. Object creation will fail if spilling is required.
 42%|███████████████████████████████████████████████████████▊                                                                             | 1542/3677 [00:12<00:17, 120.91it/s][33m(raylet)[0m [2023-12-28 17:32:41,107 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 68342484992; capacity: 2161779998720. Object creation will fail if spilling is required.
 75%|███████████████████████████████████████████████████████████████████████████████████████████████████▎ 

In [110]:
log_loss(y_true, y_pred)



0.864925101959551

[33m(raylet)[0m [2023-12-28 18:14:32,826 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 65016422400; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 18:14:42,831 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 65016270848; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 18:14:52,836 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 65016262656; capacity: 2161779998720. Object creation will fail if spilling is required.
[33m(raylet)[0m [2023-12-28 18:15:02,842 E 316406 316435] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-28_15-05-32_192665_313140 is over 95% full, available space: 6