In [1]:
import pandas as pd
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

from datasets import load_dataset, load_metric, Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

import numpy as np
from typing import Dict
import torch
import numpy as np

import ray
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig
import ray.data
import ray.train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback
from ray import tune
from ray.tune import Tuner
from ray.tune.schedulers.async_hyperband import ASHAScheduler
from tqdm import tqdm
from sklearn.metrics import f1_score


# Constants

In [2]:
# Dataset file stems; the ".input0" / ".label" suffixes are appended when reading.
train_path, valid_path = "./data/train", "./data/dev"

# Classification setup.
num_labels = 3
metric_name = "accuracy"

# Pretrained checkpoint and the identifiers derived from it.
model_checkpoint = "distilbert-base-uncased"
model_name = model_checkpoint.rsplit("/", 1)[-1]  # keep only the part after the last "/"
validation_key = "validation"
task = actual_task = "cola"
name = "-".join((model_name, "finetuned", task))

# Training resources.
batch_size, num_workers = 16, 1
use_gpu = True

# Number of epochs given to each hyperparameter-tuning trial.
tune_epochs = 4


# Read data

In [3]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` without their "\\n" terminators.

    Only "\\n" is treated as a separator. A single trailing empty element
    (produced when the file ends with a newline) is removed; a final line
    that has no terminating newline is kept rather than silently dropped.
    """
    with open(path, 'r') as handle:
        lines = handle.read().split("\n")
    if lines and lines[-1] == "":
        lines.pop()
    return lines


def _load_split(stem):
    """Read ``stem + ".input0"`` (texts) and ``stem + ".label"`` (labels).

    Returns:
        A ``(texts, labels)`` tuple of equally long lists of strings.

    Raises:
        ValueError: if the two files do not contain the same number of lines,
            since texts and labels are paired by position.
    """
    texts = _read_lines(stem + ".input0")
    labels = _read_lines(stem + ".label")
    if len(texts) != len(labels):
        raise ValueError(
            f"{stem}: {len(texts)} texts but {len(labels)} labels"
        )
    return texts, labels


train_texts, train_labels = _load_split(train_path)
valid_texts, valid_labels = _load_split(valid_path)

In [4]:
# Training split: one row per example, with the labels parsed as integers.
df_train = (
    pd.DataFrame([train_texts, train_labels], index=["text", "label"])
    .T  # the two input lists are rows here; transpose so they become columns
    .astype({"label": int})
)
train_dataset = Dataset.from_pandas(df_train)

In [5]:
# Display the training Dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 29412
})

In [6]:
# Held-out split built from the valid_* lists: one row per example, integer labels.
df_test = (
    pd.DataFrame([valid_texts, valid_labels], index=["text", "label"])
    .T  # the two input lists are rows here; transpose so they become columns
    .astype({"label": int})
)
test_dataset = Dataset.from_pandas(df_test)

In [7]:
# Display the held-out Dataset summary (feature names and row count).
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 7353
})

# Ray training

In [8]:
# Convert both Hugging Face datasets to Ray Datasets, keyed by split name.
# The "validation" entry is backed by test_dataset.
ray_datasets = {
    split: ray.data.from_huggingface(hf_dataset)
    for split, hf_dataset in (
        ("train", train_dataset),
        ("validation", test_dataset),
    )
}
ray_datasets

2023-12-25 18:26:01,394	INFO worker.py:1724 -- Started a local Ray instance.


{'train': MaterializedDataset(
    num_blocks=1,
    num_rows=29412,
    schema={text: string, label: int64}
 ),
 'validation': MaterializedDataset(
    num_blocks=1,
    num_rows=7353,
    schema={text: string, label: int64}
 )}

In [9]:

# Tokenize input sentences
def collate_fn(examples: Dict[str, np.array]):
    """Tokenize a batch of texts and attach its labels as model-ready tensors.

    Uses the module-level ``tokenizer``, which must be defined before the
    first batch is collated.

    Args:
        examples: Batch mapping with a ``"text"`` entry (the sentences) and a
            ``"label"`` entry (their integer class ids).

    Returns:
        The tokenizer output, padded to the longest sequence in the batch,
        with an added ``"labels"`` tensor of dtype ``torch.long``. Tensors are
        moved to the GPU when CUDA is available and left on the CPU otherwise.
    """
    outputs = tokenizer(
        list(examples['text']),
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    outputs["labels"] = torch.as_tensor(examples["label"], dtype=torch.long)

    # Only move tensors to the GPU when one is usable, so CPU-only workers
    # (e.g. use_gpu=False) do not crash on an unconditional .cuda() call.
    if torch.cuda.is_available():
        for key, value in outputs.items():
            outputs[key] = value.cuda()
    return outputs

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Steps per epoch = training rows divided by the global batch size
# (per-device batch size times the number of training workers), rounded down.
train_row_count = ray_datasets["train"].count()
global_batch_size = batch_size * num_workers
max_steps_per_epoch = train_row_count // global_batch_size


def train_func(config):
    """Training loop handed to ``TorchTrainer``: fine-tune the checkpoint with a HF ``Trainer``.

    Args:
        config: Hyperparameter mapping. The keys read here are all optional:
            ``"learning_rate"`` (default 2e-5), ``"epochs"`` (default 2) and
            ``"weight_decay"`` (default 0.01).
    """
    print(f"Is CUDA available: {torch.cuda.is_available()}")

    glue_metric = load_metric("glue", actual_task)
    worker_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    # The shard names match the keys of the `datasets` mapping given to TorchTrainer.
    train_shard = ray.train.get_dataset_shard("train")
    eval_shard = ray.train.get_dataset_shard("eval")

    batch_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
    train_batches = train_shard.iter_torch_batches(**batch_kwargs)
    eval_batches = eval_shard.iter_torch_batches(**batch_kwargs)

    print("max_steps_per_epoch: ", max_steps_per_epoch)

    epochs = config.get("epochs", 2)
    training_args = TrainingArguments(
        output_dir=name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=config.get("learning_rate", 2e-5),
        num_train_epochs=epochs,
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        # Total step budget is given explicitly — presumably because the
        # streamed batches expose no length to the Trainer; TODO confirm.
        max_steps=max_steps_per_epoch * epochs,
        disable_tqdm=True,  # keep the worker logs compact
        no_cuda=not use_gpu,  # CPU training must be requested explicitly
        report_to="none",
    )

    def compute_metrics(eval_pred):
        """Turn raw model outputs into predictions and score them with the GLUE metric."""
        raw_outputs, references = eval_pred
        # For "stsb" the first output column is used directly; every other
        # task takes the index of the largest output per row.
        predicted = (
            raw_outputs[:, 0] if task == "stsb" else np.argmax(raw_outputs, axis=1)
        )
        return glue_metric.compute(predictions=predicted, references=references)

    hf_trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_batches,
        eval_dataset=eval_batches,
        tokenizer=worker_tokenizer,
        compute_metrics=compute_metrics,
    )

    # Register the Ray callback on the Trainer before preparing it for Ray Train.
    hf_trainer.add_callback(RayTrainReportCallback())
    hf_trainer = prepare_trainer(hf_trainer)

    print("Starting training")
    hf_trainer.train()

In [11]:

# Keep a single checkpoint, ranked by the lowest reported eval_loss.
train_checkpoint_config = CheckpointConfig(
    num_to_keep=1,
    checkpoint_score_attribute="eval_loss",
    checkpoint_score_order="min",
)

# Keys of this mapping are the shard names requested inside train_func.
trainer_datasets = {
    "train": ray_datasets["train"],
    "eval": ray_datasets["validation"],
}

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    datasets=trainer_datasets,
    run_config=RunConfig(checkpoint_config=train_checkpoint_config),
)

In [12]:
# Run one training job; train_func falls back to its default hyperparameters here
# because no train_loop_config is supplied to the TorchTrainer.
result = trainer.fit()

0,1
Current time:,2023-12-25 18:32:09
Running for:,00:06:04.37
Memory:,21.0/393.5 GiB

Trial name,status,loc,iter,total time (s),loss,learning_rate,epoch
TorchTrainer_10d8a_00000,TERMINATED,192.168.0.2:92340,2,359.401,0.6566,0,1.5


[36m(TorchTrainer pid=92340)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=92340)[0m - (ip=192.168.0.2, pid=92417) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=92417)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=92417)[0m Is CUDA available: True


[36m(SplitCoordinator pid=92472)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(RayTrainWorker pid=92417)[0m You can avoid this message in future by passing the argument `trust_remote_code=True`.
[36m(RayTrainWorker pid=92417)[0m Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


[36m(RayTrainWorker pid=92417)[0m max_steps_per_epoch:  1838


[36m(RayTrainWorker pid=92417)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
[36m(RayTrainWorker pid=92417)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[36m(RayTrainWorker pid=92417)[0m Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[36m(RayTrainWorker pid=92417)[0m Starting training


[36m(SplitCoordinator pid=92472)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=92472)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=92472)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=92472) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]



[36m(RayTrainWorker pid=92417)[0m {'loss': 0.7774, 'learning_rate': 9.99455930359086e-06, 'epoch': 0.5}


[36m(SplitCoordinator pid=92473)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(SplitCoordinator pid=92473)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=92473)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=92473)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=92473) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=92417)[0m {'eval_loss': 0.7320316433906555, 'eval_matthews_correlation': 0.39615303511604993, 'eval_runtime': 13.5295, 'eval_samples_per_second': 543.479, 'eval_steps_per_second': 34.0, 'epoch': 0.5}


[36m(RayTrainWorker pid=92417)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/TorchTrainer_2023-12-25_18-26-05/TorchTrainer_10d8a_00000_0_2023-12-25_18-26-05/checkpoint_000000)
[36m(SplitCoordinator pid=92472)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=92472)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=92472)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=92472) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=92417)[0m {'loss': 0.6566, 'learning_rate': 0.0, 'epoch': 1.5}


[36m(SplitCoordinator pid=92473)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=92473)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=92473)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=92473) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=92417)[0m {'eval_loss': 0.7379251718521118, 'eval_matthews_correlation': 0.4091076801455507, 'eval_runtime': 13.4609, 'eval_samples_per_second': 546.25, 'eval_steps_per_second': 34.173, 'epoch': 1.5}


[36m(RayTrainWorker pid=92417)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/TorchTrainer_2023-12-25_18-26-05/TorchTrainer_10d8a_00000_0_2023-12-25_18-26-05/checkpoint_000001)


[36m(RayTrainWorker pid=92417)[0m {'train_runtime': 352.7186, 'train_samples_per_second': 166.75, 'train_steps_per_second': 10.422, 'train_loss': 0.7170052844889145, 'epoch': 1.5}


2023-12-25 18:32:09,928	INFO tune.py:1042 -- Total run time: 364.40 seconds (364.37 seconds for the tuning loop).


# Tuning

In [13]:
# Grid over learning rates; every trial is given the same epoch budget.
tune_param_space = {
    "train_loop_config": {
        "learning_rate": tune.grid_search([2e-5, 2e-4, 2e-3, 2e-2]),
        "epochs": tune_epochs,
    }
}

# Minimise eval_loss, using ASHA as the trial scheduler.
tune_search_config = tune.TuneConfig(
    metric="eval_loss",
    mode="min",
    num_samples=1,
    scheduler=ASHAScheduler(
        max_t=tune_epochs,
    ),
)

# This RunConfig takes precedence over the one attached to `trainer`
# (Ray logs a notice about it when the Tuner is created).
tune_run_config = RunConfig(
    name="tune_transformers",
    checkpoint_config=CheckpointConfig(
        num_to_keep=1,
        checkpoint_score_attribute="eval_loss",
        checkpoint_score_order="min",
    ),
)

tuner = Tuner(
    trainer,
    param_space=tune_param_space,
    tune_config=tune_search_config,
    run_config=tune_run_config,
)

2023-12-25 18:32:09,938	INFO tuner_internal.py:401 -- A `RunConfig` was passed to both the `Tuner` and the `TorchTrainer`. The run config passed to the `Tuner` is the one that will be used.


In [14]:
# Run every trial of the learning-rate grid and collect the per-trial results.
tune_results = tuner.fit()


0,1
Current time:,2023-12-25 18:53:23
Running for:,00:21:13.89
Memory:,20.9/393.5 GiB

Trial name,status,loc,train_loop_config/learning_rate,iter,total time (s),loss,learning_rate,epoch
TorchTrainer_ea11d_00000,TERMINATED,192.168.0.2:97812,2e-05,4,712.982,0.4253,0.0,3.25
TorchTrainer_ea11d_00001,TERMINATED,192.168.0.2:108568,0.0002,1,183.064,0.9574,0.000149973,0.25
TorchTrainer_ea11d_00002,TERMINATED,192.168.0.2:111376,0.002,1,182.321,0.9892,0.00149973,0.25
TorchTrainer_ea11d_00003,TERMINATED,192.168.0.2:114275,0.02,1,180.285,1.1552,0.0149973,0.25


[36m(TorchTrainer pid=97812)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=97812)[0m - (ip=192.168.0.2, pid=97865) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=97865)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(SplitCoordinator pid=97919)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']


[36m(RayTrainWorker pid=97865)[0m Is CUDA available: True


[36m(RayTrainWorker pid=97865)[0m You can avoid this message in future by passing the argument `trust_remote_code=True`.
[36m(RayTrainWorker pid=97865)[0m Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[36m(RayTrainWorker pid=97865)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
[36m(RayTrainWorker pid=97865)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[36m(RayTrainWorker pid=97865)[0m Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[36m(RayTrainWorker pid=97865)[0m max_steps_per_epoch:  1838
[36m(RayTrainWorker pid=97865)[0m Starting training




(pid=97919) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=97919)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=97919)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=97919)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


[36m(RayTrainWorker pid=97865)[0m {'loss': 0.782, 'learning_rate': 1.4997279651795431e-05, 'epoch': 0.25}


[36m(SplitCoordinator pid=97920)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(SplitCoordinator pid=97920)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=97920)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=97920)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=97920) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=97865)[0m {'eval_loss': 0.7412927746772766, 'eval_matthews_correlation': 0.39369773685844595, 'eval_runtime': 13.4839, 'eval_samples_per_second': 545.315, 'eval_steps_per_second': 34.115, 'epoch': 0.25}


[36m(RayTrainWorker pid=97865)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/tune_transformers/TorchTrainer_ea11d_00000_0_learning_rate=0.0000_2023-12-25_18-32-09/checkpoint_000000)


(pid=97919) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=97919)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=97919)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=97919)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


[36m(RayTrainWorker pid=97865)[0m {'loss': 0.658, 'learning_rate': 9.99455930359086e-06, 'epoch': 1.25}


[36m(SplitCoordinator pid=97920)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=97920)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=97920)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=97920) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=97865)[0m {'eval_loss': 0.7539792060852051, 'eval_matthews_correlation': 0.406581779908781, 'eval_runtime': 13.5197, 'eval_samples_per_second': 543.873, 'eval_steps_per_second': 34.024, 'epoch': 1.25}


[36m(RayTrainWorker pid=97865)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/tune_transformers/TorchTrainer_ea11d_00000_0_learning_rate=0.0000_2023-12-25_18-32-09/checkpoint_000001)


(pid=97919) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=97919)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=97919)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=97919)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


[36m(RayTrainWorker pid=97865)[0m {'loss': 0.5275, 'learning_rate': 4.99183895538629e-06, 'epoch': 2.25}


[36m(SplitCoordinator pid=97920)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=97920)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=97920)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=97920) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=97865)[0m {'eval_loss': 0.8132387399673462, 'eval_matthews_correlation': 0.3989952703856063, 'eval_runtime': 13.4843, 'eval_samples_per_second': 545.302, 'eval_steps_per_second': 34.114, 'epoch': 2.25}


[36m(RayTrainWorker pid=97865)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/tune_transformers/TorchTrainer_ea11d_00000_0_learning_rate=0.0000_2023-12-25_18-32-09/checkpoint_000002)


(pid=97919) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=97919)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=97919)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=97919)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


[36m(RayTrainWorker pid=97865)[0m {'loss': 0.4253, 'learning_rate': 0.0, 'epoch': 3.25}


[36m(SplitCoordinator pid=97920)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=97920)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=97920)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=97920) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=97865)[0m {'eval_loss': 0.9307839870452881, 'eval_matthews_correlation': 0.38854276596384485, 'eval_runtime': 13.4802, 'eval_samples_per_second': 545.468, 'eval_steps_per_second': 34.124, 'epoch': 3.25}


[36m(RayTrainWorker pid=97865)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/tune_transformers/TorchTrainer_ea11d_00000_0_learning_rate=0.0000_2023-12-25_18-32-09/checkpoint_000003)


[36m(RayTrainWorker pid=97865)[0m {'train_runtime': 706.4031, 'train_samples_per_second': 166.522, 'train_steps_per_second': 10.408, 'train_loss': 0.5983102858650282, 'epoch': 3.25}


[36m(TorchTrainer pid=108568)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=108568)[0m - (ip=192.168.0.2, pid=108623) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=108623)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=108623)[0m Is CUDA available: True


[36m(SplitCoordinator pid=108677)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(RayTrainWorker pid=108623)[0m You can avoid this message in future by passing the argument `trust_remote_code=True`.
[36m(RayTrainWorker pid=108623)[0m Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


[36m(RayTrainWorker pid=108623)[0m max_steps_per_epoch:  1838


[36m(RayTrainWorker pid=108623)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
[36m(RayTrainWorker pid=108623)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[36m(RayTrainWorker pid=108623)[0m Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[36m(RayTrainWorker pid=108623)[0m Starting training


(pid=108676) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=108676)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=108676)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=108676)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


[36m(RayTrainWorker pid=108623)[0m {'loss': 0.9574, 'learning_rate': 0.0001499727965179543, 'epoch': 0.25}


[36m(SplitCoordinator pid=108676)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(SplitCoordinator pid=108677)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=108677)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=108677)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=108677) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=108623)[0m {'eval_loss': 0.8985300064086914, 'eval_matthews_correlation': 0.28063325591945343, 'eval_runtime': 13.3326, 'eval_samples_per_second': 551.506, 'eval_steps_per_second': 34.502, 'epoch': 0.25}


[36m(RayTrainWorker pid=108623)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/tune_transformers/TorchTrainer_ea11d_00001_1_learning_rate=0.0002_2023-12-25_18-32-09/checkpoint_000000)
[36m(TorchTrainer pid=111376)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=111376)[0m - (ip=192.168.0.2, pid=111430) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=111430)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=111430)[0m Is CUDA available: True


[36m(SplitCoordinator pid=111485)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(RayTrainWorker pid=111430)[0m You can avoid this message in future by passing the argument `trust_remote_code=True`.
[36m(RayTrainWorker pid=111430)[0m Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


[36m(RayTrainWorker pid=111430)[0m max_steps_per_epoch:  1838


[36m(RayTrainWorker pid=111430)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
[36m(RayTrainWorker pid=111430)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[36m(RayTrainWorker pid=111430)[0m Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[36m(RayTrainWorker pid=111430)[0m Starting training


(pid=111485) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=111485)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=111485)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=111485)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


[36m(RayTrainWorker pid=111430)[0m {'loss': 0.9892, 'learning_rate': 0.001499727965179543, 'epoch': 0.25}


[36m(SplitCoordinator pid=111486)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(SplitCoordinator pid=111486)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=111486)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=111486)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=111486) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=111430)[0m {'eval_loss': 0.9776955842971802, 'eval_matthews_correlation': 0.0, 'eval_runtime': 13.1467, 'eval_samples_per_second': 559.304, 'eval_steps_per_second': 34.99, 'epoch': 0.25}


[36m(RayTrainWorker pid=111430)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/tune_transformers/TorchTrainer_ea11d_00002_2_learning_rate=0.0020_2023-12-25_18-32-09/checkpoint_000000)
[36m(TorchTrainer pid=114275)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=114275)[0m - (ip=192.168.0.2, pid=114330) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=114330)[0m Setting up process group for: env:// [rank=0, world_size=1]


[36m(RayTrainWorker pid=114330)[0m Is CUDA available: True


[36m(SplitCoordinator pid=114385)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(RayTrainWorker pid=114330)[0m You can avoid this message in future by passing the argument `trust_remote_code=True`.
[36m(RayTrainWorker pid=114330)[0m Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[36m(RayTrainWorker pid=114330)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
[36m(RayTrainWorker pid=114330)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[36m(RayTrainWorker pid=114330)[0m Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is r

[36m(RayTrainWorker pid=114330)[0m max_steps_per_epoch:  1838
[36m(RayTrainWorker pid=114330)[0m Starting training


(pid=114384) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=114384)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=114384)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=114384)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


[36m(RayTrainWorker pid=114330)[0m {'loss': 1.1552, 'learning_rate': 0.01499727965179543, 'epoch': 0.25}


[36m(SplitCoordinator pid=114384)[0m Auto configuring locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a']
[36m(SplitCoordinator pid=114385)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=114385)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=1.0, gpu=1.0, object_store_memory=0.0), locality_with_output=['1a66715af743dabd9b80b3a893f9972e0380dae976dd72512d19bc2a'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=114385)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=114385) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(RayTrainWorker pid=114330)[0m {'eval_loss': 0.9774772524833679, 'eval_matthews_correlation': 0.0, 'eval_runtime': 13.1348, 'eval_samples_per_second': 559.81, 'eval_steps_per_second': 35.021, 'epoch': 0.25}


2023-12-25 18:53:23,875	INFO tune.py:1042 -- Total run time: 1273.91 seconds (1273.89 seconds for the tuning loop).
[36m(RayTrainWorker pid=114330)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/itmo/ray_results/tune_transformers/TorchTrainer_ea11d_00003_3_learning_rate=0.0200_2023-12-25_18-32-09/checkpoint_000000)


In [99]:
# Select the trial by validation loss instead of relying on trial order:
# scope="all" compares the lowest eval_loss each trial reported at any iteration.
best_result = tune_results.get_best_result(metric="eval_loss", mode="min", scope="all")

# Within that trial, prefer the retained checkpoint with the lowest eval_loss;
# fall back to the trial's latest checkpoint if none carries the metric.
best_checkpoint = best_result.get_best_checkpoint(metric="eval_loss", mode="min")
if best_checkpoint is None:
    best_checkpoint = best_result.checkpoint

# The model files are stored in the "checkpoint" sub-directory of the Ray checkpoint.
path = os.path.join(best_checkpoint.path, "checkpoint")
model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=num_labels).cuda()

In [100]:
# Predict one example at a time on the held-out split, collecting the
# predicted and reference class ids in parallel lists.
y_pred = []
y_true = []
with torch.no_grad():  # inference only: no gradients are needed
    for example in tqdm(test_dataset):
        # Let the tokenizer truncate to 512 tokens so over-long inputs keep
        # their closing special token (slicing the tensors afterwards cuts it
        # off) and match the truncation=True used when collating training batches.
        inputs = tokenizer(
            example['text'],
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        logits = model(
            input_ids=inputs['input_ids'].cuda(),
            attention_mask=inputs['attention_mask'].cuda(),
        ).logits
        # Arg-max over the class dimension (the batch holds a single example).
        prediction = logits.argmax(dim=-1).item()
        y_true.append(example['label'])
        y_pred.append(prediction)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7353/7353 [00:34<00:00, 212.56it/s]


In [101]:
# Per-class F1 (average=None returns one score per label).
# scikit-learn expects the reference labels first and the predictions second.
score = f1_score(y_true, y_pred, average=None)
score

array([0.72418478, 0.65909677, 0.36818409])

In [102]:
# Unweighted mean of the per-class F1 scores.
score.mean()

0.5838218829494224