In [1]:
import numpy as np
import pandas as pd
import os
import ray
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, default_data_collator
from datasets import load_dataset, load_metric
import evaluate
import ray.data
from ray.data.preprocessors import BatchMapper
from ray.train.huggingface import HuggingFaceTrainer
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.air.integrations.wandb import WandbLoggerCallback
import wandb

In [2]:
# Authenticate this kernel with Weights & Biases (prompts for an API key when
# one is requested) so the WandbLoggerCallback passed to the trainer can log runs.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/rj2665/.netrc


True

In [3]:
# Set before ray.init() is called further down, so the local Ray instance
# started by this notebook inherits it.
# NOTE(review): presumably this raises Ray's memory-monitor threshold to 98% of
# node memory -- confirm against the Ray documentation.
os.environ["RAY_memory_usage_threshold"] = "0.98"

In [4]:
# Experiment configuration shared by the cells below.
model_name = "distilroberta-base"  # Hugging Face checkpoint used for tokenizer and model

# Ray Train scaling knobs.
num_workers = 2        # number of training workers
cpus_per_worker = 1    # intended CPU count per training worker
use_gpu = True         # whether the training workers should use GPUs

In [5]:
# Packages requested through the `pip` field of Ray's runtime_env.
worker_pip_packages = [
    "datasets",
    "evaluate",
    "accelerate>=0.16.0",
    "transformers>=4.26.0",
    "torch>=1.12.0",
    "deepspeed",
]

# Start (or connect to) Ray with the runtime environment described above.
ray.init(runtime_env={"pip": worker_pip_packages})

2023-05-09 01:44:40,148	INFO worker.py:1550 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.7.12
Ray version:,2.3.1
Dashboard:,http://127.0.0.1:8265


In [6]:
# Load the IMDB dataset through Hugging Face `datasets` (served from the local
# cache when it is already present).
imdb_data = load_dataset("imdb")

Found cached dataset imdb (/home/rj2665/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Convert the Hugging Face dataset into Ray Datasets (one per split).
ray_imdb_data = ray.data.from_huggingface(imdb_data)

In [8]:
# Display the converted splits together with their row counts and schemas.
ray_imdb_data

{'train': Dataset(num_blocks=1, num_rows=25000, schema={text: string, label: int64}),
 'test': Dataset(num_blocks=1, num_rows=25000, schema={text: string, label: int64}),
 'unsupervised': Dataset(num_blocks=1, num_rows=50000, schema={text: string, label: int64})}

In [9]:
def tokenize(batch: pd.DataFrame) -> pd.DataFrame:
    """Tokenize the ``text`` column of a batch and append the encodings.

    Args:
        batch: A pandas batch that contains a ``text`` column. All existing
            columns are kept in the result.

    Returns:
        A new DataFrame holding the original columns plus one column per array
        returned by the tokenizer (``input_ids``, ``attention_mask`` and any
        additional outputs the tokenizer of ``model_name`` produces). Each cell
        of those columns is a plain Python list of ints.
    """
    # The tokenizer is instantiated on every call; `model_name` is read from
    # the notebook's configuration cell.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # NOTE(review): `padding=True` pads only to the longest sequence of *this*
    # batch, so different batches can end up with different widths. The trainer
    # cell uses `default_data_collator`, which presumably does not pad -- confirm
    # that differently padded rows are never collated together, or consider
    # `padding="max_length"`.
    encoded = tokenizer(
        list(batch["text"]),
        padding=True,
        truncation=True,
        return_tensors="np",
    )

    # Convert *every* returned array to nested lists, not just `input_ids` and
    # `attention_mask`: a 2-D numpy array cannot be used as a DataFrame column,
    # so tokenizers that also return e.g. `token_type_ids` would otherwise fail.
    encoded_columns = {name: values.tolist() for name, values in encoded.items()}

    # Tokenizer outputs are merged last so they win on a column-name clash.
    return pd.DataFrame.from_dict({**batch, **encoded_columns})

In [10]:
# Wrap `tokenize` in a BatchMapper that receives pandas batches.
# NOTE: despite its name, `tokenizer` here is the BatchMapper preprocessor that
# is later passed to the trainer as `preprocessor=`, not a Hugging Face tokenizer.
tokenizer = BatchMapper(tokenize, batch_format="pandas")

In [11]:
# Per-device batch size used for both training and evaluation.
batch_size = 16


def trainer_init_per_worker(train_dataset, eval_dataset=None, **config):
    """Build the Hugging Face ``Trainer`` for one training worker.

    Args:
        train_dataset: Training dataset forwarded to ``Trainer``.
        eval_dataset: Optional evaluation dataset forwarded to ``Trainer``.
        **config: Optional hyperparameters. Recognised keys and their defaults:
            ``learning_rate`` (2e-5), ``epochs`` (2), ``weight_decay`` (0.01).

    Returns:
        A ``transformers.Trainer`` configured for binary sequence
        classification with ``model_name`` and the DeepSpeed settings below.
    """
    print(f"Is CUDA available: {torch.cuda.is_available()}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Allow TF32 for CUDA matmuls.
    torch.backends.cuda.matmul.allow_tf32 = True

    # DeepSpeed configuration: ZeRO stage 3 with optimizer state and parameters
    # offloaded to CPU. Values given as "auto" are left for the Hugging Face
    # DeepSpeed integration to fill in.
    deepspeed = {
        "fp16": {
            "enabled": "auto",
            "initial_scale_power": 8,
        },
        "bf16": {"enabled": "auto"},
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            },
        },
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True,
            },
            "offload_param": {
                "device": "cpu",
                "pin_memory": True,
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
            "gather_16bit_weights_on_model_save": True,
            "round_robin_gradients": True,
        },
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 10,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False,
    }

    args = TrainingArguments(
        "ray-bert-finetune-imdb",
        save_strategy="no",
        logging_steps=1,
        learning_rate=config.get("learning_rate", 2e-5),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        # `label_names` must list the keys that hold the *labels*. It used to
        # name the model inputs (`input_ids`, `attention_mask`), which would
        # make evaluation compare predictions against token ids.
        # `default_data_collator` exposes the dataset's `label` column to the
        # model as `labels`.
        label_names=["labels"],
        num_train_epochs=config.get("epochs", 2),
        weight_decay=config.get("weight_decay", 0.01),
        fp16=True,
        deepspeed=deepspeed,
        push_to_hub=False,
        disable_tqdm=True,  # declutter the output a little
        # no_cuda=not use_gpu,  # you need to explicitly set no_cuda if you want CPUs
    )

    def compute_metrics(eval_pred):
        """Return accuracy and F1 for a ``(logits, labels)`` evaluation pair."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # `evaluate.load` replaces the deprecated `datasets.load_metric`; the
        # metrics are still loaded lazily, only when an evaluation runs.
        accuracy_metric = evaluate.load("accuracy")
        f1_metric = evaluate.load("f1")
        accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
        f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
        return {"accuracy": accuracy, "f1": f1}

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=default_data_collator,
    )

    print("Starting training")
    return trainer

In [12]:
# Derive the per-worker resources from the configuration cell instead of
# hard-coding them, so that `cpus_per_worker` is honoured and a GPU is only
# requested when `use_gpu` is enabled (requesting a GPU while `use_gpu=False`
# is contradictory).
resources_per_worker = {"CPU": cpus_per_worker}
if use_gpu:
    resources_per_worker["GPU"] = 1

trainer = HuggingFaceTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    scaling_config=ScalingConfig(
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker,
    ),
    datasets={
        "train": ray_imdb_data["train"],
        "evaluation": ray_imdb_data["test"],
    },
    run_config=RunConfig(
        # Alternative logger:
        # callbacks=[MLflowLoggerCallback(experiment_name="ray-roberta-finetune-imdb")],
        callbacks=[WandbLoggerCallback(project="ray-roberta-finetune-imdb")],
        # Keep a single checkpoint, ranked by the lowest reported `loss`.
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="loss",
            checkpoint_score_order="min",
        ),
    ),
    # `tokenizer` is the BatchMapper preprocessor defined earlier in the notebook.
    preprocessor=tokenizer,
)

In [13]:
# Run the fine-tuning job defined by `trainer` and keep the returned result object.
result = trainer.fit()


0,1
Current time:,2023-05-09 02:01:43
Running for:,00:16:28.79
Memory:,11.8/14.7 GiB

Trial name,# failures,error file
HuggingFaceTrainer_24ef3_00000,1,/home/rj2665/ray_results/HuggingFaceTrainer_2023-05-09_01-45-14/HuggingFaceTrainer_24ef3_00000_0_2023-05-09_01-45-15/error.txt

Trial name,status,loc,iter,total time (s),loss,learning_rate,epoch
HuggingFaceTrainer_24ef3_00000,ERROR,10.158.0.3:12722,1563,971.779,0.3112,1.27877e-08,1.99872


2023-05-09 01:45:15,092	INFO wandb.py:314 -- Already logged into W&B.
[2m[36m(_WandbLoggingActor pid=12725)[0m wandb: Currently logged in as: rj2665 (hpml3). Use `wandb login --relogin` to force relogin
[2m[36m(HuggingFaceTrainer pid=12722)[0m 2023-05-09 01:45:24,146	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[randomize_block_order]
[2m[36m(_WandbLoggingActor pid=12725)[0m wandb: Tracking run with wandb version 0.15.2
[2m[36m(_WandbLoggingActor pid=12725)[0m wandb: Run data is saved locally in /home/rj2665/ray_results/HuggingFaceTrainer_2023-05-09_01-45-14/HuggingFaceTrainer_24ef3_00000_0_2023-05-09_01-45-15/wandb/run-20230509_014518-24ef3_00000
[2m[36m(_WandbLoggingActor pid=12725)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(_WandbLoggingActor pid=12725)[0m wandb: Syncing run HuggingFaceTrainer_24ef3_00000
[2m[36m(_WandbLoggingActor pid=12725)[0m wandb: ⭐️ View project at h

[2m[36m(RayTrainWorker pid=13733)[0m Is CUDA available: True
[2m[36m(RayTrainWorker pid=13732)[0m Is CUDA available: True


[2m[36m(RayTrainWorker pid=13732)[0m Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
[2m[36m(RayTrainWorker pid=13732)[0m - This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(RayTrainWorker pid=13732)[0m - This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(RayTrainWorker pid=13732)[0m Some weight

[2m[36m(RayTrainWorker pid=13733)[0m Starting training
[2m[36m(RayTrainWorker pid=13732)[0m Starting training
[2m[36m(RayTrainWorker pid=13733)[0m Installed CUDA version 11.3 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
[2m[36m(RayTrainWorker pid=13732)[0m Installed CUDA version 11.3 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination


[2m[36m(RayTrainWorker pid=13732)[0m Using /home/rj2665/.cache/torch_extensions/py37_cu117 as PyTorch extensions root...
[2m[36m(RayTrainWorker pid=13733)[0m Using /home/rj2665/.cache/torch_extensions/py37_cu117 as PyTorch extensions root...
[2m[36m(RayTrainWorker pid=13732)[0m Detected CUDA files, patching ldflags
[2m[36m(RayTrainWorker pid=13732)[0m Emitting ninja build file /home/rj2665/.cache/torch_extensions/py37_cu117/cpu_adam/build.ninja...
[2m[36m(RayTrainWorker pid=13732)[0m Building extension module cpu_adam...
[2m[36m(RayTrainWorker pid=13732)[0m Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[2m[36m(RayTrainWorker pid=13733)[0m Installed CUDA version 11.3 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
[2m[36m(RayTrainWorker pid=13732)[0m Installed CUDA version 11.3 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination


[2m[36m(RayTrainWorker pid=13732)[0m Loading extension module cpu_adam...
[2m[36m(RayTrainWorker pid=13733)[0m Loading extension module cpu_adam...


[2m[36m(RayTrainWorker pid=13732)[0m ninja: no work to do.
[2m[36m(RayTrainWorker pid=13732)[0m Time to load cpu_adam op: 2.625136137008667 seconds
[2m[36m(RayTrainWorker pid=13733)[0m Time to load cpu_adam op: 2.715188503265381 seconds


[2m[36m(RayTrainWorker pid=13733)[0m Using /home/rj2665/.cache/torch_extensions/py37_cu117 as PyTorch extensions root...
[2m[36m(RayTrainWorker pid=13732)[0m Using /home/rj2665/.cache/torch_extensions/py37_cu117 as PyTorch extensions root...


[2m[36m(RayTrainWorker pid=13733)[0m ninja: no work to do.
[2m[36m(RayTrainWorker pid=13733)[0m Time to load utils op: 0.23445630073547363 seconds
[2m[36m(RayTrainWorker pid=13732)[0m Time to load utils op: 0.2026958465576172 seconds


[2m[36m(RayTrainWorker pid=13733)[0m Emitting ninja build file /home/rj2665/.cache/torch_extensions/py37_cu117/utils/build.ninja...
[2m[36m(RayTrainWorker pid=13733)[0m Building extension module utils...
[2m[36m(RayTrainWorker pid=13733)[0m Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[2m[36m(RayTrainWorker pid=13733)[0m Loading extension module utils...
[2m[36m(RayTrainWorker pid=13732)[0m Loading extension module utils...


[2m[36m(RayTrainWorker pid=13732)[0m Parameter Offload: Total persistent parameters: 64514 in 66 params
[2m[36m(RayTrainWorker pid=13733)[0m Time to load utils op: 0.0005412101745605469 seconds


[2m[36m(RayTrainWorker pid=13733)[0m Using /home/rj2665/.cache/torch_extensions/py37_cu117 as PyTorch extensions root...
[2m[36m(RayTrainWorker pid=13733)[0m No modifications detected for re-loaded extension module utils, skipping build step...
[2m[36m(RayTrainWorker pid=13733)[0m Loading extension module utils...
[2m[36m(RayTrainWorker pid=13732)[0m Using /home/rj2665/.cache/torch_extensions/py37_cu117 as PyTorch extensions root...
[2m[36m(RayTrainWorker pid=13732)[0m No modifications detected for re-loaded extension module utils, skipping build step...
[2m[36m(RayTrainWorker pid=13732)[0m Loading extension module utils...


[2m[36m(RayTrainWorker pid=13732)[0m Time to load utils op: 0.0003960132598876953 seconds
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.7249, 'learning_rate': 1.9987212276214835e-05, 'epoch': 0.0}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.709, 'learning_rate': 1.997442455242967e-05, 'epoch': 0.0}


Trial name,_time_this_iter_s,_timestamp,_training_iteration,date,done,episodes_total,epoch,experiment_id,experiment_tag,hostname,iterations_since_restore,learning_rate,loss,node_ip,pid,step,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
HuggingFaceTrainer_24ef3_00000,0.530241,1683597695,1563,2023-05-09_02-01-35,False,,1.99872,b22c34d8cad341c48ff258428f7beab3,0,instance-7,1563,1.27877e-08,0.3112,10.158.0.3,12722,1563,971.779,0.53044,971.779,1683597695,0,,1563,24ef3_00000,0.010596




[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.7117, 'learning_rate': 1.9961636828644503e-05, 'epoch': 0.0}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.6987, 'learning_rate': 1.9948849104859337e-05, 'epoch': 0.01}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.7009, 'learning_rate': 1.993606138107417e-05, 'epoch': 0.01}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.6948, 'learning_rate': 1.9923273657289004e-05, 'epoch': 0.01}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.6909, 'learning_rate': 1.9910485933503838e-05, 'epoch': 0.01}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.6907, 'learning_rate': 1.9897698209718672e-05, 'epoch': 0.01}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.6914, 'learning_rate': 1.9884910485933505e-05, 'epoch': 0.01}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.6843, 'learning_rate': 1.987212276214834e-05, 'epoch': 0.01}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.6907, 'learning_rate': 1.9859335038363173e-05, '

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3278, 'learning_rate': 1.9028132992327367e-05, 'epoch': 0.1}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2876, 'learning_rate': 1.90153452685422e-05, 'epoch': 0.1}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2955, 'learning_rate': 1.9002557544757035e-05, 'epoch': 0.1}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0963, 'learning_rate': 1.898976982097187e-05, 'epoch': 0.1}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1682, 'learning_rate': 1.8976982097186702e-05, 'epoch': 0.1}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.615, 'learning_rate': 1.8964194373401536e-05, 'epoch': 0.1}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1908, 'learning_rate': 1.895140664961637e-05, 'epoch': 0.1}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1166, 'learning_rate': 1.8938618925831204e-05, 'epoch': 0.11}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2411, 'learning_rate': 1.8925831202046038e-05, 'epoch': 0

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3581, 'learning_rate': 1.8094629156010232e-05, 'epoch': 0.19}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2187, 'learning_rate': 1.8081841432225066e-05, 'epoch': 0.19}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3892, 'learning_rate': 1.80690537084399e-05, 'epoch': 0.19}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2585, 'learning_rate': 1.8056265984654733e-05, 'epoch': 0.19}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2653, 'learning_rate': 1.8043478260869567e-05, 'epoch': 0.2}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.5126, 'learning_rate': 1.80306905370844e-05, 'epoch': 0.2}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1253, 'learning_rate': 1.8017902813299235e-05, 'epoch': 0.2}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1338, 'learning_rate': 1.800511508951407e-05, 'epoch': 0.2}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1907, 'learning_rate': 1.7992327365728902e-05, 'epoch'

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2645, 'learning_rate': 1.7161125319693097e-05, 'epoch': 0.28}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1799, 'learning_rate': 1.714833759590793e-05, 'epoch': 0.29}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0969, 'learning_rate': 1.7135549872122764e-05, 'epoch': 0.29}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2196, 'learning_rate': 1.7122762148337598e-05, 'epoch': 0.29}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1738, 'learning_rate': 1.710997442455243e-05, 'epoch': 0.29}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2656, 'learning_rate': 1.7097186700767265e-05, 'epoch': 0.29}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.4192, 'learning_rate': 1.70843989769821e-05, 'epoch': 0.29}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2441, 'learning_rate': 1.7071611253196933e-05, 'epoch': 0.29}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2763, 'learning_rate': 1.7058823529411767e-05, 'e

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2596, 'learning_rate': 1.6227621483375958e-05, 'epoch': 0.38}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2745, 'learning_rate': 1.6214833759590795e-05, 'epoch': 0.38}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1626, 'learning_rate': 1.620204603580563e-05, 'epoch': 0.38}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2289, 'learning_rate': 1.6189258312020462e-05, 'epoch': 0.38}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1909, 'learning_rate': 1.6176470588235296e-05, 'epoch': 0.38}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1713, 'learning_rate': 1.616368286445013e-05, 'epoch': 0.38}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0984, 'learning_rate': 1.6150895140664964e-05, 'epoch': 0.38}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0451, 'learning_rate': 1.6138107416879797e-05, 'epoch': 0.39}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2142, 'learning_rate': 1.612531969309463e-05, '

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3842, 'learning_rate': 1.5294117647058822e-05, 'epoch': 0.47}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3705, 'learning_rate': 1.528132992327366e-05, 'epoch': 0.47}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1396, 'learning_rate': 1.5268542199488493e-05, 'epoch': 0.47}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2963, 'learning_rate': 1.5255754475703327e-05, 'epoch': 0.47}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3226, 'learning_rate': 1.524296675191816e-05, 'epoch': 0.48}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2043, 'learning_rate': 1.5230179028132994e-05, 'epoch': 0.48}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.5388, 'learning_rate': 1.5217391304347828e-05, 'epoch': 0.48}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1107, 'learning_rate': 1.520460358056266e-05, 'epoch': 0.48}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2, 'learning_rate': 1.5191815856777494e-05, 'epo

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0368, 'learning_rate': 1.4360613810741688e-05, 'epoch': 0.56}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0539, 'learning_rate': 1.4347826086956522e-05, 'epoch': 0.57}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0464, 'learning_rate': 1.4335038363171356e-05, 'epoch': 0.57}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0403, 'learning_rate': 1.432225063938619e-05, 'epoch': 0.57}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.253, 'learning_rate': 1.4309462915601025e-05, 'epoch': 0.57}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1918, 'learning_rate': 1.4296675191815859e-05, 'epoch': 0.57}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0669, 'learning_rate': 1.4283887468030693e-05, 'epoch': 0.57}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.104, 'learning_rate': 1.4271099744245525e-05, 'epoch': 0.57}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.392, 'learning_rate': 1.4258312020460359e-05, 'e

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2802, 'learning_rate': 1.3427109974424553e-05, 'epoch': 0.66}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2852, 'learning_rate': 1.3414322250639387e-05, 'epoch': 0.66}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1573, 'learning_rate': 1.340153452685422e-05, 'epoch': 0.66}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3351, 'learning_rate': 1.3388746803069054e-05, 'epoch': 0.66}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1131, 'learning_rate': 1.337595907928389e-05, 'epoch': 0.66}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1638, 'learning_rate': 1.3363171355498723e-05, 'epoch': 0.66}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3044, 'learning_rate': 1.3350383631713557e-05, 'epoch': 0.66}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.105, 'learning_rate': 1.333759590792839e-05, 'epoch': 0.67}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2621, 'learning_rate': 1.3324808184143223e-05, 'e

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1488, 'learning_rate': 1.2493606138107417e-05, 'epoch': 0.75}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1542, 'learning_rate': 1.2480818414322251e-05, 'epoch': 0.75}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0414, 'learning_rate': 1.2468030690537085e-05, 'epoch': 0.75}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0477, 'learning_rate': 1.2455242966751919e-05, 'epoch': 0.75}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2303, 'learning_rate': 1.2442455242966753e-05, 'epoch': 0.76}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0859, 'learning_rate': 1.2429667519181588e-05, 'epoch': 0.76}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2344, 'learning_rate': 1.2416879795396422e-05, 'epoch': 0.76}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2036, 'learning_rate': 1.2404092071611254e-05, 'epoch': 0.76}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1255, 'learning_rate': 1.2391304347826088e-05

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1031, 'learning_rate': 1.1560102301790282e-05, 'epoch': 0.84}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.5127, 'learning_rate': 1.1547314578005116e-05, 'epoch': 0.85}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3809, 'learning_rate': 1.153452685421995e-05, 'epoch': 0.85}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0703, 'learning_rate': 1.1521739130434783e-05, 'epoch': 0.85}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0585, 'learning_rate': 1.1508951406649617e-05, 'epoch': 0.85}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.137, 'learning_rate': 1.1496163682864453e-05, 'epoch': 0.85}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2545, 'learning_rate': 1.1483375959079286e-05, 'epoch': 0.85}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0633, 'learning_rate': 1.1470588235294118e-05, 'epoch': 0.85}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1235, 'learning_rate': 1.1457800511508952e-05, 

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1332, 'learning_rate': 1.0626598465473147e-05, 'epoch': 0.94}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1506, 'learning_rate': 1.061381074168798e-05, 'epoch': 0.94}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.241, 'learning_rate': 1.0601023017902814e-05, 'epoch': 0.94}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1134, 'learning_rate': 1.0588235294117648e-05, 'epoch': 0.94}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3274, 'learning_rate': 1.0575447570332482e-05, 'epoch': 0.94}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0565, 'learning_rate': 1.0562659846547317e-05, 'epoch': 0.94}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0867, 'learning_rate': 1.0549872122762147e-05, 'epoch': 0.95}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0894, 'learning_rate': 1.0537084398976983e-05, 'epoch': 0.95}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1836, 'learning_rate': 1.0524296675191817e-05, 

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1816, 'learning_rate': 9.680306905370845e-06, 'epoch': 1.03}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0971, 'learning_rate': 9.667519181585679e-06, 'epoch': 1.03}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2314, 'learning_rate': 9.654731457800512e-06, 'epoch': 1.03}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1636, 'learning_rate': 9.641943734015346e-06, 'epoch': 1.04}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0289, 'learning_rate': 9.62915601023018e-06, 'epoch': 1.04}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1754, 'learning_rate': 9.616368286445014e-06, 'epoch': 1.04}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3158, 'learning_rate': 9.603580562659847e-06, 'epoch': 1.04}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2107, 'learning_rate': 9.590792838874681e-06, 'epoch': 1.04}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1446, 'learning_rate': 9.578005115089515e-06, 'epoch':

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3463, 'learning_rate': 8.734015345268543e-06, 'epoch': 1.13}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0834, 'learning_rate': 8.721227621483377e-06, 'epoch': 1.13}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0659, 'learning_rate': 8.70843989769821e-06, 'epoch': 1.13}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3435, 'learning_rate': 8.695652173913044e-06, 'epoch': 1.13}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.137, 'learning_rate': 8.682864450127878e-06, 'epoch': 1.13}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1394, 'learning_rate': 8.670076726342712e-06, 'epoch': 1.13}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3971, 'learning_rate': 8.657289002557546e-06, 'epoch': 1.13}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1036, 'learning_rate': 8.64450127877238e-06, 'epoch': 1.14}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0734, 'learning_rate': 8.631713554987213e-06, 'epoch': 1

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1156, 'learning_rate': 7.787723785166241e-06, 'epoch': 1.22}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.4098, 'learning_rate': 7.774936061381073e-06, 'epoch': 1.22}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0769, 'learning_rate': 7.762148337595909e-06, 'epoch': 1.22}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0908, 'learning_rate': 7.749360613810743e-06, 'epoch': 1.23}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2267, 'learning_rate': 7.736572890025576e-06, 'epoch': 1.23}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0639, 'learning_rate': 7.72378516624041e-06, 'epoch': 1.23}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0134, 'learning_rate': 7.710997442455244e-06, 'epoch': 1.23}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1451, 'learning_rate': 7.698209718670078e-06, 'epoch': 1.23}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0051, 'learning_rate': 7.685421994884912e-06, 'epoch':

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2804, 'learning_rate': 6.841432225063939e-06, 'epoch': 1.32}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.4104, 'learning_rate': 6.828644501278773e-06, 'epoch': 1.32}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1221, 'learning_rate': 6.815856777493607e-06, 'epoch': 1.32}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0065, 'learning_rate': 6.803069053708441e-06, 'epoch': 1.32}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.167, 'learning_rate': 6.790281329923274e-06, 'epoch': 1.32}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1257, 'learning_rate': 6.777493606138108e-06, 'epoch': 1.32}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1105, 'learning_rate': 6.764705882352942e-06, 'epoch': 1.32}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1896, 'learning_rate': 6.751918158567775e-06, 'epoch': 1.32}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0825, 'learning_rate': 6.739130434782609e-06, 'epoch':

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.4023, 'learning_rate': 5.895140664961637e-06, 'epoch': 1.41}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.7069, 'learning_rate': 5.882352941176471e-06, 'epoch': 1.41}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.8687, 'learning_rate': 5.8695652173913055e-06, 'epoch': 1.41}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1945, 'learning_rate': 5.8567774936061384e-06, 'epoch': 1.41}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0255, 'learning_rate': 5.843989769820972e-06, 'epoch': 1.42}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0531, 'learning_rate': 5.831202046035807e-06, 'epoch': 1.42}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2915, 'learning_rate': 5.81841432225064e-06, 'epoch': 1.42}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1311, 'learning_rate': 5.8056265984654735e-06, 'epoch': 1.42}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0355, 'learning_rate': 5.792838874680308e-06, 'epoc

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0473, 'learning_rate': 4.9488491048593355e-06, 'epoch': 1.51}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1592, 'learning_rate': 4.936061381074169e-06, 'epoch': 1.51}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1009, 'learning_rate': 4.923273657289003e-06, 'epoch': 1.51}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2175, 'learning_rate': 4.910485933503837e-06, 'epoch': 1.51}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0249, 'learning_rate': 4.8976982097186705e-06, 'epoch': 1.51}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2018, 'learning_rate': 4.884910485933504e-06, 'epoch': 1.51}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1121, 'learning_rate': 4.872122762148338e-06, 'epoch': 1.51}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0299, 'learning_rate': 4.859335038363172e-06, 'epoch': 1.51}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1459, 'learning_rate': 4.8465473145780055e-06, 'epo

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0128, 'learning_rate': 4.002557544757034e-06, 'epoch': 1.6}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.176, 'learning_rate': 3.9897698209718675e-06, 'epoch': 1.6}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1807, 'learning_rate': 3.976982097186701e-06, 'epoch': 1.6}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0178, 'learning_rate': 3.964194373401535e-06, 'epoch': 1.6}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0224, 'learning_rate': 3.951406649616369e-06, 'epoch': 1.6}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0792, 'learning_rate': 3.9386189258312025e-06, 'epoch': 1.61}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.271, 'learning_rate': 3.925831202046036e-06, 'epoch': 1.61}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1256, 'learning_rate': 3.91304347826087e-06, 'epoch': 1.61}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1127, 'learning_rate': 3.900255754475704e-06, 'epoch': 1.61

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2319, 'learning_rate': 3.069053708439898e-06, 'epoch': 1.69}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2667, 'learning_rate': 3.0562659846547316e-06, 'epoch': 1.69}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3585, 'learning_rate': 3.043478260869566e-06, 'epoch': 1.7}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2261, 'learning_rate': 3.030690537084399e-06, 'epoch': 1.7}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0801, 'learning_rate': 3.017902813299233e-06, 'epoch': 1.7}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.19, 'learning_rate': 3.0051150895140667e-06, 'epoch': 1.7}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0525, 'learning_rate': 2.9923273657289004e-06, 'epoch': 1.7}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.245, 'learning_rate': 2.9795396419437346e-06, 'epoch': 1.7}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0526, 'learning_rate': 2.966751918158568e-06, 'epoch': 1.7

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1554, 'learning_rate': 2.1355498721227624e-06, 'epoch': 1.79}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0664, 'learning_rate': 2.122762148337596e-06, 'epoch': 1.79}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0775, 'learning_rate': 2.10997442455243e-06, 'epoch': 1.79}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.4528, 'learning_rate': 2.0971867007672637e-06, 'epoch': 1.79}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1263, 'learning_rate': 2.0843989769820974e-06, 'epoch': 1.79}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0311, 'learning_rate': 2.071611253196931e-06, 'epoch': 1.79}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0749, 'learning_rate': 2.058823529411765e-06, 'epoch': 1.79}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.6708, 'learning_rate': 2.0460358056265987e-06, 'epoch': 1.8}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0857, 'learning_rate': 2.0332480818414325e-06, 'epo

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2243, 'learning_rate': 1.2020460358056267e-06, 'epoch': 1.88}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.012, 'learning_rate': 1.1892583120204605e-06, 'epoch': 1.88}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0135, 'learning_rate': 1.1764705882352942e-06, 'epoch': 1.88}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0356, 'learning_rate': 1.163682864450128e-06, 'epoch': 1.88}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2186, 'learning_rate': 1.1508951406649617e-06, 'epoch': 1.88}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.008, 'learning_rate': 1.1381074168797955e-06, 'epoch': 1.89}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.1215, 'learning_rate': 1.1253196930946293e-06, 'epoch': 1.89}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.4148, 'learning_rate': 1.112531969309463e-06, 'epoch': 1.89}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2709, 'learning_rate': 1.0997442455242968e-06, 'e

[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0343, 'learning_rate': 2.5575447570332484e-07, 'epoch': 1.97}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.2638, 'learning_rate': 2.4296675191815855e-07, 'epoch': 1.98}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3431, 'learning_rate': 2.3017902813299236e-07, 'epoch': 1.98}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3101, 'learning_rate': 2.173913043478261e-07, 'epoch': 1.98}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3031, 'learning_rate': 2.0460358056265988e-07, 'epoch': 1.98}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0563, 'learning_rate': 1.9181585677749362e-07, 'epoch': 1.98}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.3121, 'learning_rate': 1.7902813299232738e-07, 'epoch': 1.98}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.07, 'learning_rate': 1.6624040920716116e-07, 'epoch': 1.98}
[2m[36m(RayTrainWorker pid=13732)[0m {'loss': 0.0607, 'learning_rate': 1.534526854219949e-07, 'e

[2m[36m(RayTrainWorker pid=13732)[0m   "Positional args are being deprecated, use kwargs instead. Refer to "
[2m[36m(RayTrainWorker pid=13733)[0m   "Positional args are being deprecated, use kwargs instead. Refer to "


[2m[36m(RayTrainWorker pid=13732)[0m {'train_runtime': 835.0049, 'train_samples_per_second': 29.94, 'train_steps_per_second': 1.873, 'train_loss': 0.20461657528987015, 'epoch': 2.0}


2023-05-09 02:01:39,259	ERROR trial_runner.py:1062 -- Trial HuggingFaceTrainer_24ef3_00000: Error processing event.
ray.exceptions.RayTaskError(PermissionError): [36mray::_Inner.train()[39m (pid=12722, ip=10.158.0.3, repr=HuggingFaceTrainer)
  File "/opt/conda/lib/python3.7/site-packages/ray/tune/trainable/trainable.py", line 368, in train
    raise skipped from exception_cause(skipped)
  File "/opt/conda/lib/python3.7/site-packages/ray/tune/trainable/function_trainable.py", line 340, in entrypoint
    self._status_reporter.get_checkpoint(),
  File "/opt/conda/lib/python3.7/site-packages/ray/train/base_trainer.py", line 505, in _trainable_func
    super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
  File "/opt/conda/lib/python3.7/site-packages/ray/tune/trainable/function_trainable.py", line 654, in _trainable_func
    output = fn()
  File "/opt/conda/lib/python3.7/site-packages/ray/train/base_trainer.py", line 415, in train_func
    trainer.training_loop()
  File 