In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pickle
import numpy as np
import torch
import evaluate
import shutil
from ray.tune import CLIReporter

from datasets import load_dataset, Dataset, load_metric
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    set_seed,
    Seq2SeqTrainer,
)

import pandas as pd

from ray import tune
from ray.tune.schedulers import ASHAScheduler
# from ray.tune.integration.torch import TuneReportCallback
import ray.train.huggingface.transformers
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


# Q2

## prepare and test model

In [2]:
# 加载模型和分词器
model_name = "flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [3]:
input_text = "Where is the capital of China?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> shanghai</s>




## `train_func()` ---  Encapsulate data preprocessing, training, and evaluation

load & preprocess data

In [4]:
def preprocess_function(examples,mytokenizer):
    # Ensure inputs and targets are lists of strings
    inputs = [str(ex) for ex in examples['input']]
    targets = [str(ex) for ex in examples['output']]

    # Tokenize inputs
    model_inputs = mytokenizer(inputs, max_length=512, truncation=True)
    # Tokenize targets
    # with tokenizer.as_target_tokenizer():
    labels = mytokenizer(targets, max_length=128, truncation=True)
    # Add labels to model inputs
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

def save_dataset(dataset, path):
    # dataset.save_to_disk(path)
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    with open(path, 'wb') as f:
        pickle.dump(dataset, f)

def load_preprocessed_dataset(path):
    # return Dataset.load_from_disk(path)
    with open(path, 'rb') as f:
        tmp = pickle.load(f)
    return tmp

def load_and_preprocess_datasets(tokenizer=tokenizer):
    train_save_path = 'data/mydata/preprocessed/train.pkl'
    val_save_path = 'data/mydata/preprocessed/val.pkl'
    test_save_path = 'data/mydata/preprocessed/test.pkl'

    if os.path.exists(train_save_path) and os.path.exists(val_save_path) and os.path.exists(test_save_path):
        train_dataset = load_preprocessed_dataset(train_save_path)
        val_dataset = load_preprocessed_dataset(val_save_path)
        test_dataset = load_preprocessed_dataset(test_save_path)
    else:
        train_dataset = load_dataset('csv', data_files='/spark_zc/STA323_zc/proj2/data/mydata/mytrain.csv', cache_dir='data/cachefile')['train']
        val_dataset = load_dataset('csv', data_files='/spark_zc/STA323_zc/proj2/data/mydata/myvalidation.csv', cache_dir='data/cachefile')['train']
        test_dataset = load_dataset('csv', data_files='/spark_zc/STA323_zc/proj2/data/mydata/mytest.csv', cache_dir='data/cachefile')['train']

        train_dataset = train_dataset.map(lambda examples: preprocess_function(examples, tokenizer), batched=True)
        val_dataset = val_dataset.map(lambda examples: preprocess_function(examples, tokenizer), batched=True)
        test_dataset = test_dataset.map(lambda examples: preprocess_function(examples, tokenizer), batched=True)

        save_dataset(train_dataset, train_save_path)
        save_dataset(val_dataset, val_save_path)
        save_dataset(test_dataset, test_save_path)

    return train_dataset, val_dataset, test_dataset

train_dataset, val_dataset, test_dataset = load_and_preprocess_datasets()

定义一个训练函数，该函数将接收超参数配置并训练模型。注意要使用Ray的对象存储来获取数据集。

In [5]:
# [1] Encapsulate data preprocessing, training, and evaluation
# logic in a training function
# ============================================================

def train_func(config):
    ###################################* load data
    train_dataset, val_dataset, test_dataset = load_and_preprocess_datasets()

    ###################################* load model and tokenizer
    model_name_path = "/spark_zc/STA323_zc/proj2/flan-t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name_path)
    model = T5ForConditionalGeneration.from_pretrained(model_name_path)

    ###################################* metrix
    def compute_metrics(p):
        predictions, labels = p

        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.argmax(predictions, axis=-1)

        predictions = predictions.flatten()
        labels = labels.flatten()

        metric = load_metric("accuracy")
        return metric.compute(predictions=predictions, references=labels)

    training_args = Seq2SeqTrainingArguments(
        output_dir="/spark_zc/STA323_zc/proj2/results",
        learning_rate=config["learning_rate"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        num_train_epochs=config["num_epochs"],
        weight_decay=config["weight_decay"],
        save_total_limit=1,          # 只保留一个checkpoint
        load_best_model_at_end=True, # 是否在训练结束时加载最佳模型
        metric_for_best_model="eval_loss", # 选择最佳模型的指标
        greater_is_better=False,  # 表示更小的评估损失表示更好的模型
        save_strategy="epoch",  # 表示每个训练周期结束后保存模型
        evaluation_strategy="epoch", # 每个训练周期结束后评估模型
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[ray.train.huggingface.transformers.RayTrainReportCallback()],
    )

    trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)
    trainer.train()

    # # Save the trained model checkpoint
    # with tune.checkpoint_dir(step=trainer.state.global_step) as checkpoint_dir:
    #     model.save_pretrained(checkpoint_dir)
    #     tokenizer.save_pretrained(checkpoint_dir)

An error `FileNotFoundError: Unable to find '/tmp/ray/session_2024-06-03_23-57-28_281310_151578/artifacts/2024-06-04_00-03-46/TorchTrainer_2024-06-04_00-03-46/working_dirs/TorchTrainer_dba0f_00000_0_2024-06-04_00-03-46/data/mydata/mytrain.csv'` is the direct cause of failure, hence I substitue the relative path with the absolute path.

In [6]:
# # [4] Define a Ray TorchTrainer to launch `train_func` on all workers
# # ===================================================================
# ray_trainer = TorchTrainer(
#     train_func,
#     scaling_config=ScalingConfig(num_workers=1, use_gpu=False),
#     # [4a] If running in a multi-node cluster, this is where you
#     # should configure the run's persistent storage that is accessible
#     # across all worker nodes.
#     # run_config=ray.train.RunConfig(
#     #     failure_config=ray.train.FailureConfig(max_failures = -1)
#     #     )
# )
# result: ray.train.Result = ray_trainer.fit()

# # [5] Load the trained model.
# with result.checkpoint.as_directory() as checkpoint_dir:
#     checkpoint_path = os.path.join(
#         checkpoint_dir,
#         ray.train.huggingface.transformers.RayTrainReportCallback.CHECKPOINT_NAME,
#     )
#     model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

Access training results

In [7]:
# result.metrics     # The metrics reported during training.
# result.checkpoint  # The latest checkpoint reported during training.
# result.path        # The path where logs are stored.
# result.error       # The exception that was raised, if training failed.

## Search space and init

初始化Ray，并定义调度器以控制试验的调度策略。

In [8]:

scheduler = tune.schedulers.ASHAScheduler(
    metric="eval_accuracy",
    mode="max",
    max_t=10,
    grace_period=1,
    reduction_factor=2
)


In [9]:
def tune_transformer():
    search_space = {
        "learning_rate": tune.loguniform(1e-5, 1e-3),
        "batch_size": tune.choice([8, 16, 32]),
        "num_epochs": tune.choice([1]),
        # "num_epochs": tune.choice([2, 3, 4]),
        "weight_decay": tune.uniform(0.0, 0.3),
    }

    reporter = CLIReporter(
        parameter_columns=["learning_rate", "num_train_epochs", "weight_decay"],
        metric_columns=["eval_accuracy", "eval_loss", "epoch", "training_iteration"],
        max_report_frequency=10,  # 控制报告频率
        print_intermediate_tables=False  # 关闭中间表格输出
    )
    analysis = tune.run(
        train_func,
        resources_per_trial={"cpu": 0.2},  # Adjust as needed
        config=search_space,
        num_samples=1,  # Number of trials
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_qa_model",
        local_dir="/spark_zc/STA323_zc/proj2/ray_results",
        stop={"training_iteration": 2},
        keep_checkpoints_num=3,  # 限制保存的 checkpoint 数量
        checkpoint_score_attr="eval_acc",  # 根据验证集的准确率选择保留的 checkpoint
    )

    print("Best hyperparameters found were: ", analysis.best_config)
    best_trial = analysis.get_best_trial(metric="eval_loss", mode="min")
    best_model_path = best_trial.checkpoint.value
    # 创建目标文件夹路径
    destination_path = "/path/to/bestmodel"
    # 复制整个目录
    shutil.copytree(best_model_path, destination_path)

In [10]:
# Run Ray Tune
# ray.shutdown()
ray.init(ignore_reinit_error=True)
tune_transformer()

2024-06-04 02:38:06,282	INFO worker.py:1740 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
2024-06-04 02:38:46,079	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2024-06-04 02:38:46 (running for 00:00:00.27)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 PENDING)




[36m(pid=336911)[0m   torch.utils._pytree._register_pytree_node(
[36m(pid=336911)[0m   torch.utils._pytree._register_pytree_node(
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 10000 examples [00:00, 41070.70 examples/s]
Generating train split: 20000 examples [00:00, 45172.39 examples/s]
Generating train split: 30000 examples [00:00, 50480.11 examples/s]
Generating train split: 40000 examples [00:00, 52764.39 examples/s]
Generating train split: 50000 examples [00:00, 53214.78 examples/s]
Generating train split: 60000 examples [00:01, 51209.02 examples/s]
Generating train split: 70000 examples [00:01, 50229.60 examples/s]
Generating train split: 81819 examples [00:01, 51001.31 examples/s]


== Status ==
Current time: 2024-06-04 02:38:56 (running for 00:00:10.37)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 5000 examples [00:00, 51092.97 examples/s]
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 10000 examples [00:00, 73774.41 examples/s]
Generating train split: 20302 examples [00:00, 60754.44 examples/s]
Map:   0%|          | 0/81819 [00:00<?, ? examples/s]
Map:   1%|          | 1000/81819 [00:01<01:21, 994.05 examples/s]
Map:   2%|▏         | 2000/81819 [00:02<01:22, 965.57 examples/s]
Map:   4%|▎         | 3000/81819 [00:03<01:21, 969.32 examples/s]
Map:   5%|▍         | 4000/81819 [00:04<01:38, 792.60 examples/s]
Map:   6%|▌         | 5000/81819 [00:05<01:33, 825.76 examples/s]
Map:   7%|▋         | 6000/81819 [00:06<01:27, 870.22 examples/s]
Map:   9%|▊         | 7000/81819 [00:07<01:24, 887.15 examples/s]


== Status ==
Current time: 2024-06-04 02:39:06 (running for 00:00:20.41)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  10%|▉         | 8000/81819 [00:08<01:19, 933.28 examples/s]
Map:  11%|█         | 9000/81819 [00:10<01:24, 862.51 examples/s]
Map:  12%|█▏        | 10000/81819 [00:11<01:19, 902.74 examples/s]
Map:  13%|█▎        | 11000/81819 [00:12<01:16, 930.51 examples/s]
Map:  15%|█▍        | 12000/81819 [00:13<01:13, 949.43 examples/s]
Map:  16%|█▌        | 13000/81819 [00:14<01:10, 976.55 examples/s]
Map:  17%|█▋        | 14000/81819 [00:15<01:18, 867.81 examples/s]
Map:  18%|█▊        | 15000/81819 [00:16<01:15, 888.73 examples/s]
Map:  20%|█▉        | 16000/81819 [00:17<01:12, 913.04 examples/s]


== Status ==
Current time: 2024-06-04 02:39:16 (running for 00:00:30.44)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  21%|██        | 17000/81819 [00:18<01:08, 942.10 examples/s]
Map:  22%|██▏       | 18000/81819 [00:19<01:06, 959.55 examples/s]
Map:  23%|██▎       | 19000/81819 [00:21<01:12, 866.57 examples/s]
Map:  24%|██▍       | 20000/81819 [00:22<01:15, 822.66 examples/s]
Map:  26%|██▌       | 21000/81819 [00:23<01:09, 874.68 examples/s]
Map:  27%|██▋       | 22000/81819 [00:24<01:04, 921.19 examples/s]
Map:  28%|██▊       | 23000/81819 [00:25<01:09, 843.08 examples/s]
Map:  29%|██▉       | 24000/81819 [00:26<01:06, 871.16 examples/s]
Map:  31%|███       | 25000/81819 [00:27<01:02, 905.36 examples/s]


== Status ==
Current time: 2024-06-04 02:39:26 (running for 00:00:40.48)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  32%|███▏      | 26000/81819 [00:28<00:59, 943.55 examples/s]
Map:  33%|███▎      | 27000/81819 [00:29<00:56, 967.08 examples/s]
Map:  34%|███▍      | 28000/81819 [00:31<01:01, 870.01 examples/s]
Map:  35%|███▌      | 29000/81819 [00:32<00:58, 895.34 examples/s]
Map:  37%|███▋      | 30000/81819 [00:33<00:56, 917.21 examples/s]
Map:  38%|███▊      | 31000/81819 [00:34<00:53, 949.79 examples/s]
Map:  39%|███▉      | 32000/81819 [00:35<00:53, 939.49 examples/s]
Map:  40%|████      | 33000/81819 [00:36<00:58, 839.89 examples/s]
Map:  42%|████▏     | 34000/81819 [00:37<00:54, 873.64 examples/s]


== Status ==
Current time: 2024-06-04 02:39:36 (running for 00:00:50.51)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  43%|████▎     | 35000/81819 [00:38<00:50, 922.72 examples/s]
Map:  44%|████▍     | 36000/81819 [00:40<00:51, 887.29 examples/s]
Map:  45%|████▌     | 37000/81819 [00:41<00:48, 916.87 examples/s]
Map:  46%|████▋     | 38000/81819 [00:42<00:51, 854.19 examples/s]
Map:  48%|████▊     | 39000/81819 [00:43<00:47, 902.10 examples/s]
Map:  49%|████▉     | 40000/81819 [00:44<00:44, 946.03 examples/s]
Map:  50%|█████     | 41000/81819 [00:45<00:42, 958.00 examples/s]
Map:  51%|█████▏    | 42000/81819 [00:46<00:42, 935.86 examples/s]
Map:  53%|█████▎    | 43000/81819 [00:48<00:47, 822.66 examples/s]


== Status ==
Current time: 2024-06-04 02:39:46 (running for 00:01:00.54)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  54%|█████▍    | 44000/81819 [00:49<00:44, 851.68 examples/s]
Map:  55%|█████▍    | 45000/81819 [00:50<00:41, 877.58 examples/s]
Map:  56%|█████▌    | 46000/81819 [00:51<00:41, 872.17 examples/s]
Map:  57%|█████▋    | 47000/81819 [00:52<00:44, 783.87 examples/s]
Map:  59%|█████▊    | 48000/81819 [00:53<00:40, 831.25 examples/s]
Map:  60%|█████▉    | 49000/81819 [00:55<00:38, 845.06 examples/s]
Map:  61%|██████    | 50000/81819 [00:56<00:42, 746.10 examples/s]


== Status ==
Current time: 2024-06-04 02:39:56 (running for 00:01:10.57)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  62%|██████▏   | 51000/81819 [00:58<00:43, 703.35 examples/s]
Map:  64%|██████▎   | 52000/81819 [00:59<00:39, 755.45 examples/s]
Map:  65%|██████▍   | 53000/81819 [01:00<00:35, 804.17 examples/s]
Map:  66%|██████▌   | 54000/81819 [01:01<00:33, 825.35 examples/s]
Map:  67%|██████▋   | 55000/81819 [01:02<00:30, 868.01 examples/s]
Map:  68%|██████▊   | 56000/81819 [01:04<00:31, 832.01 examples/s]
Map:  70%|██████▉   | 57000/81819 [01:05<00:28, 876.79 examples/s]
Map:  71%|███████   | 58000/81819 [01:06<00:26, 891.81 examples/s]
Map:  72%|███████▏  | 59000/81819 [01:07<00:24, 918.14 examples/s]


== Status ==
Current time: 2024-06-04 02:40:06 (running for 00:01:20.60)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  73%|███████▎  | 60000/81819 [01:08<00:25, 845.43 examples/s]
Map:  75%|███████▍  | 61000/81819 [01:09<00:23, 902.45 examples/s]
Map:  76%|███████▌  | 62000/81819 [01:10<00:20, 950.75 examples/s]
Map:  77%|███████▋  | 63000/81819 [01:11<00:21, 891.12 examples/s]
Map:  78%|███████▊  | 64000/81819 [01:12<00:19, 930.38 examples/s]
Map:  79%|███████▉  | 65000/81819 [01:13<00:19, 879.14 examples/s]
Map:  81%|████████  | 66000/81819 [01:14<00:16, 931.39 examples/s]
Map:  82%|████████▏ | 67000/81819 [01:15<00:15, 950.04 examples/s]
Map:  83%|████████▎ | 68000/81819 [01:16<00:14, 938.59 examples/s]
Map:  84%|████████▍ | 69000/81819 [01:17<00:13, 952.74 examples/s]


== Status ==
Current time: 2024-06-04 02:40:16 (running for 00:01:30.63)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  86%|████████▌ | 70000/81819 [01:19<00:13, 893.62 examples/s]
Map:  87%|████████▋ | 71000/81819 [01:20<00:11, 930.91 examples/s]
Map:  88%|████████▊ | 72000/81819 [01:21<00:10, 932.11 examples/s]
Map:  89%|████████▉ | 73000/81819 [01:22<00:09, 941.04 examples/s]
Map:  90%|█████████ | 74000/81819 [01:23<00:08, 963.60 examples/s]
Map:  92%|█████████▏| 75000/81819 [01:24<00:07, 903.76 examples/s]
Map:  93%|█████████▎| 76000/81819 [01:25<00:06, 943.43 examples/s]
Map:  94%|█████████▍| 77000/81819 [01:26<00:05, 860.31 examples/s]
Map:  95%|█████████▌| 78000/81819 [01:27<00:04, 898.93 examples/s]


== Status ==
Current time: 2024-06-04 02:40:26 (running for 00:01:40.67)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  97%|█████████▋| 79000/81819 [01:28<00:03, 934.90 examples/s]
Map:  98%|█████████▊| 80000/81819 [01:30<00:02, 882.22 examples/s]
Map:  99%|█████████▉| 81000/81819 [01:31<00:00, 936.53 examples/s]
Map: 100%|██████████| 81819/81819 [01:31<00:00, 889.75 examples/s]
Map:   0%|          | 0/5000 [00:00<?, ? examples/s]
Map:  20%|██        | 1000/5000 [00:00<00:03, 1062.61 examples/s]
Map:  40%|████      | 2000/5000 [00:01<00:02, 1047.45 examples/s]
Map:  60%|██████    | 3000/5000 [00:02<00:01, 1065.55 examples/s]
Map:  80%|████████  | 4000/5000 [00:04<00:01, 916.40 examples/s] 
Map: 100%|██████████| 5000/5000 [00:05<00:00, 952.03 examples/s]
Map:   0%|          | 0/20302 [00:00<?, ? examples/s]
Map:   5%|▍         | 1000/20302 [00:00<00:15, 1255.19 examples/s]


== Status ==
Current time: 2024-06-04 02:40:36 (running for 00:01:50.69)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  10%|▉         | 2000/20302 [00:01<00:13, 1321.43 examples/s]
Map:  15%|█▍        | 3000/20302 [00:02<00:15, 1107.83 examples/s]
Map:  20%|█▉        | 4000/20302 [00:03<00:16, 1002.25 examples/s]
Map:  25%|██▍       | 5000/20302 [00:04<00:15, 1015.42 examples/s]
Map:  30%|██▉       | 6000/20302 [00:06<00:16, 861.88 examples/s] 
Map:  34%|███▍      | 7000/20302 [00:07<00:14, 933.12 examples/s]
Map:  39%|███▉      | 8000/20302 [00:08<00:12, 977.24 examples/s]
Map:  44%|████▍     | 9000/20302 [00:09<00:12, 886.75 examples/s]
Map:  49%|████▉     | 10000/20302 [00:10<00:11, 899.98 examples/s]


== Status ==
Current time: 2024-06-04 02:40:46 (running for 00:02:00.72)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




Map:  54%|█████▍    | 11000/20302 [00:11<00:09, 949.06 examples/s]
Map:  59%|█████▉    | 12000/20302 [00:12<00:08, 993.69 examples/s]
Map:  64%|██████▍   | 13000/20302 [00:13<00:07, 988.68 examples/s]
Map:  69%|██████▉   | 14000/20302 [00:14<00:07, 893.47 examples/s]
Map:  74%|███████▍  | 15000/20302 [00:15<00:05, 964.65 examples/s]
Map:  79%|███████▉  | 16000/20302 [00:16<00:04, 1020.37 examples/s]
Map:  84%|████████▎ | 17000/20302 [00:17<00:03, 983.90 examples/s] 
Map:  89%|████████▊ | 18000/20302 [00:18<00:02, 999.31 examples/s]
Map:  94%|█████████▎| 19000/20302 [00:19<00:01, 916.56 examples/s]
Map:  99%|█████████▊| 20000/20302 [00:20<00:00, 942.27 examples/s]
Map: 100%|██████████| 20302/20302 [00:21<00:00, 966.67 examples/s]


== Status ==
Current time: 2024-06-04 02:40:56 (running for 00:02:10.75)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(train_func pid=336911)[0m dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
  0%|          | 0/2557 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


== Status ==
Current time: 2024-06-04 02:41:06 (running for 00:02:20.78)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:41:16 (running for 00:02:30.80)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:41:26 (running for 00:02:40.83)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 1/2557 [00:49<35:17:59, 49.72s/it]


== Status ==
Current time: 2024-06-04 02:41:57 (running for 00:03:10.91)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:42:07 (running for 00:03:20.94)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:42:17 (running for 00:03:30.95)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 2/2557 [01:45<37:55:58, 53.45s/it]


== Status ==
Current time: 2024-06-04 02:42:47 (running for 00:04:01.05)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:42:57 (running for 00:04:11.09)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:43:07 (running for 00:04:21.12)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 3/2557 [02:33<35:57:53, 50.69s/it]


== Status ==
Current time: 2024-06-04 02:43:37 (running for 00:04:51.21)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:43:47 (running for 00:05:01.23)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:43:57 (running for 00:05:11.27)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 4/2557 [03:36<39:24:51, 55.58s/it]


== Status ==
Current time: 2024-06-04 02:44:37 (running for 00:05:51.38)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:44:47 (running for 00:06:01.41)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:44:57 (running for 00:06:11.44)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 5/2557 [04:12<34:20:24, 48.44s/it]


== Status ==
Current time: 2024-06-04 02:45:17 (running for 00:06:31.61)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:45:27 (running for 00:06:41.64)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:45:37 (running for 00:06:51.66)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 6/2557 [04:54<32:56:59, 46.50s/it]


== Status ==
Current time: 2024-06-04 02:45:57 (running for 00:07:11.73)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:46:07 (running for 00:07:21.76)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:46:17 (running for 00:07:31.78)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 7/2557 [05:50<35:00:09, 49.42s/it]


== Status ==
Current time: 2024-06-04 02:46:58 (running for 00:08:11.91)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:47:08 (running for 00:08:21.96)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:47:18 (running for 00:08:31.99)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 8/2557 [06:59<39:33:23, 55.87s/it]


== Status ==
Current time: 2024-06-04 02:48:08 (running for 00:09:22.19)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:48:18 (running for 00:09:32.22)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:48:28 (running for 00:09:42.25)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 9/2557 [08:12<43:20:37, 61.24s/it]


== Status ==
Current time: 2024-06-04 02:49:18 (running for 00:10:32.37)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:49:28 (running for 00:10:42.40)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:49:38 (running for 00:10:52.42)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 10/2557 [09:25<45:53:37, 64.87s/it]


== Status ==
Current time: 2024-06-04 02:50:28 (running for 00:11:42.62)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:50:38 (running for 00:11:52.64)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:50:48 (running for 00:12:02.67)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 11/2557 [10:40<48:03:08, 67.95s/it]


== Status ==
Current time: 2024-06-04 02:51:48 (running for 00:13:02.83)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:51:58 (running for 00:13:12.86)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:52:08 (running for 00:13:22.88)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  0%|          | 12/2557 [11:21<42:05:54, 59.55s/it]


== Status ==
Current time: 2024-06-04 02:52:29 (running for 00:13:43.00)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:52:39 (running for 00:13:53.02)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:52:49 (running for 00:14:03.05)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  1%|          | 13/2557 [12:20<42:00:27, 59.44s/it]


== Status ==
Current time: 2024-06-04 02:53:19 (running for 00:14:33.13)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:53:29 (running for 00:14:43.15)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:53:39 (running for 00:14:53.17)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  1%|          | 14/2557 [13:14<40:45:42, 57.70s/it]


== Status ==
Current time: 2024-06-04 02:54:19 (running for 00:15:33.27)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:54:29 (running for 00:15:43.29)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:54:39 (running for 00:15:53.32)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  1%|          | 15/2557 [13:59<38:13:13, 54.13s/it]


== Status ==
Current time: 2024-06-04 02:54:59 (running for 00:16:13.37)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:55:09 (running for 00:16:23.39)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:55:19 (running for 00:16:33.42)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  1%|          | 16/2557 [14:42<35:46:35, 50.69s/it]


== Status ==
Current time: 2024-06-04 02:55:49 (running for 00:17:03.54)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:55:59 (running for 00:17:13.63)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:56:09 (running for 00:17:23.72)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  1%|          | 17/2557 [15:23<33:42:34, 47.78s/it]


== Status ==
Current time: 2024-06-04 02:56:30 (running for 00:17:43.89)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:56:40 (running for 00:17:53.99)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:56:50 (running for 00:18:04.04)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

  1%|          | 18/2557 [16:17<35:02:22, 49.68s/it]


== Status ==
Current time: 2024-06-04 02:57:20 (running for 00:18:34.24)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:57:30 (running for 00:18:44.34)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 0.2/128 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2024-06-04_02-38-03_004700_328376/artifacts/2024-06-04_02-38-46/tune_qa_model/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2024-06-04 02:57:40 (running for 00:18:54.37)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logi

RuntimeError: Caught unexpected exception: Task was killed due to the node running low on memory.
Memory on the node (IP: 172.17.0.13, ID: cc6410da2eb894759e0b9d534b66d9fe489099b538bc5897caac4e0d) where the task (actor ID: 996e88f353f9e77e16c7982b01000000, name=ImplicitFunc.__init__, pid=336911, memory used=16.02GB) was running was 30.45GB / 32.00GB (0.951603), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 06fb3109625fbd86a41e6133fca054b69b0862aa511990a32b1d5887) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.17.0.13`. To see the logs of the worker, use `ray logs worker-06fb3109625fbd86a41e6133fca054b69b0862aa511990a32b1d5887*out -ip 172.17.0.13. Top 10 memory users:
PID	MEM(GB)	COMMAND
336911	16.02	ray::ImplicitFunc.train
127410	2.46	/data/root/.vscode-server/cli/servers/Stable-dc96b837cf6bb4af9cd736aa3af08cf8279f7685/server/node /r...
279474	1.08	/root/anaconda3/bin/python -m ipykernel_launcher --f=/root/.local/share/jupyter/runtime/kernel-v2-12...
56	0.89	/opt/module/jdk_1.8/bin/java -Xmx1G -Xms1G -server -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:Initiati...
328376	0.82	/root/anaconda3/bin/python -m ipykernel_launcher --f=/root/.local/share/jupyter/runtime/kernel-v2-12...
127144	0.76	/data/root/.vscode-server/cli/servers/Stable-dc96b837cf6bb4af9cd736aa3af08cf8279f7685/server/node --...
328407	0.57	/root/anaconda3/lib/python3.11/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/sess...
27	0.20	/opt/module/jdk_1.8/bin/java -Dzookeeper.log.dir=/opt/module/zookeeper/apache-zookeeper-3.8.3-bin/bi...
6159	0.15	/data/root/.vscode-server/cli/servers/Stable-dc96b837cf6bb4af9cd736aa3af08cf8279f7685/server/node /d...
10	0.12	/root/anaconda3/bin/python /root/anaconda3/bin/jupyter-lab --port=8888 --ip 0.0.0.0 --no-browser --a...
Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. Set max_restarts and max_task_retries to enable retry when the task crashes due to OOM. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.

[33m(raylet)[0m [2024-06-04 02:58:06,198 E 328649 328649] (raylet) node_manager.cc:3041: 2 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: cc6410da2eb894759e0b9d534b66d9fe489099b538bc5897caac4e0d, IP: 172.17.0.13) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.17.0.13`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


In [None]:
# Save the best model
# best_trial = analysis.get_best_trial("eval_loss", "min", "last")
# best_checkpoint_dir = analysis.get_best_checkpoint(best_trial)
# model.save_pretrained(best_checkpoint_dir)
# tokenizer.save_pretrained(best_checkpoint_dir)