In [1]:
import paddle
import paddlenlp
import pandas as pd
import numpy as np
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import f1_score, accuracy_score
from paddle.io import Dataset,DataLoader
from paddlenlp.trainer import Trainer, TrainingArguments
# Prefer the first GPU, but fall back to CPU so the notebook still runs
# end-to-end (Restart & Run All) on a machine without a usable CUDA device.
has_gpu = paddle.device.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0
paddle.device.set_device('gpu:0' if has_gpu else 'cpu')

Place(gpu:0)

In [2]:
# Read the three labelled splits from the competition Excel workbooks.
data_train = pd.read_excel("/home/aistudio/work/评测数据集/train/usual_train.xlsx")  # training split
data_test = pd.read_excel("/home/aistudio/work/评测数据集/test（最终评测集）/真实评测集/usual_test_labeled.xlsx")  # final test split
data_eval = pd.read_excel("/home/aistudio/work/评测数据集/eval（刷榜数据集）/usual_eval_labeled.xlsx")  # validation (leaderboard) split

# Data preparation: fine-tuning uses the training and validation splits, so
# pull their text and label columns out as NumPy arrays for tokenization.
train_sentence, train_label = (np.array(data_train[column]) for column in ("文本", "情绪标签"))
eval_sentence, eval_label = (np.array(data_eval[column]) for column in ("文本", "情绪标签"))

# Custom map-style dataset that tokenizes every sentence once, up front
class DataSet(Dataset):
    """Pre-tokenized sentence-classification dataset.

    All sentences are tokenized in a single tokenizer call when the dataset
    is constructed; ``__getitem__`` then only looks up the stored encodings.

    Args:
        data: iterable of sentences to encode.
        label: indexable sequence of labels, aligned with ``data``.
        tokenizer: callable invoked as
            ``tokenizer(sentences, max_seq_len=..., pad_to_max_seq_len=True)``
            whose result is indexable by ``'input_ids'`` and
            ``'token_type_ids'``.
        max_seq_len: sequence length forwarded to the tokenizer. Defaults to
            512, the value that was previously hard-coded; a smaller value
            can be passed for short texts to reduce padding.
    """

    def __init__(self, data, label, tokenizer, max_seq_len=512):
        self.data = data
        self.label = label
        self.max_seq_len = max_seq_len
        # Tokenize the whole corpus once; every sample is padded to
        # max_seq_len so all samples end up with the same length.
        self.input_data = tokenizer(
            list(data),
            max_seq_len=max_seq_len,
            pad_to_max_seq_len=True,
        )

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.label)

    def __getitem__(self, item):
        """Return the encoded features and label of sample ``item`` as a dict."""
        return {
            "input_ids": self.input_data['input_ids'][item],
            "token_type_ids": self.input_data['token_type_ids'][item],
            "labels": self.label[item],
        }

# Load the pretrained Chinese tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# Build the training and validation datasets (tokenization happens here).
train_set = DataSet(data=train_sentence, label=train_label, tokenizer=tokenizer)
eval_set = DataSet(data=eval_sentence, label=eval_label, tokenizer=tokenizer)


[2024-03-04 12:49:11,078] [    INFO] - We are using <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'> to load 'bert-base-chinese'.
[2024-03-04 12:49:11,085] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt
[2024-03-04 12:49:11,106] [    INFO] - tokenizer config file saved in /home/aistudio/.paddlenlp/models/bert-base-chinese/tokenizer_config.json
[2024-03-04 12:49:11,114] [    INFO] - Special tokens file saved in /home/aistudio/.paddlenlp/models/bert-base-chinese/special_tokens_map.json


In [3]:
# Training hyper-parameters, grouped by concern.
training_config = dict(
    # Where checkpoints and logs are written.
    output_dir='./work/output',
    logging_dir='./work/logs',
    # Length of the run and batch / data-loading sizes.
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    dataloader_num_workers=2,
    # Log every 10 steps; evaluate and checkpoint every 200 steps.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    # Cap the number of retained checkpoints and, when training ends, reload
    # the checkpoint that scored best on the "f1" evaluation metric.
    save_total_limit=10,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)
training_args = TrainingArguments(**training_config)

[2024-03-04 12:49:24,318] [    INFO] - The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [4]:
# Evaluation function handed to the Trainer
def compute_metrics(pred):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions``; the predicted class is the argmax over the
            last axis of ``predictions``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="macro"),
    }

In [5]:
# Load the pretrained Chinese BERT model with a 6-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_classes=6,
    cache_dir="./work/",
)
# Assemble the trainer from the model, loss function, training arguments,
# training set, validation set and metric function.
loss_fn = paddle.nn.CrossEntropyLoss()
trainer = Trainer(
    model=model,
    criterion=loss_fn,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=eval_set,
    compute_metrics=compute_metrics,
)

[2024-03-04 12:49:24,331] [    INFO] - We are using <class 'paddlenlp.transformers.bert.modeling.BertForSequenceClassification'> to load 'bert-base-chinese'.
[2024-03-04 12:49:24,335] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/bert-base-chinese/bert-base-chinese.pdparams
W0304 12:49:24.340519  3149 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 12.0, Runtime API Version: 11.2
W0304 12:49:24.345371  3149 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.
[2024-03-04 12:49:28,137] [    INFO] -     Training Configuration Arguments    
[2024-03-04 12:49:28,143] [    INFO] - paddle commit id              :3fa7a736e32508e797616b6344d97814c37d3ff8
[2024-03-04 12:49:28,149] [    INFO] - _no_sync_in_gradient_accumulation:True
[2024-03-04 12:49:28,154] [    INFO] - adam_beta1                    :0.9
[2024-03-04 12:49:28,157] [    INFO] - adam_beta2                    :0.999
[2024-03-04 12:49:28,159] [    INFO] - adam_epsi

In [6]:
# Start fine-tuning; once training has finished, save the model under the
# ./work directory.
trainer.train()
trainer.save_model("./work/model")

[2024-03-04 12:49:28,449] [    INFO] - ***** Running training *****
[2024-03-04 12:49:28,452] [    INFO] -   Num examples = 27766
[2024-03-04 12:49:28,455] [    INFO] -   Num Epochs = 10
[2024-03-04 12:49:28,458] [    INFO] -   Instantaneous batch size per device = 16
[2024-03-04 12:49:28,460] [    INFO] -   Total train batch size (w. parallel, distributed & accumulation) = 16
[2024-03-04 12:49:28,463] [    INFO] -   Gradient Accumulation steps = 1
[2024-03-04 12:49:28,466] [    INFO] -   Total optimization steps = 17360
[2024-03-04 12:49:28,469] [    INFO] -   Total num train samples = 277660


  0%|          | 0/17360 [00:00<?, ?it/s]

loss: 1.63693829, learning_rate: 4.997119815668203e-05, global_step: 10, interval_runtime: 6.1632, interval_samples_per_second: 2.596, interval_steps_per_second: 1.623, epoch: 0.0058
loss: 1.41821852, learning_rate: 4.994239631336406e-05, global_step: 20, interval_runtime: 4.3377, interval_samples_per_second: 3.689, interval_steps_per_second: 2.305, epoch: 0.0115
loss: 1.16201048, learning_rate: 4.991359447004609e-05, global_step: 30, interval_runtime: 4.3464, interval_samples_per_second: 3.681, interval_steps_per_second: 2.301, epoch: 0.0173
loss: 1.00052643, learning_rate: 4.988479262672811e-05, global_step: 40, interval_runtime: 4.3461, interval_samples_per_second: 3.681, interval_steps_per_second: 2.301, epoch: 0.023
loss: 0.9472127, learning_rate: 4.985599078341014e-05, global_step: 50, interval_runtime: 4.3423, interval_samples_per_second: 3.685, interval_steps_per_second: 2.303, epoch: 0.0288
loss: 1.01215458, learning_rate: 4.9827188940092165e-05, global_step: 60, interval

[2024-03-04 12:50:57,556] [    INFO] - ***** Running Evaluation *****
[2024-03-04 12:50:57,558] [    INFO] -   Num examples = 2000
[2024-03-04 12:50:57,561] [    INFO] -   Pre device batch size = 16
[2024-03-04 12:50:57,563] [    INFO] -   Total Batch size = 16
[2024-03-04 12:50:57,565] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7947730422019958, eval_accuracy: 0.731, eval_f1: 0.6826489511285422, eval_runtime: 20.6134, eval_samples_per_second: 97.024, eval_steps_per_second: 6.064, epoch: 0.1152


[2024-03-04 12:51:18,176] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-200


loss: 0.86151829, learning_rate: 4.939516129032259e-05, global_step: 210, interval_runtime: 36.069, interval_samples_per_second: 0.444, interval_steps_per_second: 0.277, epoch: 0.121
loss: 0.82195225, learning_rate: 4.936635944700461e-05, global_step: 220, interval_runtime: 4.3742, interval_samples_per_second: 3.658, interval_steps_per_second: 2.286, epoch: 0.1267
loss: 0.79830728, learning_rate: 4.933755760368664e-05, global_step: 230, interval_runtime: 4.3733, interval_samples_per_second: 3.659, interval_steps_per_second: 2.287, epoch: 0.1325
loss: 0.76559358, learning_rate: 4.9308755760368664e-05, global_step: 240, interval_runtime: 4.3782, interval_samples_per_second: 3.654, interval_steps_per_second: 2.284, epoch: 0.1382
loss: 0.87336502, learning_rate: 4.927995391705069e-05, global_step: 250, interval_runtime: 4.3922, interval_samples_per_second: 3.643, interval_steps_per_second: 2.277, epoch: 0.144
loss: 0.76584845, learning_rate: 4.925115207373272e-05, global_step: 260, in

[2024-03-04 12:52:57,091] [    INFO] - ***** Running Evaluation *****
[2024-03-04 12:52:57,094] [    INFO] -   Num examples = 2000
[2024-03-04 12:52:57,096] [    INFO] -   Pre device batch size = 16
[2024-03-04 12:52:57,099] [    INFO] -   Total Batch size = 16
[2024-03-04 12:52:57,101] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7751703262329102, eval_accuracy: 0.722, eval_f1: 0.6915939287058941, eval_runtime: 20.8228, eval_samples_per_second: 96.049, eval_steps_per_second: 6.003, epoch: 0.2304


[2024-03-04 12:53:17,925] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-400


loss: 0.86745949, learning_rate: 4.881912442396314e-05, global_step: 410, interval_runtime: 36.3856, interval_samples_per_second: 0.44, interval_steps_per_second: 0.275, epoch: 0.2362
loss: 0.97101698, learning_rate: 4.8790322580645164e-05, global_step: 420, interval_runtime: 4.401, interval_samples_per_second: 3.636, interval_steps_per_second: 2.272, epoch: 0.2419
loss: 0.9155304, learning_rate: 4.876152073732719e-05, global_step: 430, interval_runtime: 4.3918, interval_samples_per_second: 3.643, interval_steps_per_second: 2.277, epoch: 0.2477
loss: 0.8215621, learning_rate: 4.873271889400922e-05, global_step: 440, interval_runtime: 4.4019, interval_samples_per_second: 3.635, interval_steps_per_second: 2.272, epoch: 0.2535
loss: 0.85305538, learning_rate: 4.870391705069124e-05, global_step: 450, interval_runtime: 4.3997, interval_samples_per_second: 3.637, interval_steps_per_second: 2.273, epoch: 0.2592
loss: 0.72033863, learning_rate: 4.8675115207373275e-05, global_step: 460, in

[2024-03-04 12:54:57,186] [    INFO] - ***** Running Evaluation *****
[2024-03-04 12:54:57,188] [    INFO] -   Num examples = 2000
[2024-03-04 12:54:57,190] [    INFO] -   Pre device batch size = 16
[2024-03-04 12:54:57,193] [    INFO] -   Total Batch size = 16
[2024-03-04 12:54:57,195] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7344194650650024, eval_accuracy: 0.74, eval_f1: 0.7043349691422547, eval_runtime: 20.9173, eval_samples_per_second: 95.614, eval_steps_per_second: 5.976, epoch: 0.3456


[2024-03-04 12:55:18,112] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-600


loss: 0.79023409, learning_rate: 4.824308755760369e-05, global_step: 610, interval_runtime: 37.0588, interval_samples_per_second: 0.432, interval_steps_per_second: 0.27, epoch: 0.3514
loss: 0.77363892, learning_rate: 4.8214285714285716e-05, global_step: 620, interval_runtime: 4.4224, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 0.3571
loss: 0.63662872, learning_rate: 4.818548387096775e-05, global_step: 630, interval_runtime: 4.4281, interval_samples_per_second: 3.613, interval_steps_per_second: 2.258, epoch: 0.3629
loss: 0.9003459, learning_rate: 4.8156682027649774e-05, global_step: 640, interval_runtime: 4.4171, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 0.3687
loss: 0.77951646, learning_rate: 4.81278801843318e-05, global_step: 650, interval_runtime: 4.414, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 0.3744
loss: 0.81409063, learning_rate: 4.8099078341013826e-05, global_step: 660, i

[2024-03-04 12:56:58,018] [    INFO] - ***** Running Evaluation *****
[2024-03-04 12:56:58,022] [    INFO] -   Num examples = 2000
[2024-03-04 12:56:58,025] [    INFO] -   Pre device batch size = 16
[2024-03-04 12:56:58,027] [    INFO] -   Total Batch size = 16
[2024-03-04 12:56:58,029] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7306503057479858, eval_accuracy: 0.7455, eval_f1: 0.7151519811066677, eval_runtime: 20.9558, eval_samples_per_second: 95.439, eval_steps_per_second: 5.965, epoch: 0.4608


[2024-03-04 12:57:18,981] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-800
[2024-03-04 12:57:22,661] [    INFO] - Deleting older checkpoint [work/output/checkpoint-34700] due to args.save_total_limit


loss: 0.73585172, learning_rate: 4.766705069124424e-05, global_step: 810, interval_runtime: 29.0676, interval_samples_per_second: 0.55, interval_steps_per_second: 0.344, epoch: 0.4666
loss: 0.79117918, learning_rate: 4.763824884792627e-05, global_step: 820, interval_runtime: 4.4047, interval_samples_per_second: 3.632, interval_steps_per_second: 2.27, epoch: 0.4724
loss: 0.66303582, learning_rate: 4.76094470046083e-05, global_step: 830, interval_runtime: 4.4201, interval_samples_per_second: 3.62, interval_steps_per_second: 2.262, epoch: 0.4781
loss: 0.67692022, learning_rate: 4.7580645161290326e-05, global_step: 840, interval_runtime: 4.4164, interval_samples_per_second: 3.623, interval_steps_per_second: 2.264, epoch: 0.4839
loss: 0.55614233, learning_rate: 4.755184331797235e-05, global_step: 850, interval_runtime: 4.4169, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 0.4896
loss: 0.77006936, learning_rate: 4.752304147465438e-05, global_step: 860, int

[2024-03-04 12:58:50,815] [    INFO] - ***** Running Evaluation *****
[2024-03-04 12:58:50,819] [    INFO] -   Num examples = 2000
[2024-03-04 12:58:50,822] [    INFO] -   Pre device batch size = 16
[2024-03-04 12:58:50,826] [    INFO] -   Total Batch size = 16
[2024-03-04 12:58:50,828] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7400710582733154, eval_accuracy: 0.729, eval_f1: 0.708803553103821, eval_runtime: 20.8094, eval_samples_per_second: 96.111, eval_steps_per_second: 6.007, epoch: 0.576


[2024-03-04 12:59:11,630] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-1000
[2024-03-04 12:59:15,030] [    INFO] - Deleting older checkpoint [work/output/checkpoint-34710] due to args.save_total_limit


loss: 0.67830677, learning_rate: 4.709101382488479e-05, global_step: 1010, interval_runtime: 28.6481, interval_samples_per_second: 0.559, interval_steps_per_second: 0.349, epoch: 0.5818
loss: 0.82095194, learning_rate: 4.7062211981566826e-05, global_step: 1020, interval_runtime: 4.4145, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 0.5876
loss: 0.79213505, learning_rate: 4.703341013824885e-05, global_step: 1030, interval_runtime: 4.4335, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 0.5933
loss: 0.6410213, learning_rate: 4.700460829493088e-05, global_step: 1040, interval_runtime: 4.4678, interval_samples_per_second: 3.581, interval_steps_per_second: 2.238, epoch: 0.5991
loss: 0.81676493, learning_rate: 4.697580645161291e-05, global_step: 1050, interval_runtime: 4.4282, interval_samples_per_second: 3.613, interval_steps_per_second: 2.258, epoch: 0.6048
loss: 0.66969528, learning_rate: 4.694700460829493e-05, global_step: 

[2024-03-04 13:00:43,436] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:00:43,439] [    INFO] -   Num examples = 2000
[2024-03-04 13:00:43,442] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:00:43,444] [    INFO] -   Total Batch size = 16
[2024-03-04 13:00:43,446] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.6716945767402649, eval_accuracy: 0.7605, eval_f1: 0.7318967169985461, eval_runtime: 20.9281, eval_samples_per_second: 95.565, eval_steps_per_second: 5.973, epoch: 0.6912


[2024-03-04 13:01:04,370] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-1200
[2024-03-04 13:01:08,164] [    INFO] - Deleting older checkpoint [work/output/checkpoint-2000] due to args.save_total_limit


loss: 0.55853863, learning_rate: 4.6514976958525345e-05, global_step: 1210, interval_runtime: 29.1744, interval_samples_per_second: 0.548, interval_steps_per_second: 0.343, epoch: 0.697
loss: 0.60833511, learning_rate: 4.648617511520738e-05, global_step: 1220, interval_runtime: 4.4466, interval_samples_per_second: 3.598, interval_steps_per_second: 2.249, epoch: 0.7028
loss: 0.86867657, learning_rate: 4.6457373271889403e-05, global_step: 1230, interval_runtime: 4.4336, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 0.7085
loss: 0.61505356, learning_rate: 4.642857142857143e-05, global_step: 1240, interval_runtime: 4.4379, interval_samples_per_second: 3.605, interval_steps_per_second: 2.253, epoch: 0.7143
loss: 0.81560726, learning_rate: 4.639976958525346e-05, global_step: 1250, interval_runtime: 4.444, interval_samples_per_second: 3.6, interval_steps_per_second: 2.25, epoch: 0.72
loss: 0.6835681, learning_rate: 4.637096774193548e-05, global_step: 1260, 

[2024-03-04 13:02:37,087] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:02:37,090] [    INFO] -   Num examples = 2000
[2024-03-04 13:02:37,091] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:02:37,094] [    INFO] -   Total Batch size = 16
[2024-03-04 13:02:37,096] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.6924294233322144, eval_accuracy: 0.7545, eval_f1: 0.729216186170713, eval_runtime: 20.9379, eval_samples_per_second: 95.521, eval_steps_per_second: 5.97, epoch: 0.8065


[2024-03-04 13:02:58,036] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-1400
[2024-03-04 13:03:01,849] [    INFO] - Deleting older checkpoint [work/output/checkpoint-4000] due to args.save_total_limit


loss: 0.69151964, learning_rate: 4.5938940092165897e-05, global_step: 1410, interval_runtime: 29.2113, interval_samples_per_second: 0.548, interval_steps_per_second: 0.342, epoch: 0.8122
loss: 0.80565481, learning_rate: 4.591013824884793e-05, global_step: 1420, interval_runtime: 4.492, interval_samples_per_second: 3.562, interval_steps_per_second: 2.226, epoch: 0.818
loss: 0.59375486, learning_rate: 4.5881336405529955e-05, global_step: 1430, interval_runtime: 4.4796, interval_samples_per_second: 3.572, interval_steps_per_second: 2.232, epoch: 0.8237
loss: 0.80220919, learning_rate: 4.585253456221199e-05, global_step: 1440, interval_runtime: 4.4147, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 0.8295
loss: 0.67036304, learning_rate: 4.5823732718894014e-05, global_step: 1450, interval_runtime: 4.4304, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 0.8353
loss: 0.65784626, learning_rate: 4.579493087557604e-05, global_step:

[2024-03-04 13:04:30,449] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:04:30,453] [    INFO] -   Num examples = 2000
[2024-03-04 13:04:30,457] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:04:30,460] [    INFO] -   Total Batch size = 16
[2024-03-04 13:04:30,463] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.6919365525245667, eval_accuracy: 0.765, eval_f1: 0.7303350876117921, eval_runtime: 21.0241, eval_samples_per_second: 95.129, eval_steps_per_second: 5.946, epoch: 0.9217


[2024-03-04 13:04:51,480] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-1600
[2024-03-04 13:04:55,276] [    INFO] - Deleting older checkpoint [work/output/checkpoint-6000] due to args.save_total_limit


loss: 0.73888597, learning_rate: 4.5362903225806455e-05, global_step: 1610, interval_runtime: 29.258, interval_samples_per_second: 0.547, interval_steps_per_second: 0.342, epoch: 0.9274
loss: 0.77147188, learning_rate: 4.533410138248848e-05, global_step: 1620, interval_runtime: 4.4087, interval_samples_per_second: 3.629, interval_steps_per_second: 2.268, epoch: 0.9332
loss: 0.73463268, learning_rate: 4.530529953917051e-05, global_step: 1630, interval_runtime: 4.4259, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 0.9389
loss: 0.6897584, learning_rate: 4.527649769585254e-05, global_step: 1640, interval_runtime: 4.4292, interval_samples_per_second: 3.612, interval_steps_per_second: 2.258, epoch: 0.9447
loss: 0.71953998, learning_rate: 4.5247695852534565e-05, global_step: 1650, interval_runtime: 4.4328, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 0.9505
loss: 0.53310289, learning_rate: 4.52188940092166e-05, global_step: 1

[2024-03-04 13:06:23,941] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:06:23,944] [    INFO] -   Num examples = 2000
[2024-03-04 13:06:23,947] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:06:23,950] [    INFO] -   Total Batch size = 16
[2024-03-04 13:06:23,953] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7478088140487671, eval_accuracy: 0.758, eval_f1: 0.7099497747502338, eval_runtime: 20.9824, eval_samples_per_second: 95.318, eval_steps_per_second: 5.957, epoch: 1.0369


[2024-03-04 13:06:44,933] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-1800
[2024-03-04 13:06:49,271] [    INFO] - Deleting older checkpoint [work/output/checkpoint-8000] due to args.save_total_limit


loss: 0.53293438, learning_rate: 4.4786866359447007e-05, global_step: 1810, interval_runtime: 29.8752, interval_samples_per_second: 0.536, interval_steps_per_second: 0.335, epoch: 1.0426
loss: 0.60602751, learning_rate: 4.475806451612903e-05, global_step: 1820, interval_runtime: 4.4306, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 1.0484
loss: 0.52126417, learning_rate: 4.4729262672811065e-05, global_step: 1830, interval_runtime: 4.4341, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 1.0541
loss: 0.469839, learning_rate: 4.470046082949309e-05, global_step: 1840, interval_runtime: 4.4325, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 1.0599
loss: 0.41080995, learning_rate: 4.467165898617512e-05, global_step: 1850, interval_runtime: 4.4213, interval_samples_per_second: 3.619, interval_steps_per_second: 2.262, epoch: 1.0657
loss: 0.57756624, learning_rate: 4.464285714285715e-05, global_step: 1

[2024-03-04 13:08:18,022] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:08:18,025] [    INFO] -   Num examples = 2000
[2024-03-04 13:08:18,029] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:08:18,035] [    INFO] -   Total Batch size = 16
[2024-03-04 13:08:18,038] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.753353476524353, eval_accuracy: 0.76, eval_f1: 0.7222526836044502, eval_runtime: 20.9368, eval_samples_per_second: 95.526, eval_steps_per_second: 5.97, epoch: 1.1521


[2024-03-04 13:08:38,965] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-2000
[2024-03-04 13:08:42,719] [    INFO] - Deleting older checkpoint [work/output/checkpoint-8680] due to args.save_total_limit


loss: 0.51607423, learning_rate: 4.421082949308756e-05, global_step: 2010, interval_runtime: 29.1388, interval_samples_per_second: 0.549, interval_steps_per_second: 0.343, epoch: 1.1578
loss: 0.52073946, learning_rate: 4.4182027649769584e-05, global_step: 2020, interval_runtime: 4.4197, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 1.1636
loss: 0.59188714, learning_rate: 4.415322580645162e-05, global_step: 2030, interval_runtime: 4.4237, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 1.1694
loss: 0.47208495, learning_rate: 4.412442396313364e-05, global_step: 2040, interval_runtime: 4.4289, interval_samples_per_second: 3.613, interval_steps_per_second: 2.258, epoch: 1.1751
loss: 0.54158907, learning_rate: 4.409562211981567e-05, global_step: 2050, interval_runtime: 4.4275, interval_samples_per_second: 3.614, interval_steps_per_second: 2.259, epoch: 1.1809
loss: 0.53747311, learning_rate: 4.40668202764977e-05, global_step: 2

[2024-03-04 13:10:11,396] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:10:11,399] [    INFO] -   Num examples = 2000
[2024-03-04 13:10:11,401] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:10:11,403] [    INFO] -   Total Batch size = 16
[2024-03-04 13:10:11,405] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7254361510276794, eval_accuracy: 0.76, eval_f1: 0.7325324525416992, eval_runtime: 20.9994, eval_samples_per_second: 95.241, eval_steps_per_second: 5.953, epoch: 1.2673


[2024-03-04 13:10:32,419] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-2200
[2024-03-04 13:10:36,447] [    INFO] - Deleting older checkpoint [work/output/checkpoint-200] due to args.save_total_limit


loss: 0.60166559, learning_rate: 4.363479262672811e-05, global_step: 2210, interval_runtime: 29.497, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 1.273
loss: 0.50307169, learning_rate: 4.3605990783410136e-05, global_step: 2220, interval_runtime: 4.4225, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 1.2788
loss: 0.57385993, learning_rate: 4.357718894009217e-05, global_step: 2230, interval_runtime: 4.4235, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 1.2846
loss: 0.48271666, learning_rate: 4.3548387096774194e-05, global_step: 2240, interval_runtime: 4.4437, interval_samples_per_second: 3.601, interval_steps_per_second: 2.25, epoch: 1.2903
loss: 0.67006273, learning_rate: 4.351958525345623e-05, global_step: 2250, interval_runtime: 4.4374, interval_samples_per_second: 3.606, interval_steps_per_second: 2.254, epoch: 1.2961
loss: 0.42768683, learning_rate: 4.349078341013825e-05, global_step: 2

[2024-03-04 13:12:05,174] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:12:05,177] [    INFO] -   Num examples = 2000
[2024-03-04 13:12:05,179] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:12:05,182] [    INFO] -   Total Batch size = 16
[2024-03-04 13:12:05,184] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7124483585357666, eval_accuracy: 0.7565, eval_f1: 0.7325057653060411, eval_runtime: 20.9021, eval_samples_per_second: 95.684, eval_steps_per_second: 5.98, epoch: 1.3825


[2024-03-04 13:12:26,083] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-2400
[2024-03-04 13:12:30,581] [    INFO] - Deleting older checkpoint [work/output/checkpoint-400] due to args.save_total_limit


loss: 0.468713, learning_rate: 4.305875576036866e-05, global_step: 2410, interval_runtime: 29.8502, interval_samples_per_second: 0.536, interval_steps_per_second: 0.335, epoch: 1.3882
loss: 0.4795083, learning_rate: 4.3029953917050694e-05, global_step: 2420, interval_runtime: 4.4501, interval_samples_per_second: 3.595, interval_steps_per_second: 2.247, epoch: 1.394
loss: 0.57197375, learning_rate: 4.300115207373272e-05, global_step: 2430, interval_runtime: 4.4038, interval_samples_per_second: 3.633, interval_steps_per_second: 2.271, epoch: 1.3998
loss: 0.50669842, learning_rate: 4.2972350230414746e-05, global_step: 2440, interval_runtime: 4.4192, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 1.4055
loss: 0.48477101, learning_rate: 4.294354838709678e-05, global_step: 2450, interval_runtime: 4.4557, interval_samples_per_second: 3.591, interval_steps_per_second: 2.244, epoch: 1.4113
loss: 0.54707856, learning_rate: 4.2914746543778805e-05, global_step: 2

[2024-03-04 13:13:58,967] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:13:58,971] [    INFO] -   Num examples = 2000
[2024-03-04 13:13:58,974] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:13:58,977] [    INFO] -   Total Batch size = 16
[2024-03-04 13:13:58,979] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7769097089767456, eval_accuracy: 0.745, eval_f1: 0.7092464248049452, eval_runtime: 20.9993, eval_samples_per_second: 95.241, eval_steps_per_second: 5.953, epoch: 1.4977


[2024-03-04 13:14:19,969] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-2600
[2024-03-04 13:14:24,105] [    INFO] - Deleting older checkpoint [work/output/checkpoint-600] due to args.save_total_limit


loss: 0.60179739, learning_rate: 4.248271889400922e-05, global_step: 2610, interval_runtime: 29.6098, interval_samples_per_second: 0.54, interval_steps_per_second: 0.338, epoch: 1.5035
loss: 0.68053041, learning_rate: 4.2453917050691246e-05, global_step: 2620, interval_runtime: 4.4026, interval_samples_per_second: 3.634, interval_steps_per_second: 2.271, epoch: 1.5092
loss: 0.59616032, learning_rate: 4.242511520737327e-05, global_step: 2630, interval_runtime: 4.4108, interval_samples_per_second: 3.627, interval_steps_per_second: 2.267, epoch: 1.515
loss: 0.51292572, learning_rate: 4.2396313364055304e-05, global_step: 2640, interval_runtime: 4.4188, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 1.5207
loss: 0.55425835, learning_rate: 4.236751152073733e-05, global_step: 2650, interval_runtime: 4.4182, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 1.5265
loss: 0.5072773, learning_rate: 4.2338709677419356e-05, global_step: 

[2024-03-04 13:15:52,495] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:15:52,498] [    INFO] -   Num examples = 2000
[2024-03-04 13:15:52,501] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:15:52,503] [    INFO] -   Total Batch size = 16
[2024-03-04 13:15:52,505] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7174680233001709, eval_accuracy: 0.7675, eval_f1: 0.7386200102986379, eval_runtime: 20.836, eval_samples_per_second: 95.988, eval_steps_per_second: 5.999, epoch: 1.6129


[2024-03-04 13:16:13,339] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-2800
[2024-03-04 13:16:17,397] [    INFO] - Deleting older checkpoint [work/output/checkpoint-800] due to args.save_total_limit


loss: 0.64551706, learning_rate: 4.190668202764977e-05, global_step: 2810, interval_runtime: 29.3333, interval_samples_per_second: 0.545, interval_steps_per_second: 0.341, epoch: 1.6187
loss: 0.66895127, learning_rate: 4.18778801843318e-05, global_step: 2820, interval_runtime: 4.414, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 1.6244
loss: 0.55817118, learning_rate: 4.184907834101382e-05, global_step: 2830, interval_runtime: 4.4352, interval_samples_per_second: 3.607, interval_steps_per_second: 2.255, epoch: 1.6302
loss: 0.64855599, learning_rate: 4.1820276497695856e-05, global_step: 2840, interval_runtime: 4.4334, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 1.6359
loss: 0.53550224, learning_rate: 4.179147465437788e-05, global_step: 2850, interval_runtime: 4.4096, interval_samples_per_second: 3.628, interval_steps_per_second: 2.268, epoch: 1.6417
loss: 0.52453465, learning_rate: 4.176267281105991e-05, global_step: 2

[2024-03-04 13:17:45,660] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:17:45,662] [    INFO] -   Num examples = 2000
[2024-03-04 13:17:45,665] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:17:45,667] [    INFO] -   Total Batch size = 16
[2024-03-04 13:17:45,670] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7119519710540771, eval_accuracy: 0.7575, eval_f1: 0.7289252800573073, eval_runtime: 20.9206, eval_samples_per_second: 95.6, eval_steps_per_second: 5.975, epoch: 1.7281


[2024-03-04 13:18:06,586] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-3000
[2024-03-04 13:18:10,655] [    INFO] - Deleting older checkpoint [work/output/checkpoint-1000] due to args.save_total_limit


loss: 0.55326314, learning_rate: 4.133064516129033e-05, global_step: 3010, interval_runtime: 29.441, interval_samples_per_second: 0.543, interval_steps_per_second: 0.34, epoch: 1.7339
loss: 0.44706588, learning_rate: 4.130184331797235e-05, global_step: 3020, interval_runtime: 4.4082, interval_samples_per_second: 3.63, interval_steps_per_second: 2.268, epoch: 1.7396
loss: 0.55641246, learning_rate: 4.1273041474654375e-05, global_step: 3030, interval_runtime: 4.4149, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 1.7454
loss: 0.62753687, learning_rate: 4.124423963133641e-05, global_step: 3040, interval_runtime: 4.403, interval_samples_per_second: 3.634, interval_steps_per_second: 2.271, epoch: 1.7512
loss: 0.45314431, learning_rate: 4.1215437788018434e-05, global_step: 3050, interval_runtime: 4.4216, interval_samples_per_second: 3.619, interval_steps_per_second: 2.262, epoch: 1.7569
loss: 0.50177712, learning_rate: 4.1186635944700466e-05, global_step: 3

[2024-03-04 13:19:38,986] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:19:38,989] [    INFO] -   Num examples = 2000
[2024-03-04 13:19:38,992] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:19:38,994] [    INFO] -   Total Batch size = 16
[2024-03-04 13:19:38,996] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7035934925079346, eval_accuracy: 0.765, eval_f1: 0.7318071272777975, eval_runtime: 20.825, eval_samples_per_second: 96.038, eval_steps_per_second: 6.002, epoch: 1.8433


[2024-03-04 13:19:59,822] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-3200
[2024-03-04 13:20:03,932] [    INFO] - Deleting older checkpoint [work/output/checkpoint-1200] due to args.save_total_limit


loss: 0.55647655, learning_rate: 4.075460829493088e-05, global_step: 3210, interval_runtime: 29.3823, interval_samples_per_second: 0.545, interval_steps_per_second: 0.34, epoch: 1.8491
loss: 0.52292752, learning_rate: 4.072580645161291e-05, global_step: 3220, interval_runtime: 4.406, interval_samples_per_second: 3.631, interval_steps_per_second: 2.27, epoch: 1.8548
loss: 0.48161206, learning_rate: 4.0697004608294933e-05, global_step: 3230, interval_runtime: 4.4168, interval_samples_per_second: 3.623, interval_steps_per_second: 2.264, epoch: 1.8606
loss: 0.45087724, learning_rate: 4.066820276497696e-05, global_step: 3240, interval_runtime: 4.4181, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 1.8664
loss: 0.57789359, learning_rate: 4.0639400921658985e-05, global_step: 3250, interval_runtime: 4.4677, interval_samples_per_second: 3.581, interval_steps_per_second: 2.238, epoch: 1.8721
loss: 0.65472736, learning_rate: 4.061059907834102e-05, global_step: 3

[2024-03-04 13:21:32,165] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:21:32,172] [    INFO] -   Num examples = 2000
[2024-03-04 13:21:32,178] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:21:32,181] [    INFO] -   Total Batch size = 16
[2024-03-04 13:21:32,185] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7260286808013916, eval_accuracy: 0.7605, eval_f1: 0.7252786095638394, eval_runtime: 21.0598, eval_samples_per_second: 94.968, eval_steps_per_second: 5.935, epoch: 1.9585


[2024-03-04 13:21:53,236] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-3400
[2024-03-04 13:21:57,288] [    INFO] - Deleting older checkpoint [work/output/checkpoint-1400] due to args.save_total_limit


loss: 0.5099638, learning_rate: 4.017857142857143e-05, global_step: 3410, interval_runtime: 29.5813, interval_samples_per_second: 0.541, interval_steps_per_second: 0.338, epoch: 1.9643
loss: 0.55543485, learning_rate: 4.014976958525346e-05, global_step: 3420, interval_runtime: 4.4284, interval_samples_per_second: 3.613, interval_steps_per_second: 2.258, epoch: 1.97
loss: 0.58148937, learning_rate: 4.0120967741935485e-05, global_step: 3430, interval_runtime: 4.4325, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 1.9758
loss: 0.58463616, learning_rate: 4.009216589861751e-05, global_step: 3440, interval_runtime: 4.4225, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 1.9816
loss: 0.5869451, learning_rate: 4.0063364055299544e-05, global_step: 3450, interval_runtime: 4.4284, interval_samples_per_second: 3.613, interval_steps_per_second: 2.258, epoch: 1.9873
loss: 0.58797646, learning_rate: 4.003456221198157e-05, global_step: 346

[2024-03-04 13:23:25,825] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:23:25,839] [    INFO] -   Num examples = 2000
[2024-03-04 13:23:25,845] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:23:25,847] [    INFO] -   Total Batch size = 16
[2024-03-04 13:23:25,849] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7831739783287048, eval_accuracy: 0.7545, eval_f1: 0.7198186248293207, eval_runtime: 20.8548, eval_samples_per_second: 95.901, eval_steps_per_second: 5.994, epoch: 2.0737


[2024-03-04 13:23:46,686] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-3600
[2024-03-04 13:23:51,371] [    INFO] - Deleting older checkpoint [work/output/checkpoint-1600] due to args.save_total_limit


loss: 0.31779816, learning_rate: 3.9602534562211985e-05, global_step: 3610, interval_runtime: 30.011, interval_samples_per_second: 0.533, interval_steps_per_second: 0.333, epoch: 2.0795
loss: 0.27178748, learning_rate: 3.957373271889401e-05, global_step: 3620, interval_runtime: 4.4099, interval_samples_per_second: 3.628, interval_steps_per_second: 2.268, epoch: 2.0853
loss: 0.37941127, learning_rate: 3.954493087557604e-05, global_step: 3630, interval_runtime: 4.3956, interval_samples_per_second: 3.64, interval_steps_per_second: 2.275, epoch: 2.091
loss: 0.40322356, learning_rate: 3.951612903225806e-05, global_step: 3640, interval_runtime: 4.3993, interval_samples_per_second: 3.637, interval_steps_per_second: 2.273, epoch: 2.0968
loss: 0.34743612, learning_rate: 3.9487327188940095e-05, global_step: 3650, interval_runtime: 4.4025, interval_samples_per_second: 3.634, interval_steps_per_second: 2.271, epoch: 2.1025
loss: 0.29108598, learning_rate: 3.945852534562212e-05, global_step: 3

[2024-03-04 13:25:19,554] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:25:19,556] [    INFO] -   Num examples = 2000
[2024-03-04 13:25:19,559] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:25:19,561] [    INFO] -   Total Batch size = 16
[2024-03-04 13:25:19,563] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.8612672090530396, eval_accuracy: 0.7585, eval_f1: 0.7195561418828172, eval_runtime: 21.0517, eval_samples_per_second: 95.004, eval_steps_per_second: 5.938, epoch: 2.1889


[2024-03-04 13:25:40,626] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-3800
[2024-03-04 13:25:44,819] [    INFO] - Deleting older checkpoint [work/output/checkpoint-1800] due to args.save_total_limit


loss: 0.33946855, learning_rate: 3.9026497695852537e-05, global_step: 3810, interval_runtime: 29.6979, interval_samples_per_second: 0.539, interval_steps_per_second: 0.337, epoch: 2.1947
loss: 0.3604358, learning_rate: 3.899769585253457e-05, global_step: 3820, interval_runtime: 4.4163, interval_samples_per_second: 3.623, interval_steps_per_second: 2.264, epoch: 2.2005
loss: 0.29968133, learning_rate: 3.8968894009216595e-05, global_step: 3830, interval_runtime: 4.4028, interval_samples_per_second: 3.634, interval_steps_per_second: 2.271, epoch: 2.2062
loss: 0.45616913, learning_rate: 3.8940092165898614e-05, global_step: 3840, interval_runtime: 4.4126, interval_samples_per_second: 3.626, interval_steps_per_second: 2.266, epoch: 2.212
loss: 0.33924015, learning_rate: 3.891129032258065e-05, global_step: 3850, interval_runtime: 4.4082, interval_samples_per_second: 3.63, interval_steps_per_second: 2.269, epoch: 2.2177
loss: 0.32886477, learning_rate: 3.888248847926267e-05, global_step: 

[2024-03-04 13:27:13,065] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:27:13,067] [    INFO] -   Num examples = 2000
[2024-03-04 13:27:13,069] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:27:13,072] [    INFO] -   Total Batch size = 16
[2024-03-04 13:27:13,074] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7901596426963806, eval_accuracy: 0.7585, eval_f1: 0.7366088780528793, eval_runtime: 20.9692, eval_samples_per_second: 95.378, eval_steps_per_second: 5.961, epoch: 2.3041


[2024-03-04 13:27:34,039] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-4000
[2024-03-04 13:27:37,982] [    INFO] - Deleting older checkpoint [work/output/checkpoint-2000] due to args.save_total_limit


loss: 0.49133897, learning_rate: 3.845046082949309e-05, global_step: 4010, interval_runtime: 29.3501, interval_samples_per_second: 0.545, interval_steps_per_second: 0.341, epoch: 2.3099
loss: 0.27962902, learning_rate: 3.842165898617512e-05, global_step: 4020, interval_runtime: 4.4368, interval_samples_per_second: 3.606, interval_steps_per_second: 2.254, epoch: 2.3157
loss: 0.3503716, learning_rate: 3.839285714285715e-05, global_step: 4030, interval_runtime: 4.4172, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 2.3214
loss: 0.29625206, learning_rate: 3.836405529953917e-05, global_step: 4040, interval_runtime: 4.4128, interval_samples_per_second: 3.626, interval_steps_per_second: 2.266, epoch: 2.3272
loss: 0.28694792, learning_rate: 3.83352534562212e-05, global_step: 4050, interval_runtime: 4.4409, interval_samples_per_second: 3.603, interval_steps_per_second: 2.252, epoch: 2.3329
loss: 0.32431254, learning_rate: 3.8306451612903224e-05, global_step: 4

[2024-03-04 13:29:06,431] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:29:06,433] [    INFO] -   Num examples = 2000
[2024-03-04 13:29:06,435] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:29:06,437] [    INFO] -   Total Batch size = 16
[2024-03-04 13:29:06,439] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7923582196235657, eval_accuracy: 0.7665, eval_f1: 0.732854593425742, eval_runtime: 20.9976, eval_samples_per_second: 95.249, eval_steps_per_second: 5.953, epoch: 2.4194


[2024-03-04 13:29:27,438] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-4200
[2024-03-04 13:29:31,488] [    INFO] - Deleting older checkpoint [work/output/checkpoint-2200] due to args.save_total_limit


loss: 0.55519691, learning_rate: 3.787442396313364e-05, global_step: 4210, interval_runtime: 29.4961, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 2.4251
loss: 0.38759074, learning_rate: 3.784562211981567e-05, global_step: 4220, interval_runtime: 4.4309, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 2.4309
loss: 0.39801898, learning_rate: 3.78168202764977e-05, global_step: 4230, interval_runtime: 4.4095, interval_samples_per_second: 3.629, interval_steps_per_second: 2.268, epoch: 2.4366
loss: 0.31962936, learning_rate: 3.7788018433179724e-05, global_step: 4240, interval_runtime: 4.4158, interval_samples_per_second: 3.623, interval_steps_per_second: 2.265, epoch: 2.4424
loss: 0.33378267, learning_rate: 3.775921658986175e-05, global_step: 4250, interval_runtime: 4.411, interval_samples_per_second: 3.627, interval_steps_per_second: 2.267, epoch: 2.4482
loss: 0.25384252, learning_rate: 3.773041474654378e-05, global_step: 4

[2024-03-04 13:30:59,873] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:30:59,876] [    INFO] -   Num examples = 2000
[2024-03-04 13:30:59,878] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:30:59,880] [    INFO] -   Total Batch size = 16
[2024-03-04 13:30:59,882] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7613644003868103, eval_accuracy: 0.764, eval_f1: 0.7319214309772385, eval_runtime: 21.016, eval_samples_per_second: 95.166, eval_steps_per_second: 5.948, epoch: 2.5346


[2024-03-04 13:31:20,896] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-4400
[2024-03-04 13:31:24,932] [    INFO] - Deleting older checkpoint [work/output/checkpoint-2400] due to args.save_total_limit


loss: 0.42356181, learning_rate: 3.72983870967742e-05, global_step: 4410, interval_runtime: 29.5198, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 2.5403
loss: 0.36477451, learning_rate: 3.7269585253456224e-05, global_step: 4420, interval_runtime: 4.432, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 2.5461
loss: 0.31618834, learning_rate: 3.724078341013825e-05, global_step: 4430, interval_runtime: 4.4179, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 2.5518
loss: 0.30127001, learning_rate: 3.721198156682028e-05, global_step: 4440, interval_runtime: 4.4167, interval_samples_per_second: 3.623, interval_steps_per_second: 2.264, epoch: 2.5576
loss: 0.2527745, learning_rate: 3.71831797235023e-05, global_step: 4450, interval_runtime: 4.4155, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 2.5634
loss: 0.31926725, learning_rate: 3.7154377880184334e-05, global_step: 446

[2024-03-04 13:32:53,394] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:32:53,396] [    INFO] -   Num examples = 2000
[2024-03-04 13:32:53,398] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:32:53,402] [    INFO] -   Total Batch size = 16
[2024-03-04 13:32:53,404] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.8164881467819214, eval_accuracy: 0.7605, eval_f1: 0.7247137706212174, eval_runtime: 20.9763, eval_samples_per_second: 95.346, eval_steps_per_second: 5.959, epoch: 2.6498


[2024-03-04 13:33:14,376] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-4600
[2024-03-04 13:33:18,358] [    INFO] - Deleting older checkpoint [work/output/checkpoint-2600] due to args.save_total_limit


loss: 0.50270438, learning_rate: 3.672235023041475e-05, global_step: 4610, interval_runtime: 29.3915, interval_samples_per_second: 0.544, interval_steps_per_second: 0.34, epoch: 2.6555
loss: 0.40321679, learning_rate: 3.6693548387096776e-05, global_step: 4620, interval_runtime: 4.4127, interval_samples_per_second: 3.626, interval_steps_per_second: 2.266, epoch: 2.6613
loss: 0.38213627, learning_rate: 3.666474654377881e-05, global_step: 4630, interval_runtime: 4.4121, interval_samples_per_second: 3.626, interval_steps_per_second: 2.267, epoch: 2.6671
loss: 0.31355026, learning_rate: 3.6635944700460834e-05, global_step: 4640, interval_runtime: 4.4253, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 2.6728
loss: 0.43771744, learning_rate: 3.6607142857142853e-05, global_step: 4650, interval_runtime: 4.4239, interval_samples_per_second: 3.617, interval_steps_per_second: 2.26, epoch: 2.6786
loss: 0.37841005, learning_rate: 3.6578341013824886e-05, global_step:

[2024-03-04 13:34:47,008] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:34:47,019] [    INFO] -   Num examples = 2000
[2024-03-04 13:34:47,024] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:34:47,027] [    INFO] -   Total Batch size = 16
[2024-03-04 13:34:47,030] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.7542080879211426, eval_accuracy: 0.762, eval_f1: 0.7312677660201178, eval_runtime: 21.04, eval_samples_per_second: 95.057, eval_steps_per_second: 5.941, epoch: 2.765


[2024-03-04 13:35:08,054] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-4800
[2024-03-04 13:35:12,209] [    INFO] - Deleting older checkpoint [work/output/checkpoint-3000] due to args.save_total_limit


loss: 0.3100065, learning_rate: 3.61463133640553e-05, global_step: 4810, interval_runtime: 29.7615, interval_samples_per_second: 0.538, interval_steps_per_second: 0.336, epoch: 2.7707
loss: 0.40383272, learning_rate: 3.611751152073733e-05, global_step: 4820, interval_runtime: 4.451, interval_samples_per_second: 3.595, interval_steps_per_second: 2.247, epoch: 2.7765
loss: 0.31247797, learning_rate: 3.608870967741936e-05, global_step: 4830, interval_runtime: 4.4451, interval_samples_per_second: 3.599, interval_steps_per_second: 2.25, epoch: 2.7823
loss: 0.30432167, learning_rate: 3.6059907834101386e-05, global_step: 4840, interval_runtime: 4.4267, interval_samples_per_second: 3.614, interval_steps_per_second: 2.259, epoch: 2.788
loss: 0.37502439, learning_rate: 3.603110599078341e-05, global_step: 4850, interval_runtime: 4.4258, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 2.7938
loss: 0.36484344, learning_rate: 3.600230414746544e-05, global_step: 4860

[2024-03-04 13:36:40,608] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:36:40,611] [    INFO] -   Num examples = 2000
[2024-03-04 13:36:40,615] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:36:40,618] [    INFO] -   Total Batch size = 16
[2024-03-04 13:36:40,621] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.8032001852989197, eval_accuracy: 0.7565, eval_f1: 0.7240742819011797, eval_runtime: 20.8703, eval_samples_per_second: 95.83, eval_steps_per_second: 5.989, epoch: 2.8802


[2024-03-04 13:37:01,484] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-5000
[2024-03-04 13:37:05,415] [    INFO] - Deleting older checkpoint [work/output/checkpoint-3200] due to args.save_total_limit


loss: 0.45200834, learning_rate: 3.557027649769585e-05, global_step: 5010, interval_runtime: 29.3422, interval_samples_per_second: 0.545, interval_steps_per_second: 0.341, epoch: 2.8859
loss: 0.36737604, learning_rate: 3.554147465437788e-05, global_step: 5020, interval_runtime: 4.4177, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 2.8917
loss: 0.33173196, learning_rate: 3.551267281105991e-05, global_step: 5030, interval_runtime: 4.4141, interval_samples_per_second: 3.625, interval_steps_per_second: 2.265, epoch: 2.8975
loss: 0.3819082, learning_rate: 3.548387096774194e-05, global_step: 5040, interval_runtime: 4.4052, interval_samples_per_second: 3.632, interval_steps_per_second: 2.27, epoch: 2.9032
loss: 0.45826569, learning_rate: 3.545506912442397e-05, global_step: 5050, interval_runtime: 4.4185, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 2.909
loss: 0.42491097, learning_rate: 3.542626728110599e-05, global_step: 506

[2024-03-04 13:38:34,263] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:38:34,267] [    INFO] -   Num examples = 2000
[2024-03-04 13:38:34,269] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:38:34,271] [    INFO] -   Total Batch size = 16
[2024-03-04 13:38:34,274] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.8250638246536255, eval_accuracy: 0.7615, eval_f1: 0.7317158745791442, eval_runtime: 20.9882, eval_samples_per_second: 95.292, eval_steps_per_second: 5.956, epoch: 2.9954


[2024-03-04 13:38:55,257] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-5200
[2024-03-04 13:38:59,109] [    INFO] - Deleting older checkpoint [work/output/checkpoint-3400] due to args.save_total_limit


loss: 0.38672242, learning_rate: 3.4994239631336405e-05, global_step: 5210, interval_runtime: 29.4321, interval_samples_per_second: 0.544, interval_steps_per_second: 0.34, epoch: 3.0012
loss: 0.23307571, learning_rate: 3.496543778801844e-05, global_step: 5220, interval_runtime: 4.4322, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 3.0069
loss: 0.1578375, learning_rate: 3.493663594470046e-05, global_step: 5230, interval_runtime: 4.435, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 3.0127
loss: 0.19054056, learning_rate: 3.490783410138249e-05, global_step: 5240, interval_runtime: 4.444, interval_samples_per_second: 3.6, interval_steps_per_second: 2.25, epoch: 3.0184
loss: 0.24588144, learning_rate: 3.487903225806452e-05, global_step: 5250, interval_runtime: 4.4406, interval_samples_per_second: 3.603, interval_steps_per_second: 2.252, epoch: 3.0242
loss: 0.24973896, learning_rate: 3.485023041474654e-05, global_step: 5260, i

[2024-03-04 13:40:28,048] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:40:28,052] [    INFO] -   Num examples = 2000
[2024-03-04 13:40:28,055] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:40:28,059] [    INFO] -   Total Batch size = 16
[2024-03-04 13:40:28,062] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.0157666206359863, eval_accuracy: 0.7625, eval_f1: 0.7347861655667295, eval_runtime: 20.9588, eval_samples_per_second: 95.425, eval_steps_per_second: 5.964, epoch: 3.1106


[2024-03-04 13:40:49,019] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-5400
[2024-03-04 13:40:53,051] [    INFO] - Deleting older checkpoint [work/output/checkpoint-3600] due to args.save_total_limit


loss: 0.34689691, learning_rate: 3.4418202764976956e-05, global_step: 5410, interval_runtime: 29.5765, interval_samples_per_second: 0.541, interval_steps_per_second: 0.338, epoch: 3.1164
loss: 0.23214025, learning_rate: 3.438940092165899e-05, global_step: 5420, interval_runtime: 4.4261, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 3.1221
loss: 0.11028234, learning_rate: 3.4360599078341015e-05, global_step: 5430, interval_runtime: 4.4423, interval_samples_per_second: 3.602, interval_steps_per_second: 2.251, epoch: 3.1279
loss: 0.3300288, learning_rate: 3.433179723502305e-05, global_step: 5440, interval_runtime: 4.4507, interval_samples_per_second: 3.595, interval_steps_per_second: 2.247, epoch: 3.1336
loss: 0.37358413, learning_rate: 3.4302995391705074e-05, global_step: 5450, interval_runtime: 4.4176, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 3.1394
loss: 0.20473547, learning_rate: 3.427419354838709e-05, global_step

[2024-03-04 13:42:21,805] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:42:21,809] [    INFO] -   Num examples = 2000
[2024-03-04 13:42:21,824] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:42:21,826] [    INFO] -   Total Batch size = 16
[2024-03-04 13:42:21,829] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.0583391189575195, eval_accuracy: 0.765, eval_f1: 0.7283194832870458, eval_runtime: 20.967, eval_samples_per_second: 95.388, eval_steps_per_second: 5.962, epoch: 3.2258


[2024-03-04 13:42:42,778] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-5600
[2024-03-04 13:42:46,687] [    INFO] - Deleting older checkpoint [work/output/checkpoint-3800] due to args.save_total_limit


loss: 0.27033935, learning_rate: 3.3842165898617515e-05, global_step: 5610, interval_runtime: 29.4481, interval_samples_per_second: 0.543, interval_steps_per_second: 0.34, epoch: 3.2316
loss: 0.20886796, learning_rate: 3.381336405529954e-05, global_step: 5620, interval_runtime: 4.4193, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 3.2373
loss: 0.26056314, learning_rate: 3.378456221198157e-05, global_step: 5630, interval_runtime: 4.4351, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 3.2431
loss: 0.26085184, learning_rate: 3.37557603686636e-05, global_step: 5640, interval_runtime: 4.4299, interval_samples_per_second: 3.612, interval_steps_per_second: 2.257, epoch: 3.2488
loss: 0.26923742, learning_rate: 3.3726958525345625e-05, global_step: 5650, interval_runtime: 4.4386, interval_samples_per_second: 3.605, interval_steps_per_second: 2.253, epoch: 3.2546
loss: 0.22934272, learning_rate: 3.369815668202765e-05, global_step: 5

[2024-03-04 13:44:15,306] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:44:15,309] [    INFO] -   Num examples = 2000
[2024-03-04 13:44:15,314] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:44:15,329] [    INFO] -   Total Batch size = 16
[2024-03-04 13:44:15,334] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.0493983030319214, eval_accuracy: 0.754, eval_f1: 0.7195989592247335, eval_runtime: 21.0738, eval_samples_per_second: 94.904, eval_steps_per_second: 5.932, epoch: 3.341


[2024-03-04 13:44:36,386] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-5800
[2024-03-04 13:44:40,442] [    INFO] - Deleting older checkpoint [work/output/checkpoint-4000] due to args.save_total_limit


loss: 0.32802291, learning_rate: 3.3266129032258067e-05, global_step: 5810, interval_runtime: 29.7035, interval_samples_per_second: 0.539, interval_steps_per_second: 0.337, epoch: 3.3468
loss: 0.20460596, learning_rate: 3.323732718894009e-05, global_step: 5820, interval_runtime: 4.4481, interval_samples_per_second: 3.597, interval_steps_per_second: 2.248, epoch: 3.3525
loss: 0.16951203, learning_rate: 3.320852534562212e-05, global_step: 5830, interval_runtime: 4.4236, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 3.3583
loss: 0.16004534, learning_rate: 3.317972350230415e-05, global_step: 5840, interval_runtime: 4.4493, interval_samples_per_second: 3.596, interval_steps_per_second: 2.248, epoch: 3.3641
loss: 0.34099565, learning_rate: 3.315092165898618e-05, global_step: 5850, interval_runtime: 4.4346, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 3.3698
loss: 0.16588264, learning_rate: 3.312211981566821e-05, global_step:

[2024-03-04 13:46:09,433] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:46:09,436] [    INFO] -   Num examples = 2000
[2024-03-04 13:46:09,439] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:46:09,442] [    INFO] -   Total Batch size = 16
[2024-03-04 13:46:09,445] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.0656288862228394, eval_accuracy: 0.7615, eval_f1: 0.726993604700357, eval_runtime: 20.9839, eval_samples_per_second: 95.311, eval_steps_per_second: 5.957, epoch: 3.4562


[2024-03-04 13:46:30,433] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-6000
[2024-03-04 13:46:34,373] [    INFO] - Deleting older checkpoint [work/output/checkpoint-4200] due to args.save_total_limit


loss: 0.33348258, learning_rate: 3.269009216589862e-05, global_step: 6010, interval_runtime: 29.5024, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 3.462
loss: 0.26268454, learning_rate: 3.2661290322580644e-05, global_step: 6020, interval_runtime: 4.4135, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 3.4677
loss: 0.09405248, learning_rate: 3.263248847926268e-05, global_step: 6030, interval_runtime: 4.4152, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 3.4735
loss: 0.21512749, learning_rate: 3.26036866359447e-05, global_step: 6040, interval_runtime: 4.4193, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 3.4793
loss: 0.2410398, learning_rate: 3.257488479262673e-05, global_step: 6050, interval_runtime: 4.4303, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 3.485
loss: 0.25374789, learning_rate: 3.254608294930876e-05, global_step: 6060

[2024-03-04 13:48:02,932] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:48:02,936] [    INFO] -   Num examples = 2000
[2024-03-04 13:48:02,939] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:48:02,942] [    INFO] -   Total Batch size = 16
[2024-03-04 13:48:02,945] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.0138381719589233, eval_accuracy: 0.753, eval_f1: 0.7101007855292187, eval_runtime: 21.0244, eval_samples_per_second: 95.128, eval_steps_per_second: 5.945, epoch: 3.5714


[2024-03-04 13:48:23,962] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-6200
[2024-03-04 13:48:27,953] [    INFO] - Deleting older checkpoint [work/output/checkpoint-4400] due to args.save_total_limit


loss: 0.233797, learning_rate: 3.211405529953917e-05, global_step: 6210, interval_runtime: 29.585, interval_samples_per_second: 0.541, interval_steps_per_second: 0.338, epoch: 3.5772
loss: 0.22962713, learning_rate: 3.2085253456221196e-05, global_step: 6220, interval_runtime: 4.4322, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 3.5829
loss: 0.24458847, learning_rate: 3.205645161290323e-05, global_step: 6230, interval_runtime: 4.4528, interval_samples_per_second: 3.593, interval_steps_per_second: 2.246, epoch: 3.5887
loss: 0.41156135, learning_rate: 3.2027649769585254e-05, global_step: 6240, interval_runtime: 4.4296, interval_samples_per_second: 3.612, interval_steps_per_second: 2.258, epoch: 3.5945
loss: 0.26240647, learning_rate: 3.199884792626729e-05, global_step: 6250, interval_runtime: 4.4251, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 3.6002
loss: 0.23492732, learning_rate: 3.197004608294931e-05, global_step: 626

[2024-03-04 13:49:56,535] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:49:56,538] [    INFO] -   Num examples = 2000
[2024-03-04 13:49:56,541] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:49:56,543] [    INFO] -   Total Batch size = 16
[2024-03-04 13:49:56,546] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.108582615852356, eval_accuracy: 0.7525, eval_f1: 0.7156437180056661, eval_runtime: 20.9503, eval_samples_per_second: 95.464, eval_steps_per_second: 5.966, epoch: 3.6866


[2024-03-04 13:50:17,490] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-6400
[2024-03-04 13:50:21,289] [    INFO] - Deleting older checkpoint [work/output/checkpoint-4600] due to args.save_total_limit


loss: 0.1657282, learning_rate: 3.153801843317972e-05, global_step: 6410, interval_runtime: 29.2862, interval_samples_per_second: 0.546, interval_steps_per_second: 0.341, epoch: 3.6924
loss: 0.35092707, learning_rate: 3.1509216589861754e-05, global_step: 6420, interval_runtime: 4.4359, interval_samples_per_second: 3.607, interval_steps_per_second: 2.254, epoch: 3.6982
loss: 0.35557344, learning_rate: 3.148041474654378e-05, global_step: 6430, interval_runtime: 4.4475, interval_samples_per_second: 3.598, interval_steps_per_second: 2.248, epoch: 3.7039
loss: 0.2726212, learning_rate: 3.1451612903225806e-05, global_step: 6440, interval_runtime: 4.4563, interval_samples_per_second: 3.59, interval_steps_per_second: 2.244, epoch: 3.7097
loss: 0.40153775, learning_rate: 3.142281105990784e-05, global_step: 6450, interval_runtime: 4.4291, interval_samples_per_second: 3.612, interval_steps_per_second: 2.258, epoch: 3.7154
loss: 0.28646472, learning_rate: 3.1394009216589864e-05, global_step: 

[2024-03-04 13:51:50,265] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:51:50,268] [    INFO] -   Num examples = 2000
[2024-03-04 13:51:50,270] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:51:50,272] [    INFO] -   Total Batch size = 16
[2024-03-04 13:51:50,274] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 0.9626110196113586, eval_accuracy: 0.7595, eval_f1: 0.7265967256465192, eval_runtime: 21.0213, eval_samples_per_second: 95.142, eval_steps_per_second: 5.946, epoch: 3.8018


[2024-03-04 13:52:11,304] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-6600
[2024-03-04 13:52:15,051] [    INFO] - Deleting older checkpoint [work/output/checkpoint-4800] due to args.save_total_limit


loss: 0.39392595, learning_rate: 3.096198156682028e-05, global_step: 6610, interval_runtime: 29.3336, interval_samples_per_second: 0.545, interval_steps_per_second: 0.341, epoch: 3.8076
loss: 0.25154412, learning_rate: 3.0933179723502306e-05, global_step: 6620, interval_runtime: 4.4236, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 3.8134
loss: 0.33419662, learning_rate: 3.090437788018433e-05, global_step: 6630, interval_runtime: 4.4271, interval_samples_per_second: 3.614, interval_steps_per_second: 2.259, epoch: 3.8191
loss: 0.3014729, learning_rate: 3.087557603686636e-05, global_step: 6640, interval_runtime: 4.4478, interval_samples_per_second: 3.597, interval_steps_per_second: 2.248, epoch: 3.8249
loss: 0.24467092, learning_rate: 3.084677419354839e-05, global_step: 6650, interval_runtime: 4.4255, interval_samples_per_second: 3.615, interval_steps_per_second: 2.26, epoch: 3.8306
loss: 0.17296417, learning_rate: 3.0817972350230416e-05, global_step: 

[2024-03-04 13:53:43,700] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:53:43,702] [    INFO] -   Num examples = 2000
[2024-03-04 13:53:43,705] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:53:43,712] [    INFO] -   Total Batch size = 16
[2024-03-04 13:53:43,723] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.1439751386642456, eval_accuracy: 0.751, eval_f1: 0.7208575610475174, eval_runtime: 20.9675, eval_samples_per_second: 95.386, eval_steps_per_second: 5.962, epoch: 3.9171


[2024-03-04 13:54:04,675] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-6800
[2024-03-04 13:54:08,661] [    INFO] - Deleting older checkpoint [work/output/checkpoint-5000] due to args.save_total_limit


loss: 0.27391002, learning_rate: 3.0385944700460835e-05, global_step: 6810, interval_runtime: 29.4883, interval_samples_per_second: 0.543, interval_steps_per_second: 0.339, epoch: 3.9228
loss: 0.22667007, learning_rate: 3.0357142857142857e-05, global_step: 6820, interval_runtime: 4.4133, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 3.9286
loss: 0.2873189, learning_rate: 3.0328341013824883e-05, global_step: 6830, interval_runtime: 4.4291, interval_samples_per_second: 3.612, interval_steps_per_second: 2.258, epoch: 3.9343
loss: 0.20289805, learning_rate: 3.0299539170506913e-05, global_step: 6840, interval_runtime: 4.4165, interval_samples_per_second: 3.623, interval_steps_per_second: 2.264, epoch: 3.9401
loss: 0.24991474, learning_rate: 3.0270737327188942e-05, global_step: 6850, interval_runtime: 4.4252, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 3.9459
loss: 0.33479147, learning_rate: 3.024193548387097e-05, global_ste

[2024-03-04 13:55:37,295] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:55:37,298] [    INFO] -   Num examples = 2000
[2024-03-04 13:55:37,300] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:55:37,302] [    INFO] -   Total Batch size = 16
[2024-03-04 13:55:37,304] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.1237232685089111, eval_accuracy: 0.764, eval_f1: 0.7319912650530465, eval_runtime: 20.9272, eval_samples_per_second: 95.569, eval_steps_per_second: 5.973, epoch: 4.0323


[2024-03-04 13:55:58,231] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-7000
[2024-03-04 13:56:01,844] [    INFO] - Deleting older checkpoint [work/output/checkpoint-5200] due to args.save_total_limit


loss: 0.11104003, learning_rate: 2.9809907834101387e-05, global_step: 7010, interval_runtime: 29.0841, interval_samples_per_second: 0.55, interval_steps_per_second: 0.344, epoch: 4.038
loss: 0.1995888, learning_rate: 2.978110599078341e-05, global_step: 7020, interval_runtime: 4.4148, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 4.0438
loss: 0.2746177, learning_rate: 2.9752304147465438e-05, global_step: 7030, interval_runtime: 4.4141, interval_samples_per_second: 3.625, interval_steps_per_second: 2.265, epoch: 4.0495
loss: 0.25716934, learning_rate: 2.9723502304147464e-05, global_step: 7040, interval_runtime: 4.4045, interval_samples_per_second: 3.633, interval_steps_per_second: 2.27, epoch: 4.0553
loss: 0.28699315, learning_rate: 2.9694700460829493e-05, global_step: 7050, interval_runtime: 4.4212, interval_samples_per_second: 3.619, interval_steps_per_second: 2.262, epoch: 4.0611
loss: 0.09837654, learning_rate: 2.9665898617511523e-05, global_step: 

[2024-03-04 13:57:30,585] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:57:30,588] [    INFO] -   Num examples = 2000
[2024-03-04 13:57:30,590] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:57:30,592] [    INFO] -   Total Batch size = 16
[2024-03-04 13:57:30,595] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.211502194404602, eval_accuracy: 0.756, eval_f1: 0.7228244001549363, eval_runtime: 21.0103, eval_samples_per_second: 95.191, eval_steps_per_second: 5.949, epoch: 4.1475


[2024-03-04 13:57:51,603] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-7200
[2024-03-04 13:57:55,144] [    INFO] - Deleting older checkpoint [work/output/checkpoint-5400] due to args.save_total_limit


loss: 0.19503452, learning_rate: 2.9233870967741938e-05, global_step: 7210, interval_runtime: 29.1313, interval_samples_per_second: 0.549, interval_steps_per_second: 0.343, epoch: 4.1532
loss: 0.21043072, learning_rate: 2.9205069124423967e-05, global_step: 7220, interval_runtime: 4.4415, interval_samples_per_second: 3.602, interval_steps_per_second: 2.251, epoch: 4.159
loss: 0.06353274, learning_rate: 2.917626728110599e-05, global_step: 7230, interval_runtime: 4.4628, interval_samples_per_second: 3.585, interval_steps_per_second: 2.241, epoch: 4.1647
loss: 0.19824883, learning_rate: 2.914746543778802e-05, global_step: 7240, interval_runtime: 4.426, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 4.1705
loss: 0.18249538, learning_rate: 2.911866359447005e-05, global_step: 7250, interval_runtime: 4.4336, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 4.1763
loss: 0.09690414, learning_rate: 2.9089861751152074e-05, global_step:

[2024-03-04 13:59:23,980] [    INFO] - ***** Running Evaluation *****
[2024-03-04 13:59:23,983] [    INFO] -   Num examples = 2000
[2024-03-04 13:59:23,986] [    INFO] -   Pre device batch size = 16
[2024-03-04 13:59:23,989] [    INFO] -   Total Batch size = 16
[2024-03-04 13:59:23,992] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.1964232921600342, eval_accuracy: 0.758, eval_f1: 0.7260271249912477, eval_runtime: 20.8474, eval_samples_per_second: 95.935, eval_steps_per_second: 5.996, epoch: 4.2627


[2024-03-04 13:59:44,834] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-7400
[2024-03-04 13:59:48,314] [    INFO] - Deleting older checkpoint [work/output/checkpoint-5600] due to args.save_total_limit


loss: 0.21284571, learning_rate: 2.865783410138249e-05, global_step: 7410, interval_runtime: 28.8618, interval_samples_per_second: 0.554, interval_steps_per_second: 0.346, epoch: 4.2684
loss: 0.15459744, learning_rate: 2.862903225806452e-05, global_step: 7420, interval_runtime: 4.4455, interval_samples_per_second: 3.599, interval_steps_per_second: 2.249, epoch: 4.2742
loss: 0.18118241, learning_rate: 2.860023041474654e-05, global_step: 7430, interval_runtime: 4.45, interval_samples_per_second: 3.596, interval_steps_per_second: 2.247, epoch: 4.28
loss: 0.0744012, learning_rate: 2.857142857142857e-05, global_step: 7440, interval_runtime: 4.4363, interval_samples_per_second: 3.607, interval_steps_per_second: 2.254, epoch: 4.2857
loss: 0.16331795, learning_rate: 2.85426267281106e-05, global_step: 7450, interval_runtime: 4.4317, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 4.2915
loss: 0.24254694, learning_rate: 2.851382488479263e-05, global_step: 7460, i

[2024-03-04 14:01:17,195] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:01:17,198] [    INFO] -   Num examples = 2000
[2024-03-04 14:01:17,201] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:01:17,203] [    INFO] -   Total Batch size = 16
[2024-03-04 14:01:17,205] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.2025922536849976, eval_accuracy: 0.7555, eval_f1: 0.7158979148920518, eval_runtime: 20.8661, eval_samples_per_second: 95.849, eval_steps_per_second: 5.991, epoch: 4.3779


[2024-03-04 14:01:38,067] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-7600
[2024-03-04 14:01:41,620] [    INFO] - Deleting older checkpoint [work/output/checkpoint-5800] due to args.save_total_limit


loss: 0.23910329, learning_rate: 2.8081797235023045e-05, global_step: 7610, interval_runtime: 28.9462, interval_samples_per_second: 0.553, interval_steps_per_second: 0.345, epoch: 4.3836
loss: 0.28550997, learning_rate: 2.8052995391705074e-05, global_step: 7620, interval_runtime: 4.4232, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 4.3894
loss: 0.21221628, learning_rate: 2.8024193548387097e-05, global_step: 7630, interval_runtime: 4.4097, interval_samples_per_second: 3.628, interval_steps_per_second: 2.268, epoch: 4.3952
loss: 0.29253914, learning_rate: 2.7995391705069123e-05, global_step: 7640, interval_runtime: 4.4213, interval_samples_per_second: 3.619, interval_steps_per_second: 2.262, epoch: 4.4009
loss: 0.05249949, learning_rate: 2.7966589861751152e-05, global_step: 7650, interval_runtime: 4.4183, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 4.4067
loss: 0.11405091, learning_rate: 2.793778801843318e-05, global_s

[2024-03-04 14:03:10,109] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:03:10,114] [    INFO] -   Num examples = 2000
[2024-03-04 14:03:10,119] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:03:10,122] [    INFO] -   Total Batch size = 16
[2024-03-04 14:03:10,124] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.245071530342102, eval_accuracy: 0.7505, eval_f1: 0.71761776347455, eval_runtime: 20.7955, eval_samples_per_second: 96.175, eval_steps_per_second: 6.011, epoch: 4.4931


[2024-03-04 14:03:30,910] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-7800
[2024-03-04 14:03:34,497] [    INFO] - Deleting older checkpoint [work/output/checkpoint-6000] due to args.save_total_limit


loss: 0.15601759, learning_rate: 2.7505760368663596e-05, global_step: 7810, interval_runtime: 28.9991, interval_samples_per_second: 0.552, interval_steps_per_second: 0.345, epoch: 4.4988
loss: 0.11316512, learning_rate: 2.7476958525345626e-05, global_step: 7820, interval_runtime: 4.4245, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 4.5046
loss: 0.22432878, learning_rate: 2.7448156682027655e-05, global_step: 7830, interval_runtime: 4.4204, interval_samples_per_second: 3.62, interval_steps_per_second: 2.262, epoch: 4.5104
loss: 0.1840627, learning_rate: 2.7419354838709678e-05, global_step: 7840, interval_runtime: 4.4235, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 4.5161
loss: 0.17279503, learning_rate: 2.7390552995391703e-05, global_step: 7850, interval_runtime: 4.4174, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 4.5219
loss: 0.20998838, learning_rate: 2.7361751152073733e-05, global_ste

[2024-03-04 14:05:03,388] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:05:03,391] [    INFO] -   Num examples = 2000
[2024-03-04 14:05:03,393] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:05:03,396] [    INFO] -   Total Batch size = 16
[2024-03-04 14:05:03,398] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.3018957376480103, eval_accuracy: 0.756, eval_f1: 0.719030184965622, eval_runtime: 20.9392, eval_samples_per_second: 95.515, eval_steps_per_second: 5.97, epoch: 4.6083


[2024-03-04 14:05:24,333] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-8000
[2024-03-04 14:05:27,954] [    INFO] - Deleting older checkpoint [work/output/checkpoint-6200] due to args.save_total_limit


loss: 0.20867774, learning_rate: 2.6929723502304148e-05, global_step: 8010, interval_runtime: 29.0837, interval_samples_per_second: 0.55, interval_steps_per_second: 0.344, epoch: 4.6141
loss: 0.09312252, learning_rate: 2.6900921658986177e-05, global_step: 8020, interval_runtime: 4.4058, interval_samples_per_second: 3.632, interval_steps_per_second: 2.27, epoch: 4.6198
loss: 0.15758908, learning_rate: 2.6872119815668207e-05, global_step: 8030, interval_runtime: 4.4089, interval_samples_per_second: 3.629, interval_steps_per_second: 2.268, epoch: 4.6256
loss: 0.17303996, learning_rate: 2.684331797235023e-05, global_step: 8040, interval_runtime: 4.4033, interval_samples_per_second: 3.634, interval_steps_per_second: 2.271, epoch: 4.6313
loss: 0.15929981, learning_rate: 2.681451612903226e-05, global_step: 8050, interval_runtime: 4.4011, interval_samples_per_second: 3.635, interval_steps_per_second: 2.272, epoch: 4.6371
loss: 0.11647, learning_rate: 2.6785714285714288e-05, global_step: 8

[2024-03-04 14:06:56,197] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:06:56,200] [    INFO] -   Num examples = 2000
[2024-03-04 14:06:56,203] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:06:56,206] [    INFO] -   Total Batch size = 16
[2024-03-04 14:06:56,209] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.2940471172332764, eval_accuracy: 0.7605, eval_f1: 0.720294945428216, eval_runtime: 20.9104, eval_samples_per_second: 95.646, eval_steps_per_second: 5.978, epoch: 4.7235


[2024-03-04 14:07:17,112] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-8200
[2024-03-04 14:07:20,697] [    INFO] - Deleting older checkpoint [work/output/checkpoint-6400] due to args.save_total_limit


loss: 0.11966499, learning_rate: 2.6353686635944703e-05, global_step: 8210, interval_runtime: 29.0219, interval_samples_per_second: 0.551, interval_steps_per_second: 0.345, epoch: 4.7293
loss: 0.17847214, learning_rate: 2.632488479262673e-05, global_step: 8220, interval_runtime: 4.4175, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 4.735
loss: 0.17695539, learning_rate: 2.629608294930876e-05, global_step: 8230, interval_runtime: 4.4236, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 4.7408
loss: 0.12802118, learning_rate: 2.626728110599078e-05, global_step: 8240, interval_runtime: 4.4451, interval_samples_per_second: 3.599, interval_steps_per_second: 2.25, epoch: 4.7465
loss: 0.13170058, learning_rate: 2.623847926267281e-05, global_step: 8250, interval_runtime: 4.4157, interval_samples_per_second: 3.623, interval_steps_per_second: 2.265, epoch: 4.7523
loss: 0.0835596, learning_rate: 2.620967741935484e-05, global_step: 82

[2024-03-04 14:08:49,101] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:08:49,104] [    INFO] -   Num examples = 2000
[2024-03-04 14:08:49,107] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:08:49,109] [    INFO] -   Total Batch size = 16
[2024-03-04 14:08:49,111] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.226168155670166, eval_accuracy: 0.759, eval_f1: 0.7219092134128992, eval_runtime: 20.8739, eval_samples_per_second: 95.813, eval_steps_per_second: 5.988, epoch: 4.8387


[2024-03-04 14:09:09,982] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-8400
[2024-03-04 14:09:13,526] [    INFO] - Deleting older checkpoint [work/output/checkpoint-6600] due to args.save_total_limit


loss: 0.04786511, learning_rate: 2.5777649769585255e-05, global_step: 8410, interval_runtime: 28.9978, interval_samples_per_second: 0.552, interval_steps_per_second: 0.345, epoch: 4.8445
loss: 0.16544447, learning_rate: 2.5748847926267284e-05, global_step: 8420, interval_runtime: 4.4303, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 4.8502
loss: 0.10542387, learning_rate: 2.5720046082949313e-05, global_step: 8430, interval_runtime: 4.4283, interval_samples_per_second: 3.613, interval_steps_per_second: 2.258, epoch: 4.856
loss: 0.24585145, learning_rate: 2.569124423963134e-05, global_step: 8440, interval_runtime: 4.4372, interval_samples_per_second: 3.606, interval_steps_per_second: 2.254, epoch: 4.8618
loss: 0.27050524, learning_rate: 2.5662442396313362e-05, global_step: 8450, interval_runtime: 4.4391, interval_samples_per_second: 3.604, interval_steps_per_second: 2.253, epoch: 4.8675
loss: 0.17599173, learning_rate: 2.563364055299539e-05, global_ste

[2024-03-04 14:10:42,387] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:10:42,390] [    INFO] -   Num examples = 2000
[2024-03-04 14:10:42,393] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:10:42,397] [    INFO] -   Total Batch size = 16
[2024-03-04 14:10:42,399] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.19387948513031, eval_accuracy: 0.7525, eval_f1: 0.7134484481583194, eval_runtime: 21.0431, eval_samples_per_second: 95.043, eval_steps_per_second: 5.94, epoch: 4.9539


[2024-03-04 14:11:03,436] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-8600
[2024-03-04 14:11:07,337] [    INFO] - Deleting older checkpoint [work/output/checkpoint-6800] due to args.save_total_limit


loss: 0.26409075, learning_rate: 2.5201612903225806e-05, global_step: 8610, interval_runtime: 29.5222, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 4.9597
loss: 0.17554272, learning_rate: 2.5172811059907836e-05, global_step: 8620, interval_runtime: 4.4217, interval_samples_per_second: 3.619, interval_steps_per_second: 2.262, epoch: 4.9654
loss: 0.22497826, learning_rate: 2.5144009216589865e-05, global_step: 8630, interval_runtime: 4.4347, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 4.9712
loss: 0.13039347, learning_rate: 2.5115207373271894e-05, global_step: 8640, interval_runtime: 4.4299, interval_samples_per_second: 3.612, interval_steps_per_second: 2.257, epoch: 4.977
loss: 0.18472946, learning_rate: 2.5086405529953917e-05, global_step: 8650, interval_runtime: 4.4253, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 4.9827
loss: 0.16210779, learning_rate: 2.5057603686635943e-05, global_st

[2024-03-04 14:12:36,141] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:12:36,144] [    INFO] -   Num examples = 2000
[2024-03-04 14:12:36,146] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:12:36,148] [    INFO] -   Total Batch size = 16
[2024-03-04 14:12:36,151] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.4034923315048218, eval_accuracy: 0.7485, eval_f1: 0.7062971527489035, eval_runtime: 20.8958, eval_samples_per_second: 95.713, eval_steps_per_second: 5.982, epoch: 5.0691


[2024-03-04 14:12:57,042] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-8800
[2024-03-04 14:13:00,890] [    INFO] - Deleting older checkpoint [work/output/checkpoint-7000] due to args.save_total_limit


loss: 0.073203, learning_rate: 2.462557603686636e-05, global_step: 8810, interval_runtime: 29.3039, interval_samples_per_second: 0.546, interval_steps_per_second: 0.341, epoch: 5.0749
loss: 0.172913, learning_rate: 2.4596774193548387e-05, global_step: 8820, interval_runtime: 4.4127, interval_samples_per_second: 3.626, interval_steps_per_second: 2.266, epoch: 5.0806
loss: 0.15355314, learning_rate: 2.4567972350230417e-05, global_step: 8830, interval_runtime: 4.4242, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 5.0864
loss: 0.14039481, learning_rate: 2.4539170506912443e-05, global_step: 8840, interval_runtime: 4.433, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 5.0922
loss: 0.105303, learning_rate: 2.4510368663594472e-05, global_step: 8850, interval_runtime: 4.4116, interval_samples_per_second: 3.627, interval_steps_per_second: 2.267, epoch: 5.0979
loss: 0.213535, learning_rate: 2.44815668202765e-05, global_step: 8860, i

[2024-03-04 14:14:29,463] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:14:29,466] [    INFO] -   Num examples = 2000
[2024-03-04 14:14:29,470] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:14:29,472] [    INFO] -   Total Batch size = 16
[2024-03-04 14:14:29,475] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.5052510499954224, eval_accuracy: 0.7475, eval_f1: 0.7112206954370995, eval_runtime: 21.2555, eval_samples_per_second: 94.093, eval_steps_per_second: 5.881, epoch: 5.1843


[2024-03-04 14:14:50,738] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-9000
[2024-03-04 14:14:54,581] [    INFO] - Deleting older checkpoint [work/output/checkpoint-7200] due to args.save_total_limit


loss: 0.05982295, learning_rate: 2.4049539170506913e-05, global_step: 9010, interval_runtime: 29.6576, interval_samples_per_second: 0.539, interval_steps_per_second: 0.337, epoch: 5.1901
loss: 0.09168118, learning_rate: 2.4020737327188942e-05, global_step: 9020, interval_runtime: 4.4238, interval_samples_per_second: 3.617, interval_steps_per_second: 2.26, epoch: 5.1959
loss: 0.14483883, learning_rate: 2.3991935483870968e-05, global_step: 9030, interval_runtime: 4.4343, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 5.2016
loss: 0.09603243, learning_rate: 2.3963133640552994e-05, global_step: 9040, interval_runtime: 4.4219, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 5.2074
loss: 0.15184326, learning_rate: 2.3934331797235023e-05, global_step: 9050, interval_runtime: 4.4332, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 5.2131
loss: 0.23313606, learning_rate: 2.3905529953917053e-05, global_s

[2024-03-04 14:16:23,292] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:16:23,295] [    INFO] -   Num examples = 2000
[2024-03-04 14:16:23,297] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:16:23,300] [    INFO] -   Total Batch size = 16
[2024-03-04 14:16:23,302] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.4954060316085815, eval_accuracy: 0.7515, eval_f1: 0.7148685448955758, eval_runtime: 20.8861, eval_samples_per_second: 95.757, eval_steps_per_second: 5.985, epoch: 5.2995


[2024-03-04 14:16:44,184] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-9200
[2024-03-04 14:16:47,958] [    INFO] - Deleting older checkpoint [work/output/checkpoint-7400] due to args.save_total_limit


loss: 0.10118865, learning_rate: 2.3473502304147465e-05, global_step: 9210, interval_runtime: 29.2002, interval_samples_per_second: 0.548, interval_steps_per_second: 0.342, epoch: 5.3053
loss: 0.1859093, learning_rate: 2.3444700460829494e-05, global_step: 9220, interval_runtime: 4.4224, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 5.3111
loss: 0.05876986, learning_rate: 2.3415898617511523e-05, global_step: 9230, interval_runtime: 4.424, interval_samples_per_second: 3.617, interval_steps_per_second: 2.26, epoch: 5.3168
loss: 0.08743584, learning_rate: 2.338709677419355e-05, global_step: 9240, interval_runtime: 4.4173, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 5.3226
loss: 0.11484735, learning_rate: 2.3358294930875575e-05, global_step: 9250, interval_runtime: 4.4341, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 5.3283
loss: 0.20153346, learning_rate: 2.3329493087557604e-05, global_step

[2024-03-04 14:18:16,892] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:18:16,895] [    INFO] -   Num examples = 2000
[2024-03-04 14:18:16,898] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:18:16,900] [    INFO] -   Total Batch size = 16
[2024-03-04 14:18:16,903] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.4842028617858887, eval_accuracy: 0.753, eval_f1: 0.7165224638262165, eval_runtime: 21.0203, eval_samples_per_second: 95.146, eval_steps_per_second: 5.947, epoch: 5.4147


[2024-03-04 14:18:37,926] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-9400
[2024-03-04 14:18:42,097] [    INFO] - Deleting older checkpoint [work/output/checkpoint-7600] due to args.save_total_limit


loss: 0.21283071, learning_rate: 2.289746543778802e-05, global_step: 9410, interval_runtime: 29.7589, interval_samples_per_second: 0.538, interval_steps_per_second: 0.336, epoch: 5.4205
loss: 0.20038271, learning_rate: 2.2868663594470046e-05, global_step: 9420, interval_runtime: 4.4262, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 5.4263
loss: 0.14830137, learning_rate: 2.2839861751152075e-05, global_step: 9430, interval_runtime: 4.4173, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 5.432
loss: 0.13677902, learning_rate: 2.2811059907834104e-05, global_step: 9440, interval_runtime: 4.4206, interval_samples_per_second: 3.619, interval_steps_per_second: 2.262, epoch: 5.4378
loss: 0.16447132, learning_rate: 2.278225806451613e-05, global_step: 9450, interval_runtime: 4.418, interval_samples_per_second: 3.622, interval_steps_per_second: 2.263, epoch: 5.4435
loss: 0.12329724, learning_rate: 2.275345622119816e-05, global_step:

[2024-03-04 14:20:10,638] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:20:10,644] [    INFO] -   Num examples = 2000
[2024-03-04 14:20:10,646] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:20:10,648] [    INFO] -   Total Batch size = 16
[2024-03-04 14:20:10,650] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.4720219373703003, eval_accuracy: 0.757, eval_f1: 0.7235324866965794, eval_runtime: 20.9654, eval_samples_per_second: 95.395, eval_steps_per_second: 5.962, epoch: 5.53


[2024-03-04 14:20:31,610] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-9600
[2024-03-04 14:20:35,779] [    INFO] - Deleting older checkpoint [work/output/checkpoint-7800] due to args.save_total_limit


loss: 0.13073313, learning_rate: 2.2321428571428575e-05, global_step: 9610, interval_runtime: 29.6828, interval_samples_per_second: 0.539, interval_steps_per_second: 0.337, epoch: 5.5357
loss: 0.28995621, learning_rate: 2.22926267281106e-05, global_step: 9620, interval_runtime: 4.4341, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 5.5415
loss: 0.12399304, learning_rate: 2.2263824884792627e-05, global_step: 9630, interval_runtime: 4.4265, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 5.5472
loss: 0.16509697, learning_rate: 2.2235023041474656e-05, global_step: 9640, interval_runtime: 4.4204, interval_samples_per_second: 3.62, interval_steps_per_second: 2.262, epoch: 5.553
loss: 0.27411056, learning_rate: 2.2206221198156682e-05, global_step: 9650, interval_runtime: 4.4313, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 5.5588
loss: 0.10819063, learning_rate: 2.217741935483871e-05, global_step:

[2024-03-04 14:22:04,308] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:22:04,316] [    INFO] -   Num examples = 2000
[2024-03-04 14:22:04,326] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:22:04,329] [    INFO] -   Total Batch size = 16
[2024-03-04 14:22:04,337] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.535119891166687, eval_accuracy: 0.7535, eval_f1: 0.7191140508194347, eval_runtime: 20.8987, eval_samples_per_second: 95.7, eval_steps_per_second: 5.981, epoch: 5.6452


[2024-03-04 14:22:25,216] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-9800
[2024-03-04 14:22:29,100] [    INFO] - Deleting older checkpoint [work/output/checkpoint-8000] due to args.save_total_limit


loss: 0.12478402, learning_rate: 2.1745391705069126e-05, global_step: 9810, interval_runtime: 29.3244, interval_samples_per_second: 0.546, interval_steps_per_second: 0.341, epoch: 5.6509
loss: 0.13985096, learning_rate: 2.1716589861751152e-05, global_step: 9820, interval_runtime: 4.4362, interval_samples_per_second: 3.607, interval_steps_per_second: 2.254, epoch: 5.6567
loss: 0.18357253, learning_rate: 2.168778801843318e-05, global_step: 9830, interval_runtime: 4.4101, interval_samples_per_second: 3.628, interval_steps_per_second: 2.268, epoch: 5.6624
loss: 0.17462034, learning_rate: 2.1658986175115207e-05, global_step: 9840, interval_runtime: 4.4384, interval_samples_per_second: 3.605, interval_steps_per_second: 2.253, epoch: 5.6682
loss: 0.20781736, learning_rate: 2.1630184331797233e-05, global_step: 9850, interval_runtime: 4.4158, interval_samples_per_second: 3.623, interval_steps_per_second: 2.265, epoch: 5.674
loss: 0.03881186, learning_rate: 2.1601382488479263e-05, global_st

[2024-03-04 14:23:58,288] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:23:58,291] [    INFO] -   Num examples = 2000
[2024-03-04 14:23:58,293] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:23:58,299] [    INFO] -   Total Batch size = 16
[2024-03-04 14:23:58,301] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.4492703676223755, eval_accuracy: 0.7555, eval_f1: 0.7216977912529221, eval_runtime: 21.1959, eval_samples_per_second: 94.358, eval_steps_per_second: 5.897, epoch: 5.7604


[2024-03-04 14:24:19,490] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-10000
[2024-03-04 14:24:23,565] [    INFO] - Deleting older checkpoint [work/output/checkpoint-8200] due to args.save_total_limit


loss: 0.04413256, learning_rate: 2.1169354838709678e-05, global_step: 10010, interval_runtime: 29.8131, interval_samples_per_second: 0.537, interval_steps_per_second: 0.335, epoch: 5.7661
loss: 0.01505273, learning_rate: 2.1140552995391704e-05, global_step: 10020, interval_runtime: 4.449, interval_samples_per_second: 3.596, interval_steps_per_second: 2.248, epoch: 5.7719
loss: 0.18365763, learning_rate: 2.1111751152073733e-05, global_step: 10030, interval_runtime: 4.44, interval_samples_per_second: 3.604, interval_steps_per_second: 2.252, epoch: 5.7776
loss: 0.34205046, learning_rate: 2.1082949308755763e-05, global_step: 10040, interval_runtime: 4.434, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 5.7834
loss: 0.15401697, learning_rate: 2.1054147465437792e-05, global_step: 10050, interval_runtime: 4.4339, interval_samples_per_second: 3.609, interval_steps_per_second: 2.255, epoch: 5.7892
loss: 0.09339328, learning_rate: 2.1025345622119814e-05, global

[2024-03-04 14:25:52,176] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:25:52,179] [    INFO] -   Num examples = 2000
[2024-03-04 14:25:52,181] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:25:52,184] [    INFO] -   Total Batch size = 16
[2024-03-04 14:25:52,186] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.4451391696929932, eval_accuracy: 0.7575, eval_f1: 0.7197480175835328, eval_runtime: 20.9719, eval_samples_per_second: 95.366, eval_steps_per_second: 5.96, epoch: 5.8756


[2024-03-04 14:26:13,153] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-10200
[2024-03-04 14:26:17,000] [    INFO] - Deleting older checkpoint [work/output/checkpoint-8400] due to args.save_total_limit


loss: 0.14119321, learning_rate: 2.0593317972350233e-05, global_step: 10210, interval_runtime: 29.3802, interval_samples_per_second: 0.545, interval_steps_per_second: 0.34, epoch: 5.8813
loss: 0.14625233, learning_rate: 2.056451612903226e-05, global_step: 10220, interval_runtime: 4.4201, interval_samples_per_second: 3.62, interval_steps_per_second: 2.262, epoch: 5.8871
loss: 0.07400839, learning_rate: 2.0535714285714285e-05, global_step: 10230, interval_runtime: 4.4143, interval_samples_per_second: 3.625, interval_steps_per_second: 2.265, epoch: 5.8929
loss: 0.24902408, learning_rate: 2.0506912442396314e-05, global_step: 10240, interval_runtime: 4.4264, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 5.8986
loss: 0.27204762, learning_rate: 2.0478110599078343e-05, global_step: 10250, interval_runtime: 4.4346, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 5.9044
loss: 0.04761764, learning_rate: 2.044930875576037e-05, global

[2024-03-04 14:27:45,847] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:27:45,850] [    INFO] -   Num examples = 2000
[2024-03-04 14:27:45,852] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:27:45,855] [    INFO] -   Total Batch size = 16
[2024-03-04 14:27:45,857] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.5038610696792603, eval_accuracy: 0.7545, eval_f1: 0.7219389258473784, eval_runtime: 20.9073, eval_samples_per_second: 95.661, eval_steps_per_second: 5.979, epoch: 5.9908


[2024-03-04 14:28:06,759] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-10400
[2024-03-04 14:28:10,860] [    INFO] - Deleting older checkpoint [work/output/checkpoint-8600] due to args.save_total_limit


loss: 0.07687179, learning_rate: 2.0017281105990785e-05, global_step: 10410, interval_runtime: 29.5352, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 5.9965
loss: 0.14508058, learning_rate: 1.9988479262672814e-05, global_step: 10420, interval_runtime: 4.4048, interval_samples_per_second: 3.632, interval_steps_per_second: 2.27, epoch: 6.0023
loss: 0.13780593, learning_rate: 1.995967741935484e-05, global_step: 10430, interval_runtime: 4.4492, interval_samples_per_second: 3.596, interval_steps_per_second: 2.248, epoch: 6.0081
loss: 0.10707407, learning_rate: 1.9930875576036866e-05, global_step: 10440, interval_runtime: 4.4271, interval_samples_per_second: 3.614, interval_steps_per_second: 2.259, epoch: 6.0138
loss: 0.10160891, learning_rate: 1.9902073732718895e-05, global_step: 10450, interval_runtime: 4.4131, interval_samples_per_second: 3.626, interval_steps_per_second: 2.266, epoch: 6.0196
loss: 0.1118139, learning_rate: 1.9873271889400924e-05, globa

[2024-03-04 14:29:39,531] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:29:39,536] [    INFO] -   Num examples = 2000
[2024-03-04 14:29:39,541] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:29:39,548] [    INFO] -   Total Batch size = 16
[2024-03-04 14:29:39,555] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.5975456237792969, eval_accuracy: 0.7535, eval_f1: 0.713623225437189, eval_runtime: 20.9743, eval_samples_per_second: 95.355, eval_steps_per_second: 5.96, epoch: 6.106


[2024-03-04 14:30:00,514] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-10600
[2024-03-04 14:30:04,315] [    INFO] - Deleting older checkpoint [work/output/checkpoint-8800] due to args.save_total_limit


loss: 0.02605353, learning_rate: 1.9441244239631336e-05, global_step: 10610, interval_runtime: 29.3291, interval_samples_per_second: 0.546, interval_steps_per_second: 0.341, epoch: 6.1118
loss: 0.04576273, learning_rate: 1.9412442396313366e-05, global_step: 10620, interval_runtime: 4.4204, interval_samples_per_second: 3.62, interval_steps_per_second: 2.262, epoch: 6.1175
loss: 0.06354821, learning_rate: 1.938364055299539e-05, global_step: 10630, interval_runtime: 4.4362, interval_samples_per_second: 3.607, interval_steps_per_second: 2.254, epoch: 6.1233
loss: 0.1419184, learning_rate: 1.935483870967742e-05, global_step: 10640, interval_runtime: 4.4301, interval_samples_per_second: 3.612, interval_steps_per_second: 2.257, epoch: 6.129
loss: 0.13966699, learning_rate: 1.9326036866359447e-05, global_step: 10650, interval_runtime: 4.4314, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 6.1348
loss: 0.04694709, learning_rate: 1.9297235023041476e-05, global_

[2024-03-04 14:31:32,933] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:31:32,936] [    INFO] -   Num examples = 2000
[2024-03-04 14:31:32,939] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:31:32,942] [    INFO] -   Total Batch size = 16
[2024-03-04 14:31:32,945] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.6430104970932007, eval_accuracy: 0.7535, eval_f1: 0.7186876349097914, eval_runtime: 20.9406, eval_samples_per_second: 95.508, eval_steps_per_second: 5.969, epoch: 6.2212


[2024-03-04 14:31:53,889] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-10800
[2024-03-04 14:31:57,870] [    INFO] - Deleting older checkpoint [work/output/checkpoint-9000] due to args.save_total_limit


loss: 0.09215513, learning_rate: 1.886520737327189e-05, global_step: 10810, interval_runtime: 29.4932, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 6.227
loss: 0.02260468, learning_rate: 1.8836405529953917e-05, global_step: 10820, interval_runtime: 4.4196, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 6.2327
loss: 0.06934943, learning_rate: 1.8807603686635947e-05, global_step: 10830, interval_runtime: 4.4178, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 6.2385
loss: 0.13542886, learning_rate: 1.8778801843317972e-05, global_step: 10840, interval_runtime: 4.4275, interval_samples_per_second: 3.614, interval_steps_per_second: 2.259, epoch: 6.2442
loss: 0.31828985, learning_rate: 1.8750000000000002e-05, global_step: 10850, interval_runtime: 4.4265, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 6.25
loss: 0.01370533, learning_rate: 1.872119815668203e-05, global_s

[2024-03-04 14:33:26,328] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:33:26,330] [    INFO] -   Num examples = 2000
[2024-03-04 14:33:26,333] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:33:26,335] [    INFO] -   Total Batch size = 16
[2024-03-04 14:33:26,337] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.6000356674194336, eval_accuracy: 0.753, eval_f1: 0.7172216627529965, eval_runtime: 20.9477, eval_samples_per_second: 95.476, eval_steps_per_second: 5.967, epoch: 6.3364


[2024-03-04 14:33:47,281] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-11000
[2024-03-04 14:33:51,360] [    INFO] - Deleting older checkpoint [work/output/checkpoint-9200] due to args.save_total_limit


loss: 0.11456225, learning_rate: 1.8289170506912443e-05, global_step: 11010, interval_runtime: 29.5914, interval_samples_per_second: 0.541, interval_steps_per_second: 0.338, epoch: 6.3422
loss: 0.01107028, learning_rate: 1.8260368663594472e-05, global_step: 11020, interval_runtime: 4.4457, interval_samples_per_second: 3.599, interval_steps_per_second: 2.249, epoch: 6.3479
loss: 0.11854219, learning_rate: 1.8231566820276498e-05, global_step: 11030, interval_runtime: 4.4361, interval_samples_per_second: 3.607, interval_steps_per_second: 2.254, epoch: 6.3537
loss: 0.00468061, learning_rate: 1.8202764976958524e-05, global_step: 11040, interval_runtime: 4.4283, interval_samples_per_second: 3.613, interval_steps_per_second: 2.258, epoch: 6.3594
loss: 0.04697812, learning_rate: 1.8173963133640553e-05, global_step: 11050, interval_runtime: 4.4322, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 6.3652
loss: 0.14498675, learning_rate: 1.8145161290322583e-05, glo

[2024-03-04 14:35:20,072] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:35:20,075] [    INFO] -   Num examples = 2000
[2024-03-04 14:35:20,077] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:35:20,079] [    INFO] -   Total Batch size = 16
[2024-03-04 14:35:20,082] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.565563678741455, eval_accuracy: 0.7505, eval_f1: 0.7125073033552126, eval_runtime: 20.8699, eval_samples_per_second: 95.832, eval_steps_per_second: 5.989, epoch: 6.4516


[2024-03-04 14:35:40,947] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-11200
[2024-03-04 14:35:44,783] [    INFO] - Deleting older checkpoint [work/output/checkpoint-9400] due to args.save_total_limit


loss: 0.11476383, learning_rate: 1.7713133640552995e-05, global_step: 11210, interval_runtime: 29.2401, interval_samples_per_second: 0.547, interval_steps_per_second: 0.342, epoch: 6.4574
loss: 0.07137604, learning_rate: 1.7684331797235024e-05, global_step: 11220, interval_runtime: 4.4352, interval_samples_per_second: 3.607, interval_steps_per_second: 2.255, epoch: 6.4631
loss: 0.06899961, learning_rate: 1.7655529953917053e-05, global_step: 11230, interval_runtime: 4.4363, interval_samples_per_second: 3.607, interval_steps_per_second: 2.254, epoch: 6.4689
loss: 0.13609204, learning_rate: 1.762672811059908e-05, global_step: 11240, interval_runtime: 4.423, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 6.4747
loss: 0.08431775, learning_rate: 1.7597926267281105e-05, global_step: 11250, interval_runtime: 4.4196, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 6.4804
loss: 0.04237277, learning_rate: 1.7569124423963134e-05, globa

[2024-03-04 14:37:13,436] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:37:13,439] [    INFO] -   Num examples = 2000
[2024-03-04 14:37:13,442] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:37:13,448] [    INFO] -   Total Batch size = 16
[2024-03-04 14:37:13,451] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.605291724205017, eval_accuracy: 0.7495, eval_f1: 0.715395030341443, eval_runtime: 20.9083, eval_samples_per_second: 95.656, eval_steps_per_second: 5.978, epoch: 6.5668


[2024-03-04 14:37:34,352] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-11400
[2024-03-04 14:37:38,683] [    INFO] - Deleting older checkpoint [work/output/checkpoint-9600] due to args.save_total_limit


loss: 0.08004939, learning_rate: 1.7137096774193546e-05, global_step: 11410, interval_runtime: 29.7779, interval_samples_per_second: 0.537, interval_steps_per_second: 0.336, epoch: 6.5726
loss: 0.06493801, learning_rate: 1.7108294930875576e-05, global_step: 11420, interval_runtime: 4.4194, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 6.5783
loss: 0.08475181, learning_rate: 1.7079493087557605e-05, global_step: 11430, interval_runtime: 4.4325, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 6.5841
loss: 0.07673042, learning_rate: 1.7050691244239634e-05, global_step: 11440, interval_runtime: 4.4511, interval_samples_per_second: 3.595, interval_steps_per_second: 2.247, epoch: 6.5899
loss: 0.10514381, learning_rate: 1.702188940092166e-05, global_step: 11450, interval_runtime: 4.4439, interval_samples_per_second: 3.6, interval_steps_per_second: 2.25, epoch: 6.5956
loss: 0.06628569, learning_rate: 1.6993087557603686e-05, global_s

[2024-03-04 14:39:07,590] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:39:07,593] [    INFO] -   Num examples = 2000
[2024-03-04 14:39:07,595] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:39:07,597] [    INFO] -   Total Batch size = 16
[2024-03-04 14:39:07,600] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.57943594455719, eval_accuracy: 0.7605, eval_f1: 0.7273816615222817, eval_runtime: 20.9495, eval_samples_per_second: 95.468, eval_steps_per_second: 5.967, epoch: 6.682


[2024-03-04 14:39:28,545] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-11600
[2024-03-04 14:39:32,357] [    INFO] - Deleting older checkpoint [work/output/checkpoint-9800] due to args.save_total_limit


loss: 0.07399665, learning_rate: 1.6561059907834105e-05, global_step: 11610, interval_runtime: 29.3453, interval_samples_per_second: 0.545, interval_steps_per_second: 0.341, epoch: 6.6878
loss: 0.06962731, learning_rate: 1.653225806451613e-05, global_step: 11620, interval_runtime: 4.4316, interval_samples_per_second: 3.61, interval_steps_per_second: 2.257, epoch: 6.6935
loss: 0.17381606, learning_rate: 1.6503456221198157e-05, global_step: 11630, interval_runtime: 4.4271, interval_samples_per_second: 3.614, interval_steps_per_second: 2.259, epoch: 6.6993
loss: 0.00260564, learning_rate: 1.6474654377880186e-05, global_step: 11640, interval_runtime: 4.4417, interval_samples_per_second: 3.602, interval_steps_per_second: 2.251, epoch: 6.7051
loss: 0.11519141, learning_rate: 1.6445852534562212e-05, global_step: 11650, interval_runtime: 4.4241, interval_samples_per_second: 3.617, interval_steps_per_second: 2.26, epoch: 6.7108
loss: 0.17413645, learning_rate: 1.641705069124424e-05, global

[2024-03-04 14:41:00,921] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:41:00,929] [    INFO] -   Num examples = 2000
[2024-03-04 14:41:00,943] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:41:00,967] [    INFO] -   Total Batch size = 16
[2024-03-04 14:41:00,983] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.5709463357925415, eval_accuracy: 0.7555, eval_f1: 0.7234648932587794, eval_runtime: 21.347, eval_samples_per_second: 93.69, eval_steps_per_second: 5.856, epoch: 6.7972


[2024-03-04 14:41:22,274] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-11800
[2024-03-04 14:41:26,104] [    INFO] - Deleting older checkpoint [work/output/checkpoint-10000] due to args.save_total_limit


loss: 0.03842135, learning_rate: 1.5985023041474656e-05, global_step: 11810, interval_runtime: 29.7076, interval_samples_per_second: 0.539, interval_steps_per_second: 0.337, epoch: 6.803
loss: 0.07701749, learning_rate: 1.5956221198156682e-05, global_step: 11820, interval_runtime: 4.4113, interval_samples_per_second: 3.627, interval_steps_per_second: 2.267, epoch: 6.8088
loss: 0.22885118, learning_rate: 1.592741935483871e-05, global_step: 11830, interval_runtime: 4.4491, interval_samples_per_second: 3.596, interval_steps_per_second: 2.248, epoch: 6.8145
loss: 0.04113282, learning_rate: 1.5898617511520737e-05, global_step: 11840, interval_runtime: 4.4308, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 6.8203
loss: 0.0539931, learning_rate: 1.5869815668202767e-05, global_step: 11850, interval_runtime: 4.4438, interval_samples_per_second: 3.6, interval_steps_per_second: 2.25, epoch: 6.826
loss: 0.05726588, learning_rate: 1.5841013824884793e-05, global_st

[2024-03-04 14:42:54,830] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:42:54,835] [    INFO] -   Num examples = 2000
[2024-03-04 14:42:54,839] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:42:54,841] [    INFO] -   Total Batch size = 16
[2024-03-04 14:42:54,845] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.5844402313232422, eval_accuracy: 0.751, eval_f1: 0.7186027126130482, eval_runtime: 21.1168, eval_samples_per_second: 94.711, eval_steps_per_second: 5.919, epoch: 6.9124


[2024-03-04 14:43:15,954] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-12000
[2024-03-04 14:43:19,854] [    INFO] - Deleting older checkpoint [work/output/checkpoint-10200] due to args.save_total_limit


loss: 0.15287248, learning_rate: 1.5408986175115208e-05, global_step: 12010, interval_runtime: 29.5691, interval_samples_per_second: 0.541, interval_steps_per_second: 0.338, epoch: 6.9182
loss: 0.16372072, learning_rate: 1.5380184331797234e-05, global_step: 12020, interval_runtime: 4.4378, interval_samples_per_second: 3.605, interval_steps_per_second: 2.253, epoch: 6.924
loss: 0.11628516, learning_rate: 1.5351382488479263e-05, global_step: 12030, interval_runtime: 4.4184, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 6.9297
loss: 0.027183, learning_rate: 1.5322580645161292e-05, global_step: 12040, interval_runtime: 4.4241, interval_samples_per_second: 3.617, interval_steps_per_second: 2.26, epoch: 6.9355
loss: 0.04796409, learning_rate: 1.529377880184332e-05, global_step: 12050, interval_runtime: 4.4325, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 6.9412
loss: 0.08261654, learning_rate: 1.5264976958525344e-05, global_s

[2024-03-04 14:44:48,512] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:44:48,518] [    INFO] -   Num examples = 2000
[2024-03-04 14:44:48,521] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:44:48,524] [    INFO] -   Total Batch size = 16
[2024-03-04 14:44:48,528] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.7088056802749634, eval_accuracy: 0.752, eval_f1: 0.7181166846764343, eval_runtime: 20.9716, eval_samples_per_second: 95.367, eval_steps_per_second: 5.96, epoch: 7.0276


[2024-03-04 14:45:09,492] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-12200
[2024-03-04 14:45:13,586] [    INFO] - Deleting older checkpoint [work/output/checkpoint-10400] due to args.save_total_limit


loss: 0.05014001, learning_rate: 1.4832949308755761e-05, global_step: 12210, interval_runtime: 29.6256, interval_samples_per_second: 0.54, interval_steps_per_second: 0.338, epoch: 7.0334
loss: 0.04505325, learning_rate: 1.480414746543779e-05, global_step: 12220, interval_runtime: 4.4385, interval_samples_per_second: 3.605, interval_steps_per_second: 2.253, epoch: 7.0392
loss: 0.11993586, learning_rate: 1.4775345622119815e-05, global_step: 12230, interval_runtime: 4.4219, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 7.0449
loss: 0.10074357, learning_rate: 1.4746543778801844e-05, global_step: 12240, interval_runtime: 4.4295, interval_samples_per_second: 3.612, interval_steps_per_second: 2.258, epoch: 7.0507
loss: 0.09928536, learning_rate: 1.4717741935483872e-05, global_step: 12250, interval_runtime: 4.4353, interval_samples_per_second: 3.607, interval_steps_per_second: 2.255, epoch: 7.0565
loss: 0.11934873, learning_rate: 1.4688940092165898e-05, glob

[2024-03-04 14:46:42,280] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:46:42,282] [    INFO] -   Num examples = 2000
[2024-03-04 14:46:42,284] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:46:42,286] [    INFO] -   Total Batch size = 16
[2024-03-04 14:46:42,288] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.664333701133728, eval_accuracy: 0.7565, eval_f1: 0.719776047872045, eval_runtime: 20.9232, eval_samples_per_second: 95.588, eval_steps_per_second: 5.974, epoch: 7.1429


[2024-03-04 14:47:03,208] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-12400
[2024-03-04 14:47:06,969] [    INFO] - Deleting older checkpoint [work/output/checkpoint-10600] due to args.save_total_limit


loss: 0.03757934, learning_rate: 1.4256912442396315e-05, global_step: 12410, interval_runtime: 29.2676, interval_samples_per_second: 0.547, interval_steps_per_second: 0.342, epoch: 7.1486
loss: 0.11315877, learning_rate: 1.4228110599078342e-05, global_step: 12420, interval_runtime: 4.472, interval_samples_per_second: 3.578, interval_steps_per_second: 2.236, epoch: 7.1544
loss: 0.01801212, learning_rate: 1.4199308755760368e-05, global_step: 12430, interval_runtime: 4.425, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 7.1601
loss: 0.03819338, learning_rate: 1.4170506912442397e-05, global_step: 12440, interval_runtime: 4.4253, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 7.1659
loss: 0.08798847, learning_rate: 1.4141705069124425e-05, global_step: 12450, interval_runtime: 4.4292, interval_samples_per_second: 3.612, interval_steps_per_second: 2.258, epoch: 7.1717
loss: 0.11575915, learning_rate: 1.4112903225806454e-05, global

[2024-03-04 14:48:35,907] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:48:35,909] [    INFO] -   Num examples = 2000
[2024-03-04 14:48:35,912] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:48:35,914] [    INFO] -   Total Batch size = 16
[2024-03-04 14:48:35,916] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.6699740886688232, eval_accuracy: 0.76, eval_f1: 0.7261053502040872, eval_runtime: 20.9664, eval_samples_per_second: 95.391, eval_steps_per_second: 5.962, epoch: 7.2581


[2024-03-04 14:48:56,879] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-12600
[2024-03-04 14:49:00,895] [    INFO] - Deleting older checkpoint [work/output/checkpoint-10800] due to args.save_total_limit


loss: 0.05529826, learning_rate: 1.3680875576036866e-05, global_step: 12610, interval_runtime: 29.5254, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 7.2638
loss: 0.04581306, learning_rate: 1.3652073732718896e-05, global_step: 12620, interval_runtime: 4.4697, interval_samples_per_second: 3.58, interval_steps_per_second: 2.237, epoch: 7.2696
loss: 0.06175195, learning_rate: 1.3623271889400923e-05, global_step: 12630, interval_runtime: 4.431, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 7.2753
loss: 0.08497742, learning_rate: 1.3594470046082949e-05, global_step: 12640, interval_runtime: 4.4134, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 7.2811
loss: 0.06648079, learning_rate: 1.3565668202764978e-05, global_step: 12650, interval_runtime: 4.4178, interval_samples_per_second: 3.622, interval_steps_per_second: 2.264, epoch: 7.2869
loss: 0.00142958, learning_rate: 1.3536866359447006e-05, glob

[2024-03-04 14:50:29,864] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:50:29,866] [    INFO] -   Num examples = 2000
[2024-03-04 14:50:29,869] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:50:29,871] [    INFO] -   Total Batch size = 16
[2024-03-04 14:50:29,873] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.7245545387268066, eval_accuracy: 0.749, eval_f1: 0.712454502560003, eval_runtime: 21.004, eval_samples_per_second: 95.22, eval_steps_per_second: 5.951, epoch: 7.3733


[2024-03-04 14:50:50,878] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-12800
[2024-03-04 14:50:54,675] [    INFO] - Deleting older checkpoint [work/output/checkpoint-11000] due to args.save_total_limit


loss: 0.09388167, learning_rate: 1.310483870967742e-05, global_step: 12810, interval_runtime: 29.3475, interval_samples_per_second: 0.545, interval_steps_per_second: 0.341, epoch: 7.379
loss: 0.01203396, learning_rate: 1.3076036866359447e-05, global_step: 12820, interval_runtime: 4.4348, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 7.3848
loss: 0.08352381, learning_rate: 1.3047235023041477e-05, global_step: 12830, interval_runtime: 4.4197, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 7.3906
loss: 0.0084763, learning_rate: 1.3018433179723502e-05, global_step: 12840, interval_runtime: 4.4278, interval_samples_per_second: 3.614, interval_steps_per_second: 2.258, epoch: 7.3963
loss: 0.07556951, learning_rate: 1.298963133640553e-05, global_step: 12850, interval_runtime: 4.4248, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 7.4021
loss: 0.12935621, learning_rate: 1.296082949308756e-05, global_st

[2024-03-04 14:52:23,464] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:52:23,468] [    INFO] -   Num examples = 2000
[2024-03-04 14:52:23,473] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:52:23,475] [    INFO] -   Total Batch size = 16
[2024-03-04 14:52:23,477] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.6512255668640137, eval_accuracy: 0.756, eval_f1: 0.7221707812596797, eval_runtime: 21.2253, eval_samples_per_second: 94.227, eval_steps_per_second: 5.889, epoch: 7.4885


[2024-03-04 14:52:44,696] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-13000
[2024-03-04 14:52:48,885] [    INFO] - Deleting older checkpoint [work/output/checkpoint-11200] due to args.save_total_limit


loss: 0.12638092, learning_rate: 1.2528801843317971e-05, global_step: 13010, interval_runtime: 30.0553, interval_samples_per_second: 0.532, interval_steps_per_second: 0.333, epoch: 7.4942
loss: 0.21494141, learning_rate: 1.25e-05, global_step: 13020, interval_runtime: 4.4536, interval_samples_per_second: 3.593, interval_steps_per_second: 2.245, epoch: 7.5
loss: 0.05606225, learning_rate: 1.2471198156682028e-05, global_step: 13030, interval_runtime: 4.4249, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 7.5058
loss: 0.05226091, learning_rate: 1.2442396313364056e-05, global_step: 13040, interval_runtime: 4.4303, interval_samples_per_second: 3.612, interval_steps_per_second: 2.257, epoch: 7.5115
loss: 0.07247798, learning_rate: 1.2413594470046083e-05, global_step: 13050, interval_runtime: 4.4331, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 7.5173
loss: 0.00475624, learning_rate: 1.2384792626728111e-05, global_step: 13060, 

[2024-03-04 14:54:17,624] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:54:17,628] [    INFO] -   Num examples = 2000
[2024-03-04 14:54:17,631] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:54:17,634] [    INFO] -   Total Batch size = 16
[2024-03-04 14:54:17,637] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.6826635599136353, eval_accuracy: 0.748, eval_f1: 0.712096309536733, eval_runtime: 21.056, eval_samples_per_second: 94.985, eval_steps_per_second: 5.937, epoch: 7.6037


[2024-03-04 14:54:38,686] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-13200
[2024-03-04 14:54:42,869] [    INFO] - Deleting older checkpoint [work/output/checkpoint-11400] due to args.save_total_limit


loss: 0.10403789, learning_rate: 1.1952764976958526e-05, global_step: 13210, interval_runtime: 29.7865, interval_samples_per_second: 0.537, interval_steps_per_second: 0.336, epoch: 7.6094
loss: 0.03976419, learning_rate: 1.1923963133640554e-05, global_step: 13220, interval_runtime: 4.4107, interval_samples_per_second: 3.628, interval_steps_per_second: 2.267, epoch: 7.6152
loss: 0.03838813, learning_rate: 1.1895161290322582e-05, global_step: 13230, interval_runtime: 4.4452, interval_samples_per_second: 3.599, interval_steps_per_second: 2.25, epoch: 7.621
loss: 0.00680158, learning_rate: 1.1866359447004609e-05, global_step: 13240, interval_runtime: 4.4238, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 7.6267
loss: 0.06909469, learning_rate: 1.1837557603686637e-05, global_step: 13250, interval_runtime: 4.4371, interval_samples_per_second: 3.606, interval_steps_per_second: 2.254, epoch: 7.6325
loss: 0.06300283, learning_rate: 1.1808755760368664e-05, glob

[2024-03-04 14:56:11,665] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:56:11,669] [    INFO] -   Num examples = 2000
[2024-03-04 14:56:11,672] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:56:11,674] [    INFO] -   Total Batch size = 16
[2024-03-04 14:56:11,677] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.6773031949996948, eval_accuracy: 0.7525, eval_f1: 0.7204742984842504, eval_runtime: 21.0172, eval_samples_per_second: 95.16, eval_steps_per_second: 5.948, epoch: 7.7189


[2024-03-04 14:56:32,693] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-13400
[2024-03-04 14:56:36,400] [    INFO] - Deleting older checkpoint [work/output/checkpoint-11600] due to args.save_total_limit


loss: 0.03769239, learning_rate: 1.137672811059908e-05, global_step: 13410, interval_runtime: 29.2691, interval_samples_per_second: 0.547, interval_steps_per_second: 0.342, epoch: 7.7247
loss: 0.00819317, learning_rate: 1.1347926267281106e-05, global_step: 13420, interval_runtime: 4.4331, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 7.7304
loss: 0.12944831, learning_rate: 1.1319124423963135e-05, global_step: 13430, interval_runtime: 4.4341, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 7.7362
loss: 0.0040658, learning_rate: 1.129032258064516e-05, global_step: 13440, interval_runtime: 4.422, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 7.7419
loss: 0.09064798, learning_rate: 1.126152073732719e-05, global_step: 13450, interval_runtime: 4.423, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 7.7477
loss: 0.08926095, learning_rate: 1.1232718894009218e-05, global_s

[2024-03-04 14:58:05,136] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:58:05,139] [    INFO] -   Num examples = 2000
[2024-03-04 14:58:05,141] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:58:05,144] [    INFO] -   Total Batch size = 16
[2024-03-04 14:58:05,146] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.727809190750122, eval_accuracy: 0.7505, eval_f1: 0.7183432893927483, eval_runtime: 20.8859, eval_samples_per_second: 95.759, eval_steps_per_second: 5.985, epoch: 7.8341


[2024-03-04 14:58:26,029] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-13600
[2024-03-04 14:58:30,230] [    INFO] - Deleting older checkpoint [work/output/checkpoint-11800] due to args.save_total_limit


loss: 0.01639923, learning_rate: 1.0800691244239631e-05, global_step: 13610, interval_runtime: 29.6181, interval_samples_per_second: 0.54, interval_steps_per_second: 0.338, epoch: 7.8399
loss: 0.06225177, learning_rate: 1.077188940092166e-05, global_step: 13620, interval_runtime: 4.4184, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 7.8456
loss: 0.04954706, learning_rate: 1.0743087557603686e-05, global_step: 13630, interval_runtime: 4.443, interval_samples_per_second: 3.601, interval_steps_per_second: 2.251, epoch: 7.8514
loss: 0.05428475, learning_rate: 1.0714285714285714e-05, global_step: 13640, interval_runtime: 4.4278, interval_samples_per_second: 3.614, interval_steps_per_second: 2.258, epoch: 7.8571
loss: 0.06755075, learning_rate: 1.0685483870967743e-05, global_step: 13650, interval_runtime: 4.4138, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 7.8629
loss: 0.12932518, learning_rate: 1.065668202764977e-05, global

[2024-03-04 14:59:58,928] [    INFO] - ***** Running Evaluation *****
[2024-03-04 14:59:58,932] [    INFO] -   Num examples = 2000
[2024-03-04 14:59:58,936] [    INFO] -   Pre device batch size = 16
[2024-03-04 14:59:58,940] [    INFO] -   Total Batch size = 16
[2024-03-04 14:59:58,948] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.7387136220932007, eval_accuracy: 0.751, eval_f1: 0.7187547779388971, eval_runtime: 21.0864, eval_samples_per_second: 94.848, eval_steps_per_second: 5.928, epoch: 7.9493


[2024-03-04 15:00:20,036] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-13800
[2024-03-04 15:00:23,895] [    INFO] - Deleting older checkpoint [work/output/checkpoint-12000] due to args.save_total_limit


loss: 0.13850396, learning_rate: 1.0224654377880185e-05, global_step: 13810, interval_runtime: 29.4861, interval_samples_per_second: 0.543, interval_steps_per_second: 0.339, epoch: 7.9551
loss: 0.02328836, learning_rate: 1.0195852534562212e-05, global_step: 13820, interval_runtime: 4.4144, interval_samples_per_second: 3.625, interval_steps_per_second: 2.265, epoch: 7.9608
loss: 0.12632141, learning_rate: 1.016705069124424e-05, global_step: 13830, interval_runtime: 4.4151, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 7.9666
loss: 0.07777366, learning_rate: 1.0138248847926269e-05, global_step: 13840, interval_runtime: 4.4143, interval_samples_per_second: 3.625, interval_steps_per_second: 2.265, epoch: 7.9724
loss: 0.15155225, learning_rate: 1.0109447004608295e-05, global_step: 13850, interval_runtime: 4.4335, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 7.9781
loss: 0.16737278, learning_rate: 1.0080645161290323e-05, glo

[2024-03-04 15:01:52,557] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:01:52,560] [    INFO] -   Num examples = 2000
[2024-03-04 15:01:52,564] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:01:52,567] [    INFO] -   Total Batch size = 16
[2024-03-04 15:01:52,572] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.776201605796814, eval_accuracy: 0.7565, eval_f1: 0.7259585731931238, eval_runtime: 21.0321, eval_samples_per_second: 95.093, eval_steps_per_second: 5.943, epoch: 8.0645


[2024-03-04 15:02:13,602] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-14000
[2024-03-04 15:02:17,475] [    INFO] - Deleting older checkpoint [work/output/checkpoint-12200] due to args.save_total_limit


loss: 0.01305067, learning_rate: 9.648617511520738e-06, global_step: 14010, interval_runtime: 29.4289, interval_samples_per_second: 0.544, interval_steps_per_second: 0.34, epoch: 8.0703
loss: 0.00073375, learning_rate: 9.619815668202766e-06, global_step: 14020, interval_runtime: 4.4144, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 8.076
loss: 0.00046537, learning_rate: 9.591013824884793e-06, global_step: 14030, interval_runtime: 4.4157, interval_samples_per_second: 3.623, interval_steps_per_second: 2.265, epoch: 8.0818
loss: 0.15131826, learning_rate: 9.56221198156682e-06, global_step: 14040, interval_runtime: 4.4248, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 8.0876
loss: 0.0003939, learning_rate: 9.533410138248848e-06, global_step: 14050, interval_runtime: 4.4298, interval_samples_per_second: 3.612, interval_steps_per_second: 2.257, epoch: 8.0933
loss: 0.00241157, learning_rate: 9.504608294930876e-06, global_step: 

[2024-03-04 15:03:46,089] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:03:46,092] [    INFO] -   Num examples = 2000
[2024-03-04 15:03:46,095] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:03:46,097] [    INFO] -   Total Batch size = 16
[2024-03-04 15:03:46,099] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8574953079223633, eval_accuracy: 0.752, eval_f1: 0.7178012501933028, eval_runtime: 21.0347, eval_samples_per_second: 95.081, eval_steps_per_second: 5.943, epoch: 8.1797


[2024-03-04 15:04:07,130] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-14200
[2024-03-04 15:04:11,207] [    INFO] - Deleting older checkpoint [work/output/checkpoint-12400] due to args.save_total_limit


loss: 0.0244807, learning_rate: 9.072580645161291e-06, global_step: 14210, interval_runtime: 29.6703, interval_samples_per_second: 0.539, interval_steps_per_second: 0.337, epoch: 8.1855
loss: 0.00278344, learning_rate: 9.043778801843319e-06, global_step: 14220, interval_runtime: 4.4244, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 8.1912
loss: 0.00157202, learning_rate: 9.014976958525346e-06, global_step: 14230, interval_runtime: 4.4449, interval_samples_per_second: 3.6, interval_steps_per_second: 2.25, epoch: 8.197
loss: 0.00091637, learning_rate: 8.986175115207374e-06, global_step: 14240, interval_runtime: 4.4206, interval_samples_per_second: 3.619, interval_steps_per_second: 2.262, epoch: 8.2028
loss: 0.04587781, learning_rate: 8.9573732718894e-06, global_step: 14250, interval_runtime: 4.4265, interval_samples_per_second: 3.615, interval_steps_per_second: 2.259, epoch: 8.2085
loss: 0.08989353, learning_rate: 8.92857142857143e-06, global_step: 1426

[2024-03-04 15:05:40,124] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:05:40,129] [    INFO] -   Num examples = 2000
[2024-03-04 15:05:40,132] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:05:40,135] [    INFO] -   Total Batch size = 16
[2024-03-04 15:05:40,137] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.849353551864624, eval_accuracy: 0.7505, eval_f1: 0.7130468407311404, eval_runtime: 21.1223, eval_samples_per_second: 94.687, eval_steps_per_second: 5.918, epoch: 8.2949


[2024-03-04 15:06:01,252] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-14400
[2024-03-04 15:06:05,328] [    INFO] - Deleting older checkpoint [work/output/checkpoint-12600] due to args.save_total_limit


loss: 0.04338367, learning_rate: 8.496543778801843e-06, global_step: 14410, interval_runtime: 29.728, interval_samples_per_second: 0.538, interval_steps_per_second: 0.336, epoch: 8.3007
loss: 0.01421435, learning_rate: 8.46774193548387e-06, global_step: 14420, interval_runtime: 4.4299, interval_samples_per_second: 3.612, interval_steps_per_second: 2.257, epoch: 8.3065
loss: 0.07181915, learning_rate: 8.4389400921659e-06, global_step: 14430, interval_runtime: 4.4167, interval_samples_per_second: 3.623, interval_steps_per_second: 2.264, epoch: 8.3122
loss: 0.04183153, learning_rate: 8.410138248847926e-06, global_step: 14440, interval_runtime: 4.4278, interval_samples_per_second: 3.614, interval_steps_per_second: 2.258, epoch: 8.318
loss: 0.00140253, learning_rate: 8.381336405529955e-06, global_step: 14450, interval_runtime: 4.4327, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 8.3237
loss: 0.07069847, learning_rate: 8.352534562211983e-06, global_step: 1

[2024-03-04 15:07:33,782] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:07:33,784] [    INFO] -   Num examples = 2000
[2024-03-04 15:07:33,787] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:07:33,789] [    INFO] -   Total Batch size = 16
[2024-03-04 15:07:33,791] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8360484838485718, eval_accuracy: 0.747, eval_f1: 0.7130981355761857, eval_runtime: 20.9224, eval_samples_per_second: 95.591, eval_steps_per_second: 5.974, epoch: 8.4101


[2024-03-04 15:07:54,711] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-14600
[2024-03-04 15:07:58,866] [    INFO] - Deleting older checkpoint [work/output/checkpoint-12800] due to args.save_total_limit


loss: 0.06055274, learning_rate: 7.920506912442396e-06, global_step: 14610, interval_runtime: 29.624, interval_samples_per_second: 0.54, interval_steps_per_second: 0.338, epoch: 8.4159
loss: 0.0119016, learning_rate: 7.891705069124426e-06, global_step: 14620, interval_runtime: 4.4098, interval_samples_per_second: 3.628, interval_steps_per_second: 2.268, epoch: 8.4217
loss: 0.02800602, learning_rate: 7.862903225806451e-06, global_step: 14630, interval_runtime: 4.4193, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 8.4274
loss: 0.01140745, learning_rate: 7.834101382488479e-06, global_step: 14640, interval_runtime: 4.4528, interval_samples_per_second: 3.593, interval_steps_per_second: 2.246, epoch: 8.4332
loss: 0.04333201, learning_rate: 7.805299539170508e-06, global_step: 14650, interval_runtime: 4.4105, interval_samples_per_second: 3.628, interval_steps_per_second: 2.267, epoch: 8.4389
loss: 0.00146749, learning_rate: 7.776497695852534e-06, global_step:

[2024-03-04 15:09:27,530] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:09:27,534] [    INFO] -   Num examples = 2000
[2024-03-04 15:09:27,537] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:09:27,539] [    INFO] -   Total Batch size = 16
[2024-03-04 15:09:27,545] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8700973987579346, eval_accuracy: 0.7555, eval_f1: 0.7215406324074918, eval_runtime: 20.9003, eval_samples_per_second: 95.693, eval_steps_per_second: 5.981, epoch: 8.5253


[2024-03-04 15:09:48,436] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-14800
[2024-03-04 15:09:52,396] [    INFO] - Deleting older checkpoint [work/output/checkpoint-13000] due to args.save_total_limit


loss: 0.03106871, learning_rate: 7.344470046082949e-06, global_step: 14810, interval_runtime: 29.4081, interval_samples_per_second: 0.544, interval_steps_per_second: 0.34, epoch: 8.5311
loss: 0.03284041, learning_rate: 7.315668202764977e-06, global_step: 14820, interval_runtime: 4.4571, interval_samples_per_second: 3.59, interval_steps_per_second: 2.244, epoch: 8.5369
loss: 0.0443229, learning_rate: 7.286866359447005e-06, global_step: 14830, interval_runtime: 4.4434, interval_samples_per_second: 3.601, interval_steps_per_second: 2.251, epoch: 8.5426
loss: 0.04792559, learning_rate: 7.258064516129033e-06, global_step: 14840, interval_runtime: 4.4466, interval_samples_per_second: 3.598, interval_steps_per_second: 2.249, epoch: 8.5484
loss: 0.01471901, learning_rate: 7.22926267281106e-06, global_step: 14850, interval_runtime: 4.438, interval_samples_per_second: 3.605, interval_steps_per_second: 2.253, epoch: 8.5541
loss: 0.05740531, learning_rate: 7.2004608294930876e-06, global_step:

[2024-03-04 15:11:21,172] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:11:21,175] [    INFO] -   Num examples = 2000
[2024-03-04 15:11:21,177] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:11:21,179] [    INFO] -   Total Batch size = 16
[2024-03-04 15:11:21,181] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.761148452758789, eval_accuracy: 0.7515, eval_f1: 0.7154229802672805, eval_runtime: 20.932, eval_samples_per_second: 95.547, eval_steps_per_second: 5.972, epoch: 8.6406


[2024-03-04 15:11:42,124] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-15000
[2024-03-04 15:11:47,855] [    INFO] - Deleting older checkpoint [work/output/checkpoint-13200] due to args.save_total_limit


loss: 0.03439203, learning_rate: 6.768433179723503e-06, global_step: 15010, interval_runtime: 31.2466, interval_samples_per_second: 0.512, interval_steps_per_second: 0.32, epoch: 8.6463
loss: 0.02446317, learning_rate: 6.7396313364055306e-06, global_step: 15020, interval_runtime: 4.4333, interval_samples_per_second: 3.609, interval_steps_per_second: 2.256, epoch: 8.6521
loss: 0.12390158, learning_rate: 6.710829493087557e-06, global_step: 15030, interval_runtime: 4.4131, interval_samples_per_second: 3.626, interval_steps_per_second: 2.266, epoch: 8.6578
loss: 0.00494118, learning_rate: 6.682027649769586e-06, global_step: 15040, interval_runtime: 4.4085, interval_samples_per_second: 3.629, interval_steps_per_second: 2.268, epoch: 8.6636
loss: 0.00072384, learning_rate: 6.6532258064516125e-06, global_step: 15050, interval_runtime: 4.4071, interval_samples_per_second: 3.631, interval_steps_per_second: 2.269, epoch: 8.6694
loss: 0.06571988, learning_rate: 6.624423963133642e-06, global_

[2024-03-04 15:13:16,245] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:13:16,247] [    INFO] -   Num examples = 2000
[2024-03-04 15:13:16,250] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:13:16,252] [    INFO] -   Total Batch size = 16
[2024-03-04 15:13:16,255] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.7651604413986206, eval_accuracy: 0.755, eval_f1: 0.7226850244764393, eval_runtime: 20.9308, eval_samples_per_second: 95.553, eval_steps_per_second: 5.972, epoch: 8.7558


[2024-03-04 15:13:37,181] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-15200
[2024-03-04 15:13:41,044] [    INFO] - Deleting older checkpoint [work/output/checkpoint-13400] due to args.save_total_limit


loss: 0.00280881, learning_rate: 6.1923963133640555e-06, global_step: 15210, interval_runtime: 29.3313, interval_samples_per_second: 0.545, interval_steps_per_second: 0.341, epoch: 8.7615
loss: 0.00338304, learning_rate: 6.163594470046083e-06, global_step: 15220, interval_runtime: 4.4416, interval_samples_per_second: 3.602, interval_steps_per_second: 2.251, epoch: 8.7673
loss: 0.00063156, learning_rate: 6.134792626728111e-06, global_step: 15230, interval_runtime: 4.4466, interval_samples_per_second: 3.598, interval_steps_per_second: 2.249, epoch: 8.773
loss: 0.00084587, learning_rate: 6.105990783410138e-06, global_step: 15240, interval_runtime: 4.4473, interval_samples_per_second: 3.598, interval_steps_per_second: 2.249, epoch: 8.7788
loss: 0.00508065, learning_rate: 6.077188940092167e-06, global_step: 15250, interval_runtime: 4.4507, interval_samples_per_second: 3.595, interval_steps_per_second: 2.247, epoch: 8.7846
loss: 0.05755515, learning_rate: 6.048387096774194e-06, global_s

[2024-03-04 15:15:10,116] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:15:10,134] [    INFO] -   Num examples = 2000
[2024-03-04 15:15:10,144] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:15:10,146] [    INFO] -   Total Batch size = 16
[2024-03-04 15:15:10,148] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8003169298171997, eval_accuracy: 0.7565, eval_f1: 0.7197224777892663, eval_runtime: 21.0571, eval_samples_per_second: 94.98, eval_steps_per_second: 5.936, epoch: 8.871


[2024-03-04 15:15:31,179] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-15400
[2024-03-04 15:15:35,085] [    INFO] - Deleting older checkpoint [work/output/checkpoint-13600] due to args.save_total_limit


loss: 0.0304238, learning_rate: 5.616359447004609e-06, global_step: 15410, interval_runtime: 29.5641, interval_samples_per_second: 0.541, interval_steps_per_second: 0.338, epoch: 8.8767
loss: 0.06642622, learning_rate: 5.587557603686636e-06, global_step: 15420, interval_runtime: 4.4378, interval_samples_per_second: 3.605, interval_steps_per_second: 2.253, epoch: 8.8825
loss: 0.04301959, learning_rate: 5.558755760368664e-06, global_step: 15430, interval_runtime: 4.4581, interval_samples_per_second: 3.589, interval_steps_per_second: 2.243, epoch: 8.8882
loss: 0.03451086, learning_rate: 5.5299539170506915e-06, global_step: 15440, interval_runtime: 4.414, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 8.894
loss: 0.01466711, learning_rate: 5.501152073732719e-06, global_step: 15450, interval_runtime: 4.4382, interval_samples_per_second: 3.605, interval_steps_per_second: 2.253, epoch: 8.8998
loss: 0.0462088, learning_rate: 5.472350230414747e-06, global_step

[2024-03-04 15:17:03,630] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:17:03,633] [    INFO] -   Num examples = 2000
[2024-03-04 15:17:03,639] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:17:03,646] [    INFO] -   Total Batch size = 16
[2024-03-04 15:17:03,654] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.7787641286849976, eval_accuracy: 0.753, eval_f1: 0.7191711565470751, eval_runtime: 21.06, eval_samples_per_second: 94.967, eval_steps_per_second: 5.935, epoch: 8.9862


[2024-03-04 15:17:24,739] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-15600
[2024-03-04 15:17:28,855] [    INFO] - Deleting older checkpoint [work/output/checkpoint-13800] due to args.save_total_limit


loss: 0.14393142, learning_rate: 5.040322580645161e-06, global_step: 15610, interval_runtime: 29.7578, interval_samples_per_second: 0.538, interval_steps_per_second: 0.336, epoch: 8.9919
loss: 0.05431042, learning_rate: 5.011520737327189e-06, global_step: 15620, interval_runtime: 4.4357, interval_samples_per_second: 3.607, interval_steps_per_second: 2.254, epoch: 8.9977
loss: 0.00559215, learning_rate: 4.9827188940092165e-06, global_step: 15630, interval_runtime: 4.4125, interval_samples_per_second: 3.626, interval_steps_per_second: 2.266, epoch: 9.0035
loss: 0.06475911, learning_rate: 4.953917050691245e-06, global_step: 15640, interval_runtime: 4.432, interval_samples_per_second: 3.61, interval_steps_per_second: 2.256, epoch: 9.0092
loss: 0.03627209, learning_rate: 4.9251152073732725e-06, global_step: 15650, interval_runtime: 4.4119, interval_samples_per_second: 3.627, interval_steps_per_second: 2.267, epoch: 9.015
loss: 0.02170688, learning_rate: 4.8963133640553e-06, global_step

[2024-03-04 15:18:57,555] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:18:57,558] [    INFO] -   Num examples = 2000
[2024-03-04 15:18:57,560] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:18:57,563] [    INFO] -   Total Batch size = 16
[2024-03-04 15:18:57,565] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8051788806915283, eval_accuracy: 0.7545, eval_f1: 0.719206996103887, eval_runtime: 21.0014, eval_samples_per_second: 95.232, eval_steps_per_second: 5.952, epoch: 9.1014


[2024-03-04 15:19:18,570] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-15800
[2024-03-04 15:19:22,569] [    INFO] - Deleting older checkpoint [work/output/checkpoint-14000] due to args.save_total_limit


loss: 0.00061119, learning_rate: 4.464285714285715e-06, global_step: 15810, interval_runtime: 29.5325, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 9.1071
loss: 0.0212767, learning_rate: 4.435483870967742e-06, global_step: 15820, interval_runtime: 4.4372, interval_samples_per_second: 3.606, interval_steps_per_second: 2.254, epoch: 9.1129
loss: 0.00338467, learning_rate: 4.40668202764977e-06, global_step: 15830, interval_runtime: 4.4269, interval_samples_per_second: 3.614, interval_steps_per_second: 2.259, epoch: 9.1187
loss: 0.06070588, learning_rate: 4.377880184331797e-06, global_step: 15840, interval_runtime: 4.4203, interval_samples_per_second: 3.62, interval_steps_per_second: 2.262, epoch: 9.1244
loss: 0.02410859, learning_rate: 4.349078341013825e-06, global_step: 15850, interval_runtime: 4.4284, interval_samples_per_second: 3.613, interval_steps_per_second: 2.258, epoch: 9.1302
loss: 0.01290715, learning_rate: 4.3202764976958525e-06, global_ste

[2024-03-04 15:20:51,064] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:20:51,067] [    INFO] -   Num examples = 2000
[2024-03-04 15:20:51,069] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:20:51,071] [    INFO] -   Total Batch size = 16
[2024-03-04 15:20:51,073] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.865842342376709, eval_accuracy: 0.754, eval_f1: 0.7176154569109775, eval_runtime: 20.939, eval_samples_per_second: 95.516, eval_steps_per_second: 5.97, epoch: 9.2166


[2024-03-04 15:21:12,010] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-16000
[2024-03-04 15:21:15,989] [    INFO] - Deleting older checkpoint [work/output/checkpoint-14200] due to args.save_total_limit


loss: 0.00069423, learning_rate: 3.888248847926267e-06, global_step: 16010, interval_runtime: 29.4507, interval_samples_per_second: 0.543, interval_steps_per_second: 0.34, epoch: 9.2224
loss: 0.03625959, learning_rate: 3.859447004608295e-06, global_step: 16020, interval_runtime: 4.414, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 9.2281
loss: 0.02985746, learning_rate: 3.830645161290323e-06, global_step: 16030, interval_runtime: 4.4448, interval_samples_per_second: 3.6, interval_steps_per_second: 2.25, epoch: 9.2339
loss: 0.03311375, learning_rate: 3.8018433179723507e-06, global_step: 16040, interval_runtime: 4.4224, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 9.2396
loss: 0.06693748, learning_rate: 3.7730414746543783e-06, global_step: 16050, interval_runtime: 4.4181, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 9.2454
loss: 0.01904279, learning_rate: 3.7442396313364054e-06, global_ste

[2024-03-04 15:22:44,783] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:22:44,786] [    INFO] -   Num examples = 2000
[2024-03-04 15:22:44,788] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:22:44,791] [    INFO] -   Total Batch size = 16
[2024-03-04 15:22:44,793] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8576014041900635, eval_accuracy: 0.758, eval_f1: 0.7227583006723051, eval_runtime: 21.0528, eval_samples_per_second: 94.999, eval_steps_per_second: 5.937, epoch: 9.3318


[2024-03-04 15:23:05,841] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-16200
[2024-03-04 15:23:09,978] [    INFO] - Deleting older checkpoint [work/output/checkpoint-14400] due to args.save_total_limit


loss: 0.05149931, learning_rate: 3.312211981566821e-06, global_step: 16210, interval_runtime: 29.735, interval_samples_per_second: 0.538, interval_steps_per_second: 0.336, epoch: 9.3376
loss: 0.00047091, learning_rate: 3.2834101382488476e-06, global_step: 16220, interval_runtime: 4.4185, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 9.3433
loss: 0.07094712, learning_rate: 3.2546082949308756e-06, global_step: 16230, interval_runtime: 4.4222, interval_samples_per_second: 3.618, interval_steps_per_second: 2.261, epoch: 9.3491
loss: 0.01012797, learning_rate: 3.225806451612903e-06, global_step: 16240, interval_runtime: 4.4197, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 9.3548
loss: 0.00058532, learning_rate: 3.197004608294931e-06, global_step: 16250, interval_runtime: 4.4198, interval_samples_per_second: 3.62, interval_steps_per_second: 2.263, epoch: 9.3606
loss: 0.01129222, learning_rate: 3.1682027649769588e-06, global_s

[2024-03-04 15:24:38,640] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:24:38,643] [    INFO] -   Num examples = 2000
[2024-03-04 15:24:38,645] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:24:38,647] [    INFO] -   Total Batch size = 16
[2024-03-04 15:24:38,649] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8743770122528076, eval_accuracy: 0.757, eval_f1: 0.7246947606262628, eval_runtime: 21.0584, eval_samples_per_second: 94.974, eval_steps_per_second: 5.936, epoch: 9.447


[2024-03-04 15:24:59,707] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-16400
[2024-03-04 15:25:03,701] [    INFO] - Deleting older checkpoint [work/output/checkpoint-14600] due to args.save_total_limit


loss: 0.03037249, learning_rate: 2.7361751152073734e-06, global_step: 16410, interval_runtime: 29.5909, interval_samples_per_second: 0.541, interval_steps_per_second: 0.338, epoch: 9.4528
loss: 0.01636276, learning_rate: 2.707373271889401e-06, global_step: 16420, interval_runtime: 4.4293, interval_samples_per_second: 3.612, interval_steps_per_second: 2.258, epoch: 9.4585
loss: 0.04253039, learning_rate: 2.6785714285714285e-06, global_step: 16430, interval_runtime: 4.4156, interval_samples_per_second: 3.624, interval_steps_per_second: 2.265, epoch: 9.4643
loss: 0.00981339, learning_rate: 2.6497695852534565e-06, global_step: 16440, interval_runtime: 4.4351, interval_samples_per_second: 3.608, interval_steps_per_second: 2.255, epoch: 9.47
loss: 0.05344759, learning_rate: 2.620967741935484e-06, global_step: 16450, interval_runtime: 4.4247, interval_samples_per_second: 3.616, interval_steps_per_second: 2.26, epoch: 9.4758
loss: 0.05346448, learning_rate: 2.5921658986175117e-06, global_

[2024-03-04 15:26:32,397] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:26:32,404] [    INFO] -   Num examples = 2000
[2024-03-04 15:26:32,407] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:26:32,409] [    INFO] -   Total Batch size = 16
[2024-03-04 15:26:32,421] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8881374597549438, eval_accuracy: 0.757, eval_f1: 0.7214275713416797, eval_runtime: 20.9243, eval_samples_per_second: 95.582, eval_steps_per_second: 5.974, epoch: 9.5622


[2024-03-04 15:26:53,331] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-16600
[2024-03-04 15:26:57,389] [    INFO] - Deleting older checkpoint [work/output/checkpoint-14800] due to args.save_total_limit


loss: 0.00185545, learning_rate: 2.1601382488479263e-06, global_step: 16610, interval_runtime: 29.5151, interval_samples_per_second: 0.542, interval_steps_per_second: 0.339, epoch: 9.568
loss: 0.04670738, learning_rate: 2.1313364055299543e-06, global_step: 16620, interval_runtime: 4.4233, interval_samples_per_second: 3.617, interval_steps_per_second: 2.261, epoch: 9.5737
loss: 0.03081559, learning_rate: 2.1025345622119814e-06, global_step: 16630, interval_runtime: 4.4339, interval_samples_per_second: 3.609, interval_steps_per_second: 2.255, epoch: 9.5795
loss: 0.07528453, learning_rate: 2.0737327188940094e-06, global_step: 16640, interval_runtime: 4.4414, interval_samples_per_second: 3.602, interval_steps_per_second: 2.252, epoch: 9.5853
loss: 0.06823815, learning_rate: 2.044930875576037e-06, global_step: 16650, interval_runtime: 4.4412, interval_samples_per_second: 3.603, interval_steps_per_second: 2.252, epoch: 9.591
loss: 0.00028593, learning_rate: 2.0161290322580646e-06, globa

[2024-03-04 15:28:26,125] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:28:26,127] [    INFO] -   Num examples = 2000
[2024-03-04 15:28:26,130] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:28:26,132] [    INFO] -   Total Batch size = 16
[2024-03-04 15:28:26,134] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8780180215835571, eval_accuracy: 0.752, eval_f1: 0.7138392063416873, eval_runtime: 20.8949, eval_samples_per_second: 95.717, eval_steps_per_second: 5.982, epoch: 9.6774


[2024-03-04 15:28:47,027] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-16800
[2024-03-04 15:28:50,954] [    INFO] - Deleting older checkpoint [work/output/checkpoint-15000] due to args.save_total_limit


loss: 0.01259866, learning_rate: 1.5841013824884794e-06, global_step: 16810, interval_runtime: 29.3494, interval_samples_per_second: 0.545, interval_steps_per_second: 0.341, epoch: 9.6832
loss: 0.00038193, learning_rate: 1.555299539170507e-06, global_step: 16820, interval_runtime: 4.4459, interval_samples_per_second: 3.599, interval_steps_per_second: 2.249, epoch: 9.6889
loss: 0.0233151, learning_rate: 1.5264976958525346e-06, global_step: 16830, interval_runtime: 4.4135, interval_samples_per_second: 3.625, interval_steps_per_second: 2.266, epoch: 9.6947
loss: 0.00198847, learning_rate: 1.4976958525345621e-06, global_step: 16840, interval_runtime: 4.4294, interval_samples_per_second: 3.612, interval_steps_per_second: 2.258, epoch: 9.7005
loss: 0.02823533, learning_rate: 1.46889400921659e-06, global_step: 16850, interval_runtime: 4.4353, interval_samples_per_second: 3.607, interval_steps_per_second: 2.255, epoch: 9.7062
loss: 0.04296958, learning_rate: 1.4400921658986175e-06, global

[2024-03-04 15:30:19,546] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:30:19,550] [    INFO] -   Num examples = 2000
[2024-03-04 15:30:19,555] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:30:19,559] [    INFO] -   Total Batch size = 16
[2024-03-04 15:30:19,566] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8846123218536377, eval_accuracy: 0.7515, eval_f1: 0.7128970327809884, eval_runtime: 21.0149, eval_samples_per_second: 95.171, eval_steps_per_second: 5.948, epoch: 9.7926


[2024-03-04 15:30:40,566] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-17000
[2024-03-04 15:30:44,747] [    INFO] - Deleting older checkpoint [work/output/checkpoint-15200] due to args.save_total_limit


loss: 0.00122756, learning_rate: 1.0080645161290323e-06, global_step: 17010, interval_runtime: 29.7473, interval_samples_per_second: 0.538, interval_steps_per_second: 0.336, epoch: 9.7984
loss: 0.00046749, learning_rate: 9.792626728110599e-07, global_step: 17020, interval_runtime: 4.4193, interval_samples_per_second: 3.621, interval_steps_per_second: 2.263, epoch: 9.8041
loss: 0.03792058, learning_rate: 9.504608294930877e-07, global_step: 17030, interval_runtime: 4.4414, interval_samples_per_second: 3.602, interval_steps_per_second: 2.252, epoch: 9.8099
loss: 0.0005846, learning_rate: 9.216589861751153e-07, global_step: 17040, interval_runtime: 4.4301, interval_samples_per_second: 3.612, interval_steps_per_second: 2.257, epoch: 9.8157
loss: 0.00035574, learning_rate: 8.928571428571428e-07, global_step: 17050, interval_runtime: 4.4449, interval_samples_per_second: 3.6, interval_steps_per_second: 2.25, epoch: 9.8214
loss: 0.00277006, learning_rate: 8.640552995391706e-07, global_step

[2024-03-04 15:32:13,296] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:32:13,299] [    INFO] -   Num examples = 2000
[2024-03-04 15:32:13,302] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:32:13,305] [    INFO] -   Total Batch size = 16
[2024-03-04 15:32:13,308] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8947852849960327, eval_accuracy: 0.753, eval_f1: 0.7151268548597489, eval_runtime: 21.0064, eval_samples_per_second: 95.209, eval_steps_per_second: 5.951, epoch: 9.9078


[2024-03-04 15:32:34,309] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-17200
[2024-03-04 15:32:38,361] [    INFO] - Deleting older checkpoint [work/output/checkpoint-15400] due to args.save_total_limit


loss: 0.06274376, learning_rate: 4.320276497695853e-07, global_step: 17210, interval_runtime: 29.6181, interval_samples_per_second: 0.54, interval_steps_per_second: 0.338, epoch: 9.9136
loss: 0.00184548, learning_rate: 4.032258064516129e-07, global_step: 17220, interval_runtime: 4.4278, interval_samples_per_second: 3.614, interval_steps_per_second: 2.258, epoch: 9.9194
loss: 0.05442246, learning_rate: 3.7442396313364053e-07, global_step: 17230, interval_runtime: 4.4368, interval_samples_per_second: 3.606, interval_steps_per_second: 2.254, epoch: 9.9251
loss: 0.04785992, learning_rate: 3.456221198156682e-07, global_step: 17240, interval_runtime: 4.4303, interval_samples_per_second: 3.612, interval_steps_per_second: 2.257, epoch: 9.9309
loss: 0.01265326, learning_rate: 3.168202764976959e-07, global_step: 17250, interval_runtime: 4.4309, interval_samples_per_second: 3.611, interval_steps_per_second: 2.257, epoch: 9.9366
loss: 0.0508186, learning_rate: 2.8801843317972354e-07, global_s

[2024-03-04 15:33:49,207] [    INFO] - ***** Running Evaluation *****
[2024-03-04 15:33:49,217] [    INFO] -   Num examples = 2000
[2024-03-04 15:33:49,224] [    INFO] -   Pre device batch size = 16
[2024-03-04 15:33:49,231] [    INFO] -   Total Batch size = 16
[2024-03-04 15:33:49,242] [    INFO] -   Total prediction steps = 125


  0%|          | 0/125 [00:00<?, ?it/s]

eval_loss: 1.8923460245132446, eval_accuracy: 0.7515, eval_f1: 0.7134921686802377, eval_runtime: 21.0876, eval_samples_per_second: 94.843, eval_steps_per_second: 5.928, epoch: 10.0


[2024-03-04 15:34:10,300] [    INFO] - Saving model checkpoint to ./work/output/checkpoint-17360
[2024-03-04 15:34:14,719] [    INFO] - Deleting older checkpoint [work/output/checkpoint-15600] due to args.save_total_limit
[2024-03-04 15:34:14,821] [    INFO] - 
Training completed. 

[2024-03-04 15:34:14,830] [    INFO] - Loading best model from ./work/output/checkpoint-2800 (score: 0.7386200102986379).


train_runtime: 9889.5561, train_samples_per_second: 28.076, train_steps_per_second: 1.755, train_loss: 0.24620741825558307, epoch: 10.0


[2024-03-04 15:34:18,037] [    INFO] - Saving model checkpoint to ./work/model
