In [25]:
from transformers import BasicTokenizer, EarlyStoppingCallback, Trainer
from datasets import concatenate_datasets, load_from_disk
import kagglehub
import optuna
import torch
import math
import base

In [26]:
base.reset_seed()

In [27]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [28]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [29]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "trec"

In [30]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_fine")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_fine")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_fine")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_fine")

all_data = concatenate_datasets([load_from_disk(file) for file in [f"~/data/{DATASET}/eval-logits_fine", f"~/data/{DATASET}/test-logits_fine", f"~/data/{DATASET}/train-logits-augmented_fine"]])
tokenizer = BasicTokenizer(do_lower_case=True)

In [31]:
train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), train_data))
eval_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), eval_data))
test_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), test_data))

all_train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_train_data))

all_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_data))

In [32]:
vocab = base.get_vocab(all_data_tokens)

In [33]:
word_index = dict(zip(vocab, range(len(vocab))))

In [34]:
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [35]:
print(len(vocab))
num_tokens = len(vocab) + 2
embedding_dim = 300

8766


In [36]:
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [37]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [38]:
train_padded_data = list(map(lambda x: base.padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: base.padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: base.padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: base.padd(x,60), all_train_data_index))

In [39]:
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [40]:
num_epochs = 30
batch_size = 128

In [41]:
#Nápočet epoch na steps
data_length = len(train_data)
min_r = math.ceil(data_length/batch_size)*5
max_r = math.ceil(data_length/batch_size)*num_epochs
warm_up = math.ceil(data_length/batch_size/10)

In [42]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "adam_beta1" : trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up)
    }   
    print(f"Trial {trial.number} with params: {params}")
    return params

In [43]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [44]:
def get_BiLSTM():
    return base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50, freeze_embed=False)

In [45]:
base.reset_seed()

In [46]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base-embedd_fine_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-base-embedd_fine_hp-search", epochs=num_epochs, batch_size=batch_size)

In [47]:
trainer = Trainer(
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [48]:
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Base-embedd",
    n_trials=150
)

[I 2025-03-15 09:38:56,798] A new study created in memory with name: Base-embedd


Trial 0 with params: {'learning_rate': 0.0001025350969016849, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7662,3.467665,0.176902,0.003538,0.02,0.006012
2,3.272,3.152797,0.176902,0.003538,0.02,0.006012
3,3.119,3.034884,0.180568,0.033554,0.021365,0.008638
4,2.954,2.86143,0.310724,0.038926,0.05835,0.040288
5,2.8198,2.703281,0.377635,0.044616,0.075895,0.05116


[I 2025-03-15 09:39:23,379] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 2.6368755339723032e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8831,3.843979,0.176902,0.003538,0.02,0.006012
2,3.7878,3.708517,0.176902,0.003538,0.02,0.006012
3,3.5599,3.357535,0.176902,0.003538,0.02,0.006012
4,3.2544,3.193452,0.176902,0.003538,0.02,0.006012
5,3.2001,3.140543,0.176902,0.003538,0.02,0.006012
6,3.1331,3.094779,0.176902,0.003538,0.02,0.006012
7,3.081,3.037039,0.179652,0.015139,0.020763,0.007519
8,3.0271,2.979816,0.316224,0.031197,0.056594,0.038267
9,2.968,2.928306,0.321723,0.028796,0.057997,0.037462
10,2.9123,2.891033,0.356554,0.039179,0.067808,0.040811


[I 2025-03-15 09:41:02,597] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 0.00041917115166952007, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4091,2.957724,0.223648,0.039003,0.036786,0.022396
2,2.6357,2.343858,0.429881,0.083758,0.096146,0.068517
3,2.1376,1.979398,0.504125,0.139412,0.133879,0.111255
4,1.8054,1.718941,0.560953,0.155692,0.171206,0.148114
5,1.5407,1.529718,0.613199,0.239281,0.219198,0.208007
6,1.294,1.500095,0.616865,0.292611,0.2537,0.242872
7,1.0963,1.310706,0.663611,0.278736,0.289681,0.277771
8,0.9293,1.297412,0.668194,0.345958,0.304173,0.308679
9,0.7938,1.209317,0.692026,0.389977,0.357717,0.360135
10,0.643,1.18971,0.692026,0.431458,0.358664,0.365923


[I 2025-03-15 09:41:49,339] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8762,2.256522,0.43538,0.092661,0.108164,0.089069
2,1.8757,1.659886,0.575619,0.21211,0.192697,0.1746
3,1.289,1.270443,0.696609,0.358904,0.342719,0.334867
4,0.7887,1.185635,0.706691,0.424508,0.400474,0.397891
5,0.4675,1.107848,0.743355,0.522848,0.494544,0.489284


[I 2025-03-15 09:42:14,720] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 6.62431060594998e-05, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8252,3.673488,0.176902,0.003538,0.02,0.006012
2,3.3985,3.200449,0.176902,0.003538,0.02,0.006012
3,3.1609,3.086489,0.176902,0.003538,0.02,0.006012
4,3.028,2.940696,0.310724,0.030285,0.055278,0.037293
5,2.9153,2.816975,0.376719,0.044875,0.075352,0.051209
6,2.7555,2.695348,0.36572,0.040308,0.072287,0.046636
7,2.6475,2.60407,0.375802,0.040914,0.075054,0.049386
8,2.5586,2.509492,0.405133,0.067984,0.085986,0.056429
9,2.4711,2.437628,0.4033,0.063701,0.085867,0.056961
10,2.3961,2.375822,0.421632,0.082674,0.092759,0.064173


[I 2025-03-15 09:43:53,118] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.0004480975918214954, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3674,2.930986,0.230064,0.039566,0.038178,0.023294
2,2.6209,2.362409,0.432631,0.101789,0.098441,0.071574
3,2.1335,1.969883,0.509624,0.153911,0.135016,0.113865
4,1.7983,1.713415,0.571036,0.196241,0.179292,0.156796
5,1.526,1.528532,0.605866,0.242391,0.213997,0.196257


[I 2025-03-15 09:44:15,615] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5901,3.168798,0.176902,0.003538,0.02,0.006012
2,3.0402,2.881917,0.296975,0.026626,0.053472,0.028334
3,2.7282,2.565315,0.391384,0.041102,0.080453,0.051881
4,2.4488,2.342694,0.412466,0.070268,0.088199,0.06205
5,2.2618,2.166404,0.466544,0.101904,0.111513,0.086596
6,2.0737,2.038355,0.501375,0.108954,0.131092,0.106399
7,1.9419,1.930727,0.514207,0.120408,0.140527,0.118395
8,1.8386,1.851316,0.520623,0.12195,0.14826,0.127296
9,1.7153,1.773688,0.545371,0.171403,0.161452,0.144331
10,1.61,1.721711,0.56187,0.16477,0.177821,0.159619


[I 2025-03-15 09:46:44,345] Trial 6 finished with value: 0.3056412351258868 and parameters: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2}. Best is trial 6 with value: 0.3056412351258868.


Trial 7 with params: {'learning_rate': 0.00039710847107924746, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3964,3.032144,0.208983,0.017437,0.032727,0.019469
2,2.7859,2.539156,0.384051,0.044607,0.080031,0.053775
3,2.348,2.168066,0.452796,0.086186,0.105072,0.080119
4,2.032,1.91282,0.52429,0.133488,0.149073,0.128216
5,1.7705,1.738296,0.562786,0.168192,0.178495,0.15869


[I 2025-03-15 09:47:11,048] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.498208643215546e-05, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8953,3.875494,0.176902,0.005939,0.022555,0.009389
2,3.8575,3.83899,0.176902,0.003538,0.02,0.006012
3,3.8195,3.794893,0.176902,0.003538,0.02,0.006012
4,3.7672,3.73882,0.176902,0.003538,0.02,0.006012
5,3.7107,3.663851,0.176902,0.003538,0.02,0.006012
6,3.6183,3.566598,0.176902,0.003538,0.02,0.006012
7,3.5095,3.448963,0.176902,0.003538,0.02,0.006012
8,3.3967,3.33141,0.176902,0.003538,0.02,0.006012
9,3.3005,3.250067,0.176902,0.003538,0.02,0.006012
10,3.2428,3.219208,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 09:48:46,396] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 6.639623079859457e-05, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8307,3.694007,0.176902,0.003538,0.02,0.006012
2,3.4395,3.219593,0.176902,0.003538,0.02,0.006012
3,3.1894,3.115845,0.176902,0.003538,0.02,0.006012
4,3.0812,3.016983,0.180568,0.018407,0.020978,0.00788
5,2.9885,2.878479,0.351971,0.029362,0.066541,0.040501
6,2.8225,2.762114,0.389551,0.042738,0.080749,0.054004
7,2.7074,2.656759,0.374885,0.042809,0.075145,0.049821
8,2.6152,2.565108,0.401467,0.04246,0.084812,0.055293
9,2.5182,2.476161,0.402383,0.050646,0.084581,0.054744
10,2.4339,2.41069,0.417965,0.066462,0.091071,0.062792


[I 2025-03-15 09:49:34,374] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 1.6396488017492608e-05, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8932,3.870639,0.175985,0.005281,0.021376,0.008464
2,3.8468,3.81882,0.176902,0.003538,0.02,0.006012
3,3.7813,3.728068,0.176902,0.003538,0.02,0.006012
4,3.6464,3.546202,0.176902,0.003538,0.02,0.006012
5,3.448,3.315192,0.176902,0.003538,0.02,0.006012
6,3.2662,3.219433,0.176902,0.003538,0.02,0.006012
7,3.2052,3.17483,0.176902,0.003538,0.02,0.006012
8,3.1778,3.144964,0.176902,0.003538,0.02,0.006012
9,3.1576,3.120548,0.176902,0.003538,0.02,0.006012
10,3.1264,3.095126,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 09:50:24,600] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.00011568925627502199, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6823,3.240158,0.176902,0.003538,0.02,0.006012
2,3.1538,3.054266,0.176902,0.003538,0.02,0.006012
3,2.9502,2.801708,0.371219,0.046219,0.074704,0.051578
4,2.7016,2.604085,0.392301,0.041578,0.081728,0.053901
5,2.5453,2.428141,0.417965,0.066069,0.090992,0.062919
6,2.354,2.295208,0.439963,0.08959,0.099385,0.072019
7,2.2287,2.203034,0.457379,0.107909,0.109969,0.087163
8,2.1359,2.103308,0.483043,0.099448,0.120598,0.097019
9,2.0389,2.031825,0.492209,0.106957,0.125269,0.102264
10,1.9614,1.999907,0.498625,0.112202,0.137059,0.115184


[I 2025-03-15 09:52:01,206] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.00014087455613188277, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6201,3.2199,0.176902,0.003538,0.02,0.006012
2,3.1083,2.977117,0.223648,0.034755,0.033868,0.024825
3,2.8565,2.702511,0.386801,0.04273,0.080341,0.053111
4,2.5962,2.492241,0.3978,0.0447,0.083637,0.056738
5,2.4219,2.300469,0.447296,0.085811,0.103019,0.077023


[I 2025-03-15 09:52:24,622] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.0003425346186277151, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4229,3.020042,0.197067,0.018807,0.028,0.016628
2,2.7389,2.489632,0.406966,0.045502,0.087994,0.05822
3,2.275,2.120043,0.474794,0.103291,0.118522,0.092177
4,1.9714,1.864986,0.530706,0.1112,0.153985,0.126857
5,1.7258,1.698101,0.571952,0.172243,0.183286,0.159991
6,1.482,1.580639,0.593951,0.22484,0.206439,0.190168
7,1.2959,1.470914,0.615032,0.246628,0.230193,0.225633
8,1.136,1.351099,0.660862,0.321631,0.280922,0.27762
9,0.9854,1.301944,0.669111,0.315519,0.300171,0.298672
10,0.8435,1.264816,0.665445,0.377984,0.319792,0.325251


[I 2025-03-15 09:54:58,756] Trial 13 finished with value: 0.4899522061045902 and parameters: {'learning_rate': 0.0003425346186277151, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 2}. Best is trial 13 with value: 0.4899522061045902.


Trial 14 with params: {'learning_rate': 0.0020630577320339743, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9342,2.225382,0.428964,0.091039,0.101833,0.083873
2,1.8089,1.637038,0.575619,0.21729,0.196603,0.182507
3,1.1869,1.218027,0.702108,0.382026,0.361062,0.350797
4,0.6912,1.146006,0.716774,0.448313,0.429177,0.430703
5,0.3829,1.11468,0.754354,0.562232,0.50471,0.518999
6,0.2074,1.173383,0.771769,0.574159,0.545488,0.547475
7,0.1063,1.316154,0.769019,0.644016,0.61153,0.605795
8,0.0637,1.425115,0.770852,0.653496,0.61841,0.619302
9,0.0269,1.510882,0.771769,0.651899,0.604052,0.606935
10,0.02,1.449018,0.790101,0.670159,0.648268,0.632409


[I 2025-03-15 09:57:34,445] Trial 14 finished with value: 0.6477477297351035 and parameters: {'learning_rate': 0.0020630577320339743, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 2}. Best is trial 14 with value: 0.6477477297351035.


Trial 15 with params: {'learning_rate': 0.0036268588545839992, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8485,2.052307,0.471127,0.133535,0.126272,0.106902
2,1.6306,1.411072,0.647113,0.283376,0.292672,0.277493
3,0.9622,1.16625,0.713107,0.401363,0.396112,0.388478
4,0.5029,1.203586,0.735105,0.510851,0.497712,0.490878
5,0.243,1.225935,0.761687,0.617229,0.567319,0.568369
6,0.1096,1.307108,0.767186,0.626344,0.606625,0.599075
7,0.0448,1.441261,0.773602,0.652074,0.640436,0.621448
8,0.0184,1.518606,0.775435,0.647768,0.656749,0.630151
9,0.0088,1.638672,0.780018,0.651381,0.642279,0.626238
10,0.0087,1.633986,0.773602,0.656331,0.62308,0.626719


[I 2025-03-15 09:58:35,360] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.0010449944851423528, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0988,2.474734,0.39505,0.062044,0.087751,0.063608
2,2.0941,1.857455,0.539872,0.145087,0.159806,0.137072
3,1.5662,1.432133,0.640697,0.283923,0.251028,0.242849
4,1.0776,1.265608,0.67461,0.364715,0.32931,0.331997
5,0.7599,1.153638,0.724106,0.436742,0.397617,0.394634
6,0.5246,1.145263,0.721357,0.479273,0.46624,0.460674
7,0.3272,1.218716,0.746104,0.536064,0.470178,0.484894
8,0.2075,1.221765,0.743355,0.609538,0.564658,0.571747
9,0.1174,1.291395,0.749771,0.549745,0.502489,0.511027
10,0.0835,1.349641,0.743355,0.669222,0.582127,0.600087


[I 2025-03-15 09:59:28,545] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.0008032888340767154, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1974,2.663142,0.359303,0.063335,0.07985,0.054907
2,2.2672,2.028739,0.488543,0.121196,0.128202,0.108459
3,1.7423,1.58343,0.59945,0.211579,0.198516,0.183786
4,1.2892,1.344489,0.661778,0.351374,0.286862,0.291806
5,0.9509,1.221563,0.692942,0.36232,0.357742,0.354209
6,0.7019,1.139028,0.704858,0.405507,0.394538,0.386355
7,0.4882,1.147353,0.730522,0.451367,0.423138,0.428269
8,0.3595,1.168188,0.721357,0.519598,0.489979,0.481844
9,0.2507,1.170692,0.735105,0.563168,0.51343,0.521685
10,0.1667,1.225175,0.736939,0.605861,0.531359,0.545337


[I 2025-03-15 10:02:06,013] Trial 17 finished with value: 0.5743836279433004 and parameters: {'learning_rate': 0.0008032888340767154, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 14 with value: 0.6477477297351035.


Trial 18 with params: {'learning_rate': 0.003569066018278081, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6882,1.989016,0.511457,0.156053,0.14982,0.125236
2,1.511,1.361063,0.671861,0.339094,0.317296,0.315533
3,0.8804,1.067384,0.746104,0.452752,0.433623,0.42967
4,0.4155,1.05953,0.737855,0.55795,0.524147,0.524163
5,0.1671,1.144325,0.766269,0.642451,0.612012,0.60037
6,0.0695,1.185622,0.789184,0.662958,0.653574,0.642554
7,0.0302,1.354879,0.773602,0.666152,0.651613,0.636857
8,0.0279,1.326879,0.796517,0.716261,0.691142,0.682901
9,0.0145,1.43883,0.765353,0.700046,0.652137,0.659535
10,0.008,1.358737,0.791934,0.719432,0.690503,0.689181


[I 2025-03-15 10:03:47,758] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.0013513720151519398, 'weight_decay': 0.0, 'adam_beta1': 0.98, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1656,2.551015,0.390467,0.060588,0.084367,0.059862
2,2.2546,2.042778,0.489459,0.109863,0.128283,0.103996
3,1.8299,1.760145,0.547204,0.203809,0.172867,0.164021
4,1.4688,1.504788,0.633364,0.281114,0.2716,0.257646
5,1.1214,1.351393,0.673694,0.371533,0.336301,0.335721
6,0.819,1.240858,0.706691,0.418027,0.388828,0.385876
7,0.5538,1.258402,0.725023,0.450974,0.417578,0.420862
8,0.36,1.31509,0.736939,0.471128,0.466822,0.461493
9,0.2324,1.3957,0.747021,0.587542,0.508726,0.52693
10,0.1447,1.527377,0.749771,0.596404,0.531419,0.548718


[I 2025-03-15 10:04:41,433] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 0.001640635512331728, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9027,2.300864,0.423465,0.079109,0.099046,0.078532
2,1.9365,1.713882,0.565536,0.171736,0.188709,0.164036
3,1.366,1.28843,0.672777,0.3213,0.304258,0.296873
4,0.8579,1.20724,0.699358,0.451581,0.407145,0.406851
5,0.5455,1.062371,0.751604,0.49317,0.454423,0.458175
6,0.3094,1.308946,0.729606,0.541719,0.48144,0.490778
7,0.1931,1.26782,0.746104,0.609423,0.520829,0.544411
8,0.1038,1.273413,0.768103,0.613781,0.585326,0.576724
9,0.057,1.461086,0.735105,0.591246,0.553189,0.554887
10,0.0372,1.457878,0.752521,0.625373,0.596087,0.587152


[I 2025-03-15 10:06:25,775] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.0005480196620081863, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3533,2.887054,0.27956,0.040642,0.05212,0.03954
2,2.5739,2.297851,0.441797,0.104038,0.101819,0.074487
3,2.0959,1.933242,0.513291,0.162721,0.143521,0.119295
4,1.7495,1.672436,0.579285,0.169206,0.184939,0.157696
5,1.4426,1.467683,0.628781,0.253183,0.241535,0.226725
6,1.1709,1.316655,0.665445,0.34174,0.300489,0.299113
7,0.9342,1.240405,0.689276,0.387612,0.333241,0.337688
8,0.7581,1.177185,0.703941,0.405622,0.392955,0.391516
9,0.5938,1.125077,0.718607,0.464249,0.40611,0.419254
10,0.4476,1.125948,0.737855,0.512066,0.449281,0.456976


[I 2025-03-15 10:08:14,777] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.0007933682610517953, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2017,2.669416,0.363886,0.066755,0.081611,0.056824
2,2.274,2.032687,0.491292,0.123679,0.129804,0.110266
3,1.7476,1.586321,0.59945,0.230483,0.20159,0.187755
4,1.296,1.341743,0.668194,0.361826,0.292888,0.299396
5,0.9566,1.226379,0.696609,0.374501,0.363795,0.361051
6,0.709,1.145377,0.72044,0.424485,0.411341,0.402041
7,0.4904,1.158145,0.72319,0.48094,0.422262,0.434126
8,0.3671,1.160918,0.726856,0.493956,0.504588,0.481744
9,0.2487,1.160814,0.736022,0.537434,0.490877,0.497497
10,0.1665,1.265315,0.727773,0.604041,0.510813,0.529383


[I 2025-03-15 10:11:05,886] Trial 22 finished with value: 0.5644411117167702 and parameters: {'learning_rate': 0.0007933682610517953, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 14 with value: 0.6477477297351035.


Trial 23 with params: {'learning_rate': 0.004758954159319862, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5584,1.83995,0.560037,0.213564,0.207875,0.192748
2,1.3875,1.266631,0.68286,0.358998,0.34435,0.337484
3,0.7055,1.033598,0.754354,0.542211,0.496055,0.488548
4,0.2973,1.086167,0.769936,0.618397,0.578754,0.5854
5,0.1016,1.221517,0.780018,0.703182,0.668167,0.660539
6,0.037,1.258475,0.780018,0.665313,0.662823,0.651401
7,0.0112,1.371368,0.791017,0.666673,0.656126,0.648135
8,0.0063,1.357852,0.787351,0.651742,0.644087,0.635435
9,0.0042,1.418674,0.780935,0.650757,0.64636,0.633857
10,0.0013,1.410791,0.788268,0.654391,0.658943,0.6438


[I 2025-03-15 10:12:48,720] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.000332378305921328, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4342,3.016909,0.203483,0.018929,0.030545,0.018669
2,2.7264,2.455282,0.410632,0.044576,0.087993,0.057689
3,2.2638,2.115779,0.469294,0.096529,0.115579,0.089076
4,1.9664,1.856754,0.533456,0.117876,0.15218,0.127896
5,1.7262,1.683984,0.574702,0.198114,0.183869,0.162578
6,1.4959,1.629734,0.579285,0.182862,0.195952,0.178297
7,1.3294,1.482702,0.617782,0.252146,0.229514,0.224413
8,1.1646,1.378747,0.648029,0.281777,0.261012,0.25709
9,1.0252,1.320514,0.664528,0.296912,0.291973,0.285793
10,0.8828,1.290599,0.661778,0.3528,0.294435,0.300538


[I 2025-03-15 10:13:46,189] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.003296823852602529, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9692,2.178206,0.44363,0.093809,0.113353,0.096602
2,1.7826,1.518549,0.611366,0.249298,0.248896,0.234638
3,1.1213,1.223728,0.703941,0.381226,0.374924,0.363359
4,0.6059,1.190213,0.728689,0.487557,0.45256,0.45503
5,0.3021,1.273881,0.764436,0.596176,0.5498,0.553539
6,0.1535,1.337558,0.767186,0.594233,0.603195,0.581244
7,0.0696,1.355796,0.780018,0.669225,0.628159,0.627387
8,0.0288,1.479456,0.768103,0.621808,0.628052,0.613554
9,0.0123,1.63751,0.781852,0.677588,0.631662,0.638378
10,0.0103,1.689856,0.784601,0.677835,0.653422,0.649614


[I 2025-03-15 10:15:26,272] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.0018138539557466858, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9838,2.23262,0.43813,0.132273,0.108293,0.095916
2,1.8351,1.603821,0.593951,0.235027,0.223157,0.200879
3,1.2385,1.245353,0.695692,0.383853,0.338121,0.339295
4,0.7426,1.146168,0.715857,0.482076,0.427634,0.430927
5,0.4219,1.092442,0.759853,0.571866,0.527808,0.533805
6,0.2532,1.244642,0.746104,0.585185,0.525091,0.537447
7,0.143,1.237955,0.758937,0.590272,0.554385,0.558873
8,0.0726,1.326625,0.768103,0.637226,0.617064,0.60988
9,0.0368,1.45095,0.771769,0.660844,0.617253,0.626254
10,0.018,1.475888,0.766269,0.657208,0.626952,0.628485


[I 2025-03-15 10:17:58,601] Trial 26 finished with value: 0.6437741504262875 and parameters: {'learning_rate': 0.0018138539557466858, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 3}. Best is trial 14 with value: 0.6477477297351035.


Trial 27 with params: {'learning_rate': 0.003918937149130423, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.014,2.105359,0.463795,0.108787,0.121012,0.097945
2,1.6469,1.442404,0.637947,0.308636,0.274956,0.269165
3,0.9386,1.144303,0.72044,0.397933,0.389715,0.378668
4,0.4889,1.108363,0.735105,0.528798,0.49556,0.497846
5,0.1989,1.306705,0.757104,0.60967,0.573728,0.563548
6,0.0835,1.27246,0.780018,0.669944,0.680775,0.648902
7,0.0303,1.351128,0.784601,0.665635,0.674753,0.647816
8,0.0143,1.410177,0.790101,0.716746,0.688511,0.680869
9,0.007,1.482946,0.792851,0.695261,0.685358,0.674768
10,0.0036,1.462613,0.792851,0.677481,0.687278,0.665895


[I 2025-03-15 10:20:33,314] Trial 27 finished with value: 0.6834764400359902 and parameters: {'learning_rate': 0.003918937149130423, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 4}. Best is trial 27 with value: 0.6834764400359902.


Trial 28 with params: {'learning_rate': 0.003672217997467515, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.125,2.151144,0.44363,0.100409,0.111788,0.095097
2,1.7559,1.567529,0.594867,0.247206,0.251586,0.226567
3,1.0697,1.157034,0.719523,0.381354,0.37553,0.364366
4,0.5491,1.176027,0.72594,0.49573,0.461118,0.467903
5,0.2593,1.250407,0.747938,0.584168,0.533249,0.538375
6,0.1201,1.331116,0.769936,0.631242,0.598146,0.596769
7,0.0506,1.352563,0.773602,0.609334,0.588883,0.583877
8,0.0162,1.495653,0.779102,0.628207,0.631856,0.609298
9,0.0074,1.52267,0.782768,0.651999,0.611302,0.614425
10,0.0029,1.596584,0.768103,0.62805,0.607653,0.603868


[I 2025-03-15 10:23:01,124] Trial 28 finished with value: 0.6042108722663051 and parameters: {'learning_rate': 0.003672217997467515, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 27 with value: 0.6834764400359902.


Trial 29 with params: {'learning_rate': 0.0018209424048967803, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9689,2.203332,0.442713,0.135002,0.110141,0.09807
2,1.8033,1.554115,0.618698,0.272444,0.245118,0.229112
3,1.1929,1.213751,0.701192,0.379758,0.347705,0.339216
4,0.7222,1.177035,0.709441,0.469526,0.414419,0.425503
5,0.4141,1.143122,0.738772,0.521359,0.487063,0.488224
6,0.2477,1.205476,0.751604,0.603563,0.544721,0.553653
7,0.121,1.281331,0.759853,0.62768,0.532364,0.559856
8,0.0603,1.372596,0.776352,0.677061,0.613539,0.623313
9,0.0342,1.513735,0.765353,0.660381,0.618359,0.625786
10,0.0167,1.465191,0.76077,0.644498,0.649022,0.630699


[I 2025-03-15 10:25:28,774] Trial 29 finished with value: 0.6638226603017925 and parameters: {'learning_rate': 0.0018209424048967803, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 3}. Best is trial 27 with value: 0.6834764400359902.


Trial 30 with params: {'learning_rate': 0.0004374816398895419, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4082,2.990839,0.208983,0.039039,0.03243,0.020542
2,2.6835,2.40649,0.405133,0.079777,0.087435,0.061461
3,2.1989,2.031106,0.494042,0.107516,0.130065,0.104232
4,1.8663,1.786635,0.555454,0.159809,0.171911,0.14715
5,1.5868,1.569895,0.597617,0.211664,0.20698,0.18785
6,1.3179,1.436382,0.624198,0.297249,0.247349,0.240593
7,1.1085,1.334595,0.663611,0.344917,0.291537,0.294382
8,0.9479,1.241458,0.694775,0.367713,0.340358,0.342143
9,0.7735,1.192599,0.697525,0.399734,0.356243,0.360996
10,0.6238,1.16283,0.699358,0.424295,0.376228,0.385491


[I 2025-03-15 10:26:15,556] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.0023397808312471103, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8801,2.162719,0.447296,0.107659,0.113518,0.097679
2,1.7263,1.5032,0.625115,0.257735,0.262979,0.241937
3,1.0881,1.25566,0.690192,0.353513,0.356116,0.338362
4,0.6156,1.172475,0.714024,0.475576,0.448248,0.454682
5,0.308,1.097477,0.775435,0.611143,0.589395,0.581394
6,0.1624,1.262463,0.76077,0.636513,0.575565,0.588708
7,0.0719,1.369923,0.768103,0.623099,0.567989,0.577289
8,0.0452,1.391556,0.777269,0.679104,0.637451,0.642537
9,0.0222,1.433073,0.776352,0.65591,0.636343,0.632826
10,0.0094,1.441643,0.782768,0.678025,0.662828,0.658745


[I 2025-03-15 10:27:53,683] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.002674021891500239, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8132,2.057007,0.472961,0.111006,0.12621,0.109518
2,1.651,1.399241,0.660862,0.343244,0.290278,0.285077
3,1.0067,1.180704,0.706691,0.386467,0.38025,0.365982
4,0.5405,1.156553,0.732356,0.502368,0.460828,0.464351
5,0.2446,1.163972,0.76077,0.681187,0.598132,0.611449
6,0.118,1.228795,0.773602,0.646399,0.576519,0.594106
7,0.0519,1.299118,0.783685,0.654529,0.634112,0.629809
8,0.0376,1.375013,0.772686,0.652639,0.630426,0.627854
9,0.0131,1.435766,0.769019,0.69217,0.642247,0.643366
10,0.0088,1.478962,0.770852,0.673636,0.64155,0.637776


[I 2025-03-15 10:30:21,788] Trial 32 finished with value: 0.6582777650114784 and parameters: {'learning_rate': 0.002674021891500239, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 27 with value: 0.6834764400359902.


Trial 33 with params: {'learning_rate': 0.001989166427254258, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9823,2.192919,0.44363,0.128577,0.11094,0.098505
2,1.7819,1.516891,0.625115,0.27817,0.255368,0.242581
3,1.1549,1.208713,0.701192,0.394111,0.358118,0.352005
4,0.6621,1.129287,0.734189,0.488231,0.446979,0.457566
5,0.3591,1.153604,0.746104,0.551351,0.527647,0.523296
6,0.1999,1.245224,0.742438,0.605436,0.548058,0.550338
7,0.1202,1.219868,0.770852,0.64657,0.570131,0.591742
8,0.0471,1.389326,0.770852,0.619581,0.59918,0.591098
9,0.0262,1.372538,0.788268,0.691415,0.654486,0.652907
10,0.0138,1.45841,0.784601,0.643546,0.619413,0.618878


[I 2025-03-15 10:32:06,089] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.00036875829250628456, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4527,3.100392,0.299725,0.01926,0.054515,0.028271
2,2.9259,2.761599,0.335472,0.032582,0.068847,0.042924
3,2.6243,2.487872,0.387718,0.058583,0.08297,0.058332
4,2.3576,2.279531,0.439047,0.085133,0.105088,0.080878
5,2.1772,2.100983,0.48396,0.105661,0.129748,0.108765


[I 2025-03-15 10:32:31,902] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.004630176875410835, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6294,1.952883,0.520623,0.163313,0.169344,0.147591
2,1.4325,1.28309,0.692026,0.361685,0.341027,0.341477
3,0.7552,1.082331,0.750687,0.483225,0.445736,0.453431
4,0.3382,1.060747,0.769936,0.636592,0.592908,0.60058
5,0.1107,1.256091,0.771769,0.716468,0.633031,0.642815
6,0.0432,1.207896,0.797434,0.724883,0.682496,0.682334
7,0.0215,1.352121,0.786434,0.719129,0.700035,0.69298
8,0.0062,1.459445,0.781852,0.700079,0.657492,0.663945
9,0.0056,1.425082,0.796517,0.711436,0.684761,0.682518
10,0.0071,1.505909,0.79835,0.713144,0.655775,0.670554


[I 2025-03-15 10:35:09,315] Trial 35 finished with value: 0.6814494152088144 and parameters: {'learning_rate': 0.004630176875410835, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 4}. Best is trial 27 with value: 0.6834764400359902.


Trial 36 with params: {'learning_rate': 0.001150128932589517, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1223,2.429395,0.406966,0.06278,0.09288,0.068112
2,2.0504,1.816551,0.544455,0.147016,0.163211,0.138643
3,1.5214,1.38973,0.659028,0.308278,0.272897,0.269301
4,1.0151,1.213692,0.694775,0.428892,0.366294,0.375225
5,0.6861,1.124289,0.722273,0.438911,0.425169,0.420378
6,0.4539,1.147612,0.732356,0.481228,0.468063,0.464776
7,0.2742,1.220348,0.749771,0.561574,0.496009,0.512707
8,0.1665,1.260745,0.749771,0.59334,0.563486,0.55986
9,0.091,1.279524,0.748854,0.611302,0.546361,0.560083
10,0.0583,1.414242,0.749771,0.679918,0.625956,0.630483


[I 2025-03-15 10:37:46,517] Trial 36 finished with value: 0.6414846370170286 and parameters: {'learning_rate': 0.001150128932589517, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 27 with value: 0.6834764400359902.


Trial 37 with params: {'learning_rate': 0.004656443403526459, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6459,1.967594,0.505041,0.141338,0.159712,0.135117
2,1.4488,1.274781,0.696609,0.353181,0.337891,0.334389
3,0.7531,1.029787,0.748854,0.479844,0.465221,0.459466
4,0.3338,1.078017,0.76077,0.636425,0.555237,0.570291
5,0.1169,1.186399,0.777269,0.698093,0.662048,0.660967
6,0.038,1.243798,0.783685,0.742222,0.69993,0.697502
7,0.0169,1.331129,0.787351,0.719081,0.671697,0.680191
8,0.007,1.378413,0.787351,0.720993,0.69246,0.688894
9,0.0042,1.390759,0.79835,0.719674,0.69595,0.691416
10,0.0013,1.439446,0.800183,0.740672,0.686843,0.693238


[I 2025-03-15 10:40:14,191] Trial 37 finished with value: 0.7019492173211367 and parameters: {'learning_rate': 0.004656443403526459, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 37 with value: 0.7019492173211367.


Trial 38 with params: {'learning_rate': 1.162626851313962e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8974,3.881497,0.177819,0.013126,0.031264,0.014779
2,3.8659,3.848788,0.176902,0.003564,0.02,0.00605
3,3.8286,3.802206,0.176902,0.003538,0.02,0.006012
4,3.7686,3.730148,0.176902,0.003538,0.02,0.006012
5,3.6807,3.599063,0.176902,0.003538,0.02,0.006012
6,3.5138,3.421587,0.176902,0.003538,0.02,0.006012
7,3.3536,3.294755,0.176902,0.003538,0.02,0.006012
8,3.2731,3.234472,0.176902,0.003538,0.02,0.006012
9,3.2356,3.200701,0.176902,0.003538,0.02,0.006012
10,3.2016,3.175962,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 10:41:49,156] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.0047508295443596735, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6847,2.002564,0.507791,0.160033,0.155836,0.131362
2,1.5299,1.382102,0.656279,0.316667,0.31453,0.29642
3,0.8066,1.135335,0.746104,0.475947,0.458942,0.456485
4,0.3599,1.20188,0.750687,0.592701,0.53218,0.541113
5,0.1324,1.18947,0.779102,0.673077,0.651549,0.639631
6,0.0451,1.347876,0.782768,0.651241,0.635304,0.62247
7,0.0202,1.444215,0.773602,0.626516,0.650923,0.620643
8,0.0144,1.431618,0.791017,0.645892,0.660016,0.6401
9,0.011,1.453351,0.797434,0.684574,0.656672,0.656534
10,0.0029,1.536308,0.790101,0.689646,0.651627,0.65794


[I 2025-03-15 10:44:22,846] Trial 39 finished with value: 0.6843868484626101 and parameters: {'learning_rate': 0.0047508295443596735, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 37 with value: 0.7019492173211367.


Trial 40 with params: {'learning_rate': 0.00046334003557814836, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4172,2.995152,0.208066,0.019141,0.032364,0.019963
2,2.6877,2.409988,0.419798,0.072181,0.091636,0.062544
3,2.2094,2.050195,0.487626,0.105188,0.126762,0.100112
4,1.8615,1.763053,0.55912,0.135673,0.170935,0.143833
5,1.5705,1.562753,0.595784,0.2077,0.206994,0.187222
6,1.3038,1.417532,0.64253,0.2981,0.255366,0.250137
7,1.0862,1.320512,0.670027,0.33198,0.297043,0.299632
8,0.9217,1.263707,0.68286,0.368456,0.342527,0.342997
9,0.7453,1.203114,0.694775,0.409345,0.359859,0.364882
10,0.5959,1.186736,0.697525,0.425657,0.384475,0.392029


[I 2025-03-15 10:45:13,055] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.00487281657466962, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6764,1.994579,0.502291,0.192263,0.154077,0.140222
2,1.4983,1.333989,0.673694,0.332362,0.329552,0.322534
3,0.7901,1.074729,0.736022,0.440666,0.418012,0.415405
4,0.3467,1.15252,0.751604,0.584532,0.53877,0.546034
5,0.1229,1.308729,0.748854,0.676337,0.608951,0.616586
6,0.0513,1.363908,0.766269,0.652328,0.632621,0.620943
7,0.0339,1.332017,0.780935,0.703834,0.680301,0.677072
8,0.0122,1.426027,0.771769,0.645638,0.628356,0.619939
9,0.0034,1.496583,0.772686,0.677508,0.642132,0.643886
10,0.0012,1.477288,0.777269,0.684128,0.674342,0.664855


[I 2025-03-15 10:47:47,088] Trial 41 finished with value: 0.6736486913387111 and parameters: {'learning_rate': 0.00487281657466962, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 37 with value: 0.7019492173211367.


Trial 42 with params: {'learning_rate': 0.003958511362096176, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0309,2.064815,0.494042,0.121122,0.129221,0.104468
2,1.642,1.480734,0.626948,0.289252,0.274466,0.26749
3,0.9439,1.215329,0.716774,0.391878,0.384554,0.377198
4,0.4822,1.153457,0.747021,0.536592,0.5083,0.510633
5,0.2105,1.320992,0.759853,0.589426,0.554704,0.554567
6,0.092,1.409083,0.754354,0.594981,0.594379,0.573944
7,0.0484,1.474769,0.777269,0.655893,0.639784,0.632882
8,0.0149,1.522968,0.781852,0.643066,0.64644,0.631172
9,0.0084,1.610471,0.772686,0.613692,0.63079,0.608115
10,0.0045,1.632611,0.780018,0.641752,0.61528,0.61715


[I 2025-03-15 10:50:18,921] Trial 42 finished with value: 0.6345549129593345 and parameters: {'learning_rate': 0.003958511362096176, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 37 with value: 0.7019492173211367.


Trial 43 with params: {'learning_rate': 0.0039578660800806175, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7238,2.047573,0.47571,0.114358,0.133425,0.114077
2,1.6074,1.446213,0.619615,0.264429,0.283853,0.25767
3,0.9412,1.202585,0.72319,0.422702,0.395721,0.394859
4,0.4741,1.197691,0.739688,0.566825,0.480376,0.501055
5,0.2061,1.29944,0.756187,0.635862,0.557719,0.573634
6,0.0887,1.294795,0.780018,0.697826,0.659493,0.652439
7,0.0335,1.430921,0.774519,0.704946,0.634374,0.641875
8,0.0159,1.502868,0.786434,0.698745,0.64274,0.654102
9,0.0057,1.514334,0.784601,0.67679,0.649199,0.639707
10,0.0019,1.541939,0.794684,0.697341,0.663526,0.660681


[I 2025-03-15 10:52:47,426] Trial 43 finished with value: 0.656620544515706 and parameters: {'learning_rate': 0.0039578660800806175, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3}. Best is trial 37 with value: 0.7019492173211367.


Trial 44 with params: {'learning_rate': 0.0037112337292158548, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1093,2.138814,0.449129,0.105285,0.113202,0.094916
2,1.7052,1.485494,0.625115,0.291916,0.264342,0.251144
3,0.9945,1.162171,0.71769,0.387474,0.37739,0.37056
4,0.5192,1.073034,0.750687,0.564451,0.513417,0.529371
5,0.2135,1.192475,0.771769,0.645982,0.577358,0.589299
6,0.0888,1.324829,0.769936,0.633519,0.609895,0.605417
7,0.0535,1.414144,0.767186,0.674947,0.622278,0.626368
8,0.0202,1.42298,0.790101,0.696561,0.667248,0.666485
9,0.0052,1.512788,0.786434,0.690739,0.640491,0.646452
10,0.0022,1.549024,0.793767,0.6851,0.651121,0.654087


[I 2025-03-15 10:55:20,556] Trial 44 finished with value: 0.6780976096788537 and parameters: {'learning_rate': 0.0037112337292158548, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 4}. Best is trial 37 with value: 0.7019492173211367.


Trial 45 with params: {'learning_rate': 0.004796737777929377, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6829,1.9743,0.507791,0.149826,0.156415,0.131615
2,1.5052,1.375389,0.670944,0.331452,0.322368,0.308675
3,0.7947,1.107932,0.740605,0.483134,0.462848,0.46035
4,0.3426,1.129773,0.768103,0.6018,0.570815,0.573299
5,0.1221,1.271896,0.773602,0.717265,0.650557,0.657935
6,0.0396,1.39995,0.779102,0.681484,0.676983,0.658842
7,0.0185,1.460582,0.787351,0.751227,0.689908,0.701939
8,0.0138,1.476245,0.778185,0.672534,0.639891,0.634087
9,0.0061,1.466144,0.780935,0.710664,0.668642,0.672666
10,0.0015,1.498484,0.796517,0.732853,0.678722,0.686062


[I 2025-03-15 10:57:54,089] Trial 45 finished with value: 0.7120538118343862 and parameters: {'learning_rate': 0.004796737777929377, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 45 with value: 0.7120538118343862.


Trial 46 with params: {'learning_rate': 0.00133731003644267, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0931,2.44392,0.402383,0.057341,0.090645,0.064581
2,2.0577,1.786111,0.55637,0.172752,0.175221,0.149761
3,1.4935,1.387758,0.660862,0.307361,0.28397,0.274753
4,0.9898,1.22972,0.693859,0.430609,0.377832,0.379386
5,0.6484,1.152208,0.71769,0.452238,0.418315,0.412193
6,0.4144,1.168822,0.738772,0.54175,0.483784,0.498321
7,0.2465,1.198466,0.748854,0.624642,0.541865,0.564115
8,0.154,1.257821,0.75802,0.61473,0.584225,0.586115
9,0.0879,1.359181,0.753437,0.581201,0.556546,0.557408
10,0.0473,1.483208,0.737855,0.607647,0.581218,0.583378


[I 2025-03-15 11:00:28,466] Trial 46 finished with value: 0.5863220053041373 and parameters: {'learning_rate': 0.00133731003644267, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 45 with value: 0.7120538118343862.


Trial 47 with params: {'learning_rate': 0.002464718548070002, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8261,2.078651,0.468378,0.108602,0.12331,0.105097
2,1.6801,1.475827,0.64253,0.274889,0.273174,0.255261
3,1.0419,1.177758,0.703941,0.396442,0.382322,0.374568
4,0.5714,1.133904,0.724106,0.480939,0.455484,0.459706
5,0.2868,1.10934,0.771769,0.631752,0.575654,0.583731
6,0.1467,1.325708,0.759853,0.643313,0.560308,0.579898
7,0.0722,1.335049,0.779102,0.67677,0.629443,0.634913
8,0.0259,1.431802,0.783685,0.682424,0.650644,0.652934
9,0.0172,1.475921,0.778185,0.664949,0.644204,0.635444
10,0.0058,1.544701,0.780018,0.674341,0.618014,0.629367


[I 2025-03-15 11:02:09,856] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 8.153679865827409e-05, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7978,3.608582,0.176902,0.003538,0.02,0.006012
2,3.3616,3.242942,0.176902,0.003538,0.02,0.006012
3,3.238,3.151965,0.176902,0.003538,0.02,0.006012
4,3.1229,3.068912,0.228231,0.017039,0.035432,0.01911
5,3.0614,2.978385,0.217232,0.051856,0.034837,0.024896
6,2.9375,2.871765,0.302475,0.045186,0.057101,0.043996
7,2.8275,2.788878,0.318057,0.045534,0.061084,0.04185
8,2.7577,2.713018,0.351054,0.038324,0.069329,0.043787
9,2.664,2.615598,0.367553,0.043687,0.074666,0.049917
10,2.5768,2.539274,0.396884,0.043208,0.082133,0.0542


[I 2025-03-15 11:03:48,935] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.003458590909804615, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8244,2.054361,0.472961,0.109803,0.127629,0.108912
2,1.6575,1.486127,0.633364,0.288162,0.282487,0.266151
3,1.0123,1.172433,0.71769,0.375571,0.38853,0.378051
4,0.5178,1.231081,0.726856,0.531161,0.494363,0.498461
5,0.2528,1.298534,0.746104,0.591958,0.538405,0.541105
6,0.1523,1.287991,0.76352,0.641577,0.608761,0.603245
7,0.0608,1.436684,0.767186,0.635106,0.626684,0.615798
8,0.0223,1.508611,0.784601,0.6866,0.668366,0.662288
9,0.0095,1.638588,0.769019,0.638744,0.600907,0.604929
10,0.0053,1.666211,0.772686,0.656125,0.634279,0.630371


[I 2025-03-15 11:06:23,635] Trial 49 finished with value: 0.6630281634747086 and parameters: {'learning_rate': 0.003458590909804615, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2}. Best is trial 45 with value: 0.7120538118343862.


Trial 50 with params: {'learning_rate': 3.464408931098879e-05, 'weight_decay': 0.003, 'adam_beta1': 0.97, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8754,3.825886,0.176902,0.003538,0.02,0.006012
2,3.7599,3.672247,0.176902,0.003538,0.02,0.006012
3,3.5371,3.356167,0.176902,0.003538,0.02,0.006012
4,3.2508,3.204086,0.176902,0.003538,0.02,0.006012
5,3.2048,3.137464,0.176902,0.003538,0.02,0.006012
6,3.1327,3.096572,0.176902,0.003538,0.02,0.006012
7,3.0818,3.038732,0.176902,0.003548,0.02,0.006027
8,3.0263,2.966866,0.273144,0.030452,0.045329,0.030712
9,2.9462,2.900954,0.329056,0.028863,0.060493,0.038331
10,2.8732,2.84995,0.361137,0.03932,0.07012,0.042991


[I 2025-03-15 11:07:12,712] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.002551926520547476, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.92,2.236063,0.437214,0.090272,0.106919,0.087833
2,1.8516,1.61097,0.606783,0.221775,0.253401,0.231645
3,1.2731,1.377371,0.67736,0.358917,0.335118,0.328347
4,0.7955,1.244831,0.706691,0.414453,0.380704,0.379273
5,0.461,1.2121,0.736939,0.545948,0.489187,0.503827
6,0.2447,1.324004,0.751604,0.596179,0.550617,0.552259
7,0.1229,1.380583,0.773602,0.664589,0.590667,0.601557
8,0.0658,1.633115,0.762603,0.621528,0.543694,0.56166
9,0.0356,1.608434,0.750687,0.61003,0.584463,0.578997
10,0.0177,1.62743,0.761687,0.665085,0.621857,0.626352


[I 2025-03-15 11:08:57,123] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 0.004185238693319757, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6299,1.951074,0.516957,0.19793,0.165123,0.151279
2,1.4077,1.362797,0.68561,0.341248,0.343447,0.32686
3,0.7551,1.045849,0.747938,0.475446,0.469137,0.453222
4,0.3462,1.032779,0.767186,0.611864,0.543536,0.556504
5,0.1253,1.177574,0.778185,0.722048,0.637631,0.64516
6,0.0448,1.228037,0.792851,0.712734,0.694676,0.679404
7,0.0156,1.343396,0.793767,0.721282,0.679635,0.677917
8,0.0088,1.414463,0.802016,0.742027,0.687984,0.694342
9,0.0053,1.384252,0.806599,0.742667,0.701386,0.706042
10,0.0026,1.3979,0.808433,0.750848,0.710031,0.712598


[I 2025-03-15 11:11:32,701] Trial 52 finished with value: 0.7267722601596618 and parameters: {'learning_rate': 0.004185238693319757, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 53 with params: {'learning_rate': 0.004225827160310881, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6999,1.971575,0.516957,0.138252,0.150739,0.123254
2,1.519,1.311838,0.67461,0.344178,0.317034,0.302686
3,0.8413,1.135536,0.738772,0.464983,0.431482,0.434376
4,0.405,1.218328,0.733272,0.525034,0.504581,0.506107
5,0.1593,1.22111,0.769019,0.632176,0.589,0.5877
6,0.0561,1.355188,0.774519,0.624677,0.612224,0.606826
7,0.0273,1.443758,0.782768,0.696565,0.674126,0.663593
8,0.0135,1.51103,0.775435,0.67424,0.655211,0.648503
9,0.0094,1.500841,0.775435,0.705844,0.67198,0.670831
10,0.0022,1.578868,0.785518,0.705832,0.661908,0.666849


[I 2025-03-15 11:14:02,278] Trial 53 finished with value: 0.6743983045730091 and parameters: {'learning_rate': 0.004225827160310881, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 54 with params: {'learning_rate': 0.004622954483422155, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6824,1.995079,0.51604,0.148763,0.155372,0.131891
2,1.5552,1.445528,0.629698,0.258815,0.287806,0.260973
3,0.8548,1.122311,0.72594,0.471172,0.426402,0.428629
4,0.4049,1.194557,0.750687,0.601536,0.539492,0.54885
5,0.1537,1.29542,0.762603,0.653833,0.620862,0.614168
6,0.0535,1.35721,0.786434,0.691236,0.673342,0.664575
7,0.017,1.485995,0.785518,0.696922,0.65087,0.656839
8,0.008,1.573864,0.772686,0.674806,0.66336,0.653486
9,0.0038,1.635554,0.787351,0.722855,0.671081,0.676622
10,0.003,1.607075,0.778185,0.678383,0.660446,0.659127


[I 2025-03-15 11:15:42,816] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 0.004071127607426018, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6824,1.966617,0.516957,0.143831,0.154845,0.132059
2,1.4452,1.298712,0.690192,0.353015,0.326724,0.32275
3,0.7708,1.088679,0.744271,0.43081,0.423945,0.413011
4,0.3537,1.146968,0.741522,0.615552,0.543561,0.559814
5,0.1449,1.241725,0.752521,0.685649,0.632569,0.627605
6,0.061,1.310309,0.774519,0.674772,0.630861,0.634354
7,0.022,1.331125,0.783685,0.687093,0.684202,0.671293
8,0.0073,1.421131,0.786434,0.699143,0.683281,0.679288
9,0.004,1.46215,0.788268,0.709763,0.693416,0.691525
10,0.0013,1.448879,0.791017,0.695171,0.678626,0.676303


[I 2025-03-15 11:18:17,703] Trial 55 finished with value: 0.6827088081037271 and parameters: {'learning_rate': 0.004071127607426018, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 56 with params: {'learning_rate': 0.003975649535190986, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8051,1.989511,0.505958,0.143063,0.142446,0.119544
2,1.5025,1.356836,0.68011,0.328559,0.319841,0.312463
3,0.809,1.10794,0.736939,0.402233,0.416082,0.401429
4,0.3907,1.124147,0.740605,0.614383,0.525237,0.542818
5,0.1632,1.145064,0.76077,0.651082,0.610372,0.609686
6,0.0707,1.210656,0.778185,0.68977,0.649034,0.651924
7,0.025,1.306275,0.783685,0.673902,0.65672,0.649489
8,0.0106,1.359998,0.788268,0.687384,0.682853,0.668502
9,0.0058,1.414076,0.784601,0.681675,0.670117,0.661193
10,0.0025,1.437187,0.79835,0.732323,0.677617,0.68675


[I 2025-03-15 11:20:56,162] Trial 56 finished with value: 0.6787268629496385 and parameters: {'learning_rate': 0.003975649535190986, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 57 with params: {'learning_rate': 0.004344662021773019, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6418,1.957918,0.517874,0.175954,0.16202,0.14018
2,1.4814,1.280376,0.691109,0.350416,0.327726,0.321934
3,0.7949,1.119417,0.741522,0.438893,0.430506,0.427246
4,0.3655,1.13759,0.752521,0.578776,0.522645,0.525874
5,0.1361,1.19018,0.782768,0.746447,0.686071,0.686307
6,0.049,1.282331,0.772686,0.708454,0.674349,0.663306
7,0.022,1.356378,0.775435,0.702452,0.670064,0.661997
8,0.0111,1.397045,0.780018,0.732286,0.674174,0.680496
9,0.0056,1.42534,0.792851,0.722495,0.664458,0.675701
10,0.0025,1.420475,0.796517,0.720441,0.660512,0.669635


[I 2025-03-15 11:23:36,525] Trial 57 finished with value: 0.6991205266834366 and parameters: {'learning_rate': 0.004344662021773019, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 58 with params: {'learning_rate': 0.001023389781692773, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1884,2.587516,0.366636,0.056887,0.080661,0.05778
2,2.2155,1.948061,0.528873,0.144191,0.155398,0.130802
3,1.686,1.53656,0.603116,0.234244,0.208098,0.191622
4,1.2198,1.32699,0.666361,0.347022,0.31882,0.320244
5,0.8812,1.174481,0.693859,0.386874,0.36644,0.362302
6,0.5966,1.137944,0.727773,0.443253,0.414816,0.420763
7,0.3828,1.145029,0.757104,0.566516,0.495375,0.516043
8,0.2569,1.150941,0.754354,0.562212,0.542007,0.53439
9,0.145,1.251485,0.762603,0.645547,0.550969,0.575641
10,0.1013,1.303253,0.749771,0.620932,0.582331,0.583829


[I 2025-03-15 11:25:20,715] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.0014384211441472192, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0798,2.371388,0.415215,0.102909,0.097108,0.0735
2,1.9931,1.728521,0.569203,0.187823,0.182227,0.161555
3,1.4219,1.34778,0.665445,0.309272,0.291532,0.28411
4,0.9135,1.233582,0.696609,0.450387,0.392672,0.401404
5,0.5749,1.084659,0.745188,0.489092,0.446671,0.449906
6,0.3644,1.234059,0.740605,0.541877,0.504466,0.509368
7,0.2112,1.239137,0.756187,0.630045,0.545946,0.565408
8,0.1244,1.31503,0.757104,0.617382,0.577807,0.571962
9,0.0635,1.399946,0.758937,0.654263,0.576974,0.596545
10,0.0377,1.401762,0.751604,0.629562,0.591874,0.594017


[I 2025-03-15 11:26:15,226] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.004041728132425059, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9854,2.095009,0.482126,0.110775,0.128523,0.107273
2,1.6588,1.443637,0.63428,0.283137,0.284604,0.269109
3,0.9372,1.182489,0.721357,0.414429,0.381409,0.381242
4,0.4851,1.175032,0.730522,0.494694,0.469554,0.459664
5,0.219,1.302521,0.762603,0.632391,0.579067,0.585928
6,0.1002,1.357746,0.774519,0.637969,0.597312,0.598572
7,0.0415,1.424109,0.773602,0.632865,0.627596,0.614506
8,0.0183,1.523935,0.791934,0.647736,0.610702,0.608754
9,0.0066,1.595605,0.791017,0.655337,0.623061,0.625317
10,0.0033,1.576288,0.796517,0.670657,0.672727,0.6522


[I 2025-03-15 11:27:04,928] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.003689966984100666, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0888,2.132041,0.459212,0.105511,0.118326,0.097339
2,1.6905,1.43972,0.633364,0.33174,0.278509,0.278082
3,0.9961,1.154027,0.727773,0.441927,0.389273,0.390205
4,0.5025,1.108257,0.736939,0.534073,0.494234,0.501752
5,0.2075,1.182435,0.769019,0.636164,0.588658,0.59887
6,0.0847,1.31561,0.776352,0.676087,0.625682,0.630214
7,0.0459,1.288207,0.783685,0.691937,0.66433,0.657318
8,0.0201,1.38557,0.785518,0.687165,0.634318,0.635067
9,0.0098,1.477841,0.774519,0.688657,0.639254,0.638957
10,0.0039,1.517162,0.783685,0.678897,0.687249,0.663358


[I 2025-03-15 11:28:48,679] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.0042152655031590484, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6261,1.9362,0.513291,0.159168,0.157348,0.13745
2,1.4022,1.354235,0.681027,0.338178,0.335291,0.323743
3,0.7473,1.029035,0.764436,0.490478,0.484893,0.473727
4,0.3374,1.072345,0.75802,0.616193,0.55123,0.564214
5,0.1359,1.234124,0.776352,0.715007,0.634012,0.65227
6,0.0548,1.245522,0.791934,0.721111,0.67191,0.676738
7,0.0237,1.330548,0.778185,0.689014,0.668012,0.65858
8,0.0115,1.351984,0.794684,0.736796,0.700946,0.698142
9,0.0055,1.36755,0.794684,0.717352,0.66098,0.671273
10,0.009,1.388793,0.791934,0.713627,0.659148,0.669295


[I 2025-03-15 11:31:17,288] Trial 62 finished with value: 0.6765475469128305 and parameters: {'learning_rate': 0.0042152655031590484, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 63 with params: {'learning_rate': 0.003315872465888738, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0247,2.134299,0.452796,0.103777,0.115509,0.099893
2,1.7036,1.413259,0.655362,0.303518,0.295627,0.282201
3,0.9972,1.185978,0.716774,0.435969,0.401034,0.395541
4,0.5341,1.14919,0.734189,0.524943,0.468643,0.484131
5,0.2463,1.308288,0.731439,0.611109,0.542305,0.543522
6,0.1317,1.293741,0.774519,0.625743,0.603103,0.601699
7,0.0599,1.451576,0.770852,0.698593,0.630837,0.650625
8,0.0202,1.484747,0.767186,0.691318,0.63821,0.650081
9,0.0072,1.634074,0.766269,0.720827,0.637962,0.66418
10,0.003,1.574589,0.777269,0.704377,0.661986,0.6681


[I 2025-03-15 11:33:07,888] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 0.004016823842231186, 'weight_decay': 0.005, 'adam_beta1': 0.99, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.817,2.138553,0.464711,0.099159,0.121419,0.103527
2,1.8243,1.684291,0.571952,0.186468,0.201544,0.182412
3,1.2578,1.434356,0.651696,0.349997,0.330457,0.316578
4,0.8279,1.381103,0.699358,0.429383,0.387388,0.382172
5,0.4859,1.433731,0.732356,0.519487,0.474713,0.481573


[I 2025-03-15 11:33:33,117] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.004101136362812212, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7136,1.984903,0.507791,0.130451,0.147785,0.124677
2,1.4739,1.334681,0.669111,0.352822,0.319171,0.317136
3,0.804,1.079648,0.747938,0.471048,0.441229,0.440379
4,0.369,1.052835,0.758937,0.607039,0.543331,0.557513
5,0.1402,1.19614,0.769019,0.638893,0.613676,0.605118
6,0.0459,1.35698,0.768103,0.689835,0.630252,0.634391
7,0.0315,1.40344,0.775435,0.667032,0.632133,0.632012
8,0.0122,1.371811,0.781852,0.654617,0.646861,0.629766
9,0.0079,1.463572,0.784601,0.710515,0.656393,0.668327
10,0.0028,1.495466,0.788268,0.709884,0.653903,0.667432


[I 2025-03-15 11:36:11,432] Trial 65 finished with value: 0.6774091124564136 and parameters: {'learning_rate': 0.004101136362812212, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3}. Best is trial 52 with value: 0.7267722601596618.


Trial 66 with params: {'learning_rate': 9.390005154183661e-05, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7872,3.549056,0.176902,0.003538,0.02,0.006012
2,3.3148,3.217006,0.176902,0.003538,0.02,0.006012
3,3.1639,3.082475,0.208983,0.012362,0.029589,0.015455
4,3.0387,2.966349,0.228231,0.047484,0.037774,0.028836
5,2.9305,2.826618,0.320807,0.042382,0.060808,0.041509
6,2.7599,2.694278,0.378552,0.041112,0.078537,0.051595
7,2.6297,2.56331,0.396884,0.040887,0.083596,0.053835
8,2.5137,2.462738,0.413382,0.071783,0.088851,0.058151
9,2.4013,2.357749,0.423465,0.057349,0.091746,0.062876
10,2.3011,2.269965,0.442713,0.072482,0.101117,0.074942


[I 2025-03-15 11:37:55,074] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.004836330677324087, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6647,1.986629,0.497709,0.175913,0.162747,0.149589
2,1.475,1.320187,0.684693,0.344091,0.333839,0.323671
3,0.7751,1.088961,0.749771,0.479633,0.450972,0.44836
4,0.3427,1.184921,0.753437,0.653891,0.56447,0.584207
5,0.1264,1.230666,0.769936,0.639165,0.572008,0.588033
6,0.0429,1.41629,0.775435,0.684724,0.607517,0.61936
7,0.0183,1.463247,0.773602,0.663899,0.627039,0.630761
8,0.0068,1.556715,0.765353,0.643501,0.637456,0.619522
9,0.0019,1.578109,0.771769,0.625282,0.63847,0.617328
10,0.0011,1.629964,0.773602,0.677833,0.62787,0.631489


[I 2025-03-15 11:38:43,849] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.004883439708041328, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6781,2.008176,0.491292,0.185242,0.150671,0.138463
2,1.5157,1.359141,0.658112,0.312265,0.316448,0.30498
3,0.8078,1.111908,0.72594,0.431802,0.431423,0.422352
4,0.36,1.182284,0.739688,0.558055,0.50807,0.519927
5,0.1338,1.239334,0.779102,0.664594,0.633957,0.632507
6,0.0439,1.432907,0.771769,0.687332,0.659164,0.648156
7,0.0222,1.46202,0.772686,0.700532,0.662902,0.666275
8,0.0096,1.525809,0.775435,0.682961,0.662004,0.651121
9,0.0064,1.542971,0.788268,0.702494,0.664039,0.666672
10,0.0077,1.522474,0.780018,0.690979,0.671575,0.663769


[I 2025-03-15 11:40:23,210] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.001106362141094799, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1122,2.453805,0.402383,0.063582,0.091162,0.066984
2,2.075,1.846325,0.537122,0.162165,0.158139,0.135837
3,1.5473,1.413086,0.647113,0.268178,0.257508,0.24617
4,1.0402,1.215268,0.693859,0.416113,0.365237,0.370573
5,0.7145,1.119481,0.716774,0.447611,0.412868,0.413077
6,0.4864,1.110985,0.726856,0.483812,0.458511,0.460003
7,0.2939,1.184193,0.740605,0.601912,0.51898,0.539189
8,0.1772,1.244262,0.749771,0.566392,0.561681,0.549003
9,0.1042,1.263136,0.742438,0.580928,0.534267,0.539427
10,0.0684,1.401265,0.744271,0.628545,0.603138,0.595198


[I 2025-03-15 11:42:58,209] Trial 69 finished with value: 0.6336391200098599 and parameters: {'learning_rate': 0.001106362141094799, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 3}. Best is trial 52 with value: 0.7267722601596618.


Trial 70 with params: {'learning_rate': 0.004415863202037318, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8362,1.986824,0.504125,0.137178,0.143153,0.117249
2,1.54,1.391392,0.651696,0.349415,0.305629,0.307579
3,0.8781,1.127702,0.736939,0.411545,0.403549,0.397893
4,0.4227,1.10934,0.751604,0.57001,0.544573,0.54075
5,0.1876,1.225412,0.75527,0.681728,0.635667,0.628495
6,0.0727,1.269253,0.771769,0.638333,0.625124,0.609432
7,0.0339,1.405814,0.771769,0.674543,0.649567,0.634169
8,0.019,1.459715,0.780018,0.70708,0.66485,0.666756
9,0.0137,1.448785,0.778185,0.689999,0.690587,0.67657
10,0.0052,1.529253,0.791017,0.692913,0.664919,0.667306


[I 2025-03-15 11:45:29,772] Trial 70 finished with value: 0.6844390706930475 and parameters: {'learning_rate': 0.004415863202037318, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 3}. Best is trial 52 with value: 0.7267722601596618.


Trial 71 with params: {'learning_rate': 0.003171960832959758, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7181,2.021694,0.505041,0.119734,0.146278,0.120713
2,1.5908,1.377347,0.659028,0.318469,0.313311,0.288099
3,0.9306,1.111618,0.728689,0.440431,0.407106,0.407284
4,0.4663,1.159917,0.729606,0.553284,0.485206,0.497713
5,0.2191,1.187419,0.749771,0.663924,0.602328,0.60615
6,0.0871,1.357284,0.779102,0.704568,0.682744,0.678164
7,0.035,1.48866,0.772686,0.666035,0.639722,0.634137
8,0.0137,1.490249,0.783685,0.702857,0.662355,0.657304
9,0.0069,1.546169,0.770852,0.679539,0.673437,0.660052
10,0.005,1.574914,0.788268,0.683971,0.684328,0.667765


[I 2025-03-15 11:47:09,852] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.004585858395117119, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6635,1.97233,0.52154,0.152121,0.156169,0.12853
2,1.5256,1.408153,0.667278,0.32442,0.320879,0.300764
3,0.8324,1.127598,0.741522,0.488897,0.433495,0.440832
4,0.3692,1.132629,0.75527,0.612003,0.556127,0.567212
5,0.1308,1.246168,0.75802,0.647132,0.604172,0.600488
6,0.0577,1.282829,0.782768,0.698369,0.684102,0.67036
7,0.0285,1.324149,0.792851,0.713578,0.702506,0.688439
8,0.009,1.388942,0.785518,0.700355,0.685241,0.675126
9,0.0026,1.451747,0.804766,0.728624,0.682071,0.687472
10,0.0007,1.510259,0.791017,0.702325,0.673155,0.672692


[I 2025-03-15 11:49:51,964] Trial 72 finished with value: 0.6861994403416486 and parameters: {'learning_rate': 0.004585858395117119, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 73 with params: {'learning_rate': 0.001850087735502538, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9909,2.278877,0.427131,0.086055,0.104671,0.083583
2,1.898,1.63247,0.587534,0.221844,0.210893,0.197394
3,1.2871,1.292698,0.68561,0.346206,0.331191,0.326718
4,0.7935,1.170794,0.710357,0.434707,0.408233,0.40932
5,0.4811,1.170194,0.737855,0.485005,0.457394,0.458999
6,0.3059,1.236163,0.738772,0.521679,0.5168,0.506763
7,0.1655,1.302784,0.757104,0.680888,0.568404,0.594313
8,0.081,1.328823,0.775435,0.674025,0.623945,0.626684
9,0.0512,1.481633,0.758937,0.650214,0.591494,0.603868
10,0.0288,1.501896,0.759853,0.65523,0.615204,0.61774


[I 2025-03-15 11:50:40,705] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.0031198011941025628, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7606,2.01158,0.502291,0.110891,0.140505,0.116479
2,1.6054,1.42623,0.630614,0.297153,0.278501,0.265404
3,0.9461,1.15646,0.71494,0.418443,0.396835,0.38567
4,0.5111,1.127852,0.742438,0.532839,0.5147,0.509159
5,0.2407,1.198237,0.747938,0.585776,0.551243,0.547235
6,0.1242,1.200628,0.780935,0.662276,0.631564,0.631181
7,0.0469,1.294861,0.781852,0.650439,0.664238,0.637329
8,0.0207,1.419976,0.761687,0.678598,0.631583,0.635087
9,0.0121,1.413437,0.778185,0.691778,0.642802,0.653751
10,0.0073,1.46456,0.772686,0.671751,0.64745,0.647685


[I 2025-03-15 11:53:18,451] Trial 74 finished with value: 0.6552996388619845 and parameters: {'learning_rate': 0.0031198011941025628, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 3}. Best is trial 52 with value: 0.7267722601596618.


Trial 75 with params: {'learning_rate': 0.0020321517514322756, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0004,2.248621,0.433547,0.116552,0.106053,0.09006
2,1.8409,1.623539,0.584785,0.220825,0.209668,0.193449
3,1.228,1.265553,0.699358,0.363242,0.348186,0.342257
4,0.7425,1.151308,0.719523,0.438979,0.425515,0.422612
5,0.4052,1.179971,0.739688,0.524256,0.490654,0.488131


[I 2025-03-15 11:53:44,337] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 0.0038547220232449226, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0896,2.134667,0.44363,0.117656,0.111346,0.094668
2,1.7128,1.483256,0.624198,0.293659,0.265926,0.249099
3,1.0073,1.165752,0.727773,0.38314,0.385285,0.376506
4,0.5002,1.130177,0.741522,0.540169,0.490557,0.500901
5,0.2178,1.23307,0.751604,0.600006,0.578494,0.562507
6,0.0896,1.297144,0.75802,0.626206,0.592948,0.592899
7,0.0379,1.45645,0.76352,0.624924,0.595203,0.588429
8,0.0188,1.434351,0.777269,0.621559,0.615059,0.603266
9,0.0106,1.587537,0.777269,0.650997,0.611952,0.612194
10,0.0074,1.592287,0.780018,0.628682,0.634845,0.612653


[I 2025-03-15 11:56:21,969] Trial 76 finished with value: 0.6519187814023022 and parameters: {'learning_rate': 0.0038547220232449226, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 77 with params: {'learning_rate': 0.0015195294514735384, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0349,2.410975,0.420715,0.063133,0.099866,0.072219
2,2.0692,1.835309,0.547204,0.17379,0.16973,0.145177
3,1.5352,1.485196,0.630614,0.297182,0.262159,0.263157
4,1.0611,1.286627,0.693859,0.356002,0.348988,0.342229
5,0.694,1.186414,0.713107,0.413226,0.403291,0.394483
6,0.4206,1.17906,0.734189,0.493009,0.489726,0.480566
7,0.2516,1.296251,0.752521,0.595817,0.522375,0.541549
8,0.142,1.347175,0.757104,0.593699,0.564364,0.561305
9,0.079,1.45785,0.758937,0.626614,0.586071,0.587288
10,0.0431,1.511542,0.759853,0.610314,0.577562,0.578589


[I 2025-03-15 11:58:53,888] Trial 77 finished with value: 0.6304299912320456 and parameters: {'learning_rate': 0.0015195294514735384, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 1}. Best is trial 52 with value: 0.7267722601596618.


Trial 78 with params: {'learning_rate': 0.003535680692569326, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.103,2.153384,0.450046,0.097856,0.11633,0.099044
2,1.7399,1.497211,0.626948,0.281471,0.267298,0.254882
3,1.0298,1.167118,0.724106,0.383686,0.392131,0.378618
4,0.5436,1.170529,0.724106,0.470595,0.441891,0.444826
5,0.2447,1.204588,0.769936,0.6132,0.555558,0.559208
6,0.1189,1.275123,0.778185,0.635139,0.600297,0.600697
7,0.0557,1.388478,0.776352,0.669133,0.601659,0.618288
8,0.0367,1.503814,0.75802,0.616305,0.620834,0.586626
9,0.0187,1.427226,0.782768,0.663006,0.644845,0.641066
10,0.0066,1.488725,0.789184,0.670042,0.643733,0.641811


[I 2025-03-15 12:00:40,204] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.0031750056555660045, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9408,2.176831,0.44363,0.09565,0.111518,0.095449
2,1.7661,1.520432,0.615032,0.258115,0.255577,0.243294
3,1.1117,1.205393,0.709441,0.367317,0.377714,0.36132
4,0.6008,1.167966,0.733272,0.49566,0.456073,0.458328
5,0.2966,1.210934,0.764436,0.607508,0.522232,0.537673


[I 2025-03-15 12:01:04,079] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 0.004970727576065037, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5589,1.87519,0.527956,0.200375,0.181947,0.167278
2,1.388,1.265324,0.68561,0.33576,0.342386,0.328562
3,0.6956,1.021049,0.765353,0.549403,0.509164,0.510449
4,0.3096,1.129094,0.762603,0.61636,0.568192,0.571614
5,0.0995,1.213535,0.779102,0.670185,0.63254,0.619631
6,0.0275,1.356086,0.783685,0.676768,0.647986,0.645833
7,0.0154,1.409681,0.784601,0.712706,0.67341,0.6774
8,0.0164,1.415169,0.791017,0.717641,0.669889,0.679137
9,0.0043,1.433751,0.794684,0.697336,0.656185,0.66251
10,0.0013,1.428154,0.797434,0.687703,0.666341,0.663568


[I 2025-03-15 12:03:38,582] Trial 80 finished with value: 0.6584063338093696 and parameters: {'learning_rate': 0.004970727576065037, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 2}. Best is trial 52 with value: 0.7267722601596618.


Trial 81 with params: {'learning_rate': 0.0015417645761537859, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0204,2.274568,0.419798,0.10689,0.098905,0.081512
2,1.8917,1.704259,0.568286,0.208521,0.184435,0.166286
3,1.3199,1.326243,0.672777,0.328581,0.301555,0.296515
4,0.8275,1.205367,0.701192,0.476982,0.397035,0.410828
5,0.5032,1.117265,0.72594,0.458116,0.440561,0.436267
6,0.3092,1.202323,0.738772,0.527112,0.484832,0.492796
7,0.1944,1.269383,0.744271,0.62226,0.514252,0.542292
8,0.0925,1.3357,0.761687,0.635244,0.596691,0.599847
9,0.0522,1.492396,0.751604,0.646967,0.591908,0.597542
10,0.0353,1.513332,0.754354,0.662638,0.624466,0.625887


[I 2025-03-15 12:04:34,188] Trial 81 pruned. 


Trial 82 with params: {'learning_rate': 0.004761738090422008, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9456,2.082433,0.47846,0.129259,0.131022,0.110469
2,1.6006,1.473645,0.63703,0.308281,0.27996,0.275969
3,0.9315,1.151576,0.729606,0.430953,0.418382,0.414144
4,0.4489,1.127035,0.754354,0.526679,0.504749,0.507299
5,0.1869,1.261245,0.764436,0.637772,0.598478,0.588591
6,0.0798,1.268846,0.775435,0.653096,0.627902,0.620962
7,0.0381,1.429187,0.780935,0.657462,0.660739,0.638747
8,0.018,1.431765,0.785518,0.686956,0.664306,0.657801
9,0.009,1.479229,0.776352,0.691138,0.668862,0.657788
10,0.003,1.500869,0.793767,0.685842,0.665565,0.656692


[I 2025-03-15 12:05:27,001] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 0.004118426005718762, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.679,2.018876,0.510541,0.156781,0.154026,0.127884
2,1.465,1.295377,0.681943,0.335305,0.323402,0.314139
3,0.7822,1.043971,0.752521,0.478497,0.44251,0.444175
4,0.3553,1.069217,0.762603,0.619057,0.558778,0.573526
5,0.1311,1.176289,0.780018,0.70848,0.644048,0.654295
6,0.0513,1.220455,0.786434,0.688261,0.656385,0.655209
7,0.0323,1.3881,0.769019,0.681347,0.662971,0.654318
8,0.0137,1.378871,0.790101,0.696763,0.678422,0.669326
9,0.0058,1.45709,0.785518,0.685164,0.64399,0.647248
10,0.0027,1.476991,0.791934,0.691455,0.657276,0.661499


[I 2025-03-15 12:08:02,044] Trial 83 finished with value: 0.6549840905583869 and parameters: {'learning_rate': 0.004118426005718762, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 84 with params: {'learning_rate': 0.00035402800746304916, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4113,3.052519,0.2044,0.019455,0.030909,0.019099
2,2.8176,2.577212,0.381302,0.044799,0.078792,0.052823
3,2.3857,2.207757,0.450962,0.087342,0.103501,0.078133
4,2.0709,1.948735,0.522456,0.109925,0.149098,0.124701
5,1.8177,1.769327,0.568286,0.186465,0.179041,0.157905
6,1.5938,1.628682,0.579285,0.187227,0.189176,0.171379
7,1.3967,1.505518,0.614115,0.25433,0.222697,0.217591
8,1.2386,1.410899,0.643446,0.277409,0.261768,0.253774
9,1.0584,1.3459,0.662695,0.305702,0.294268,0.290488
10,0.9071,1.304479,0.659028,0.344941,0.313274,0.315769


[I 2025-03-15 12:08:52,313] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.004467253697261772, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8386,1.976374,0.508708,0.159455,0.145455,0.120298
2,1.504,1.377715,0.658112,0.337241,0.31029,0.310982
3,0.8303,1.099687,0.732356,0.431919,0.423488,0.413403
4,0.3898,1.091679,0.749771,0.621116,0.538971,0.557333
5,0.1594,1.219531,0.771769,0.670245,0.616408,0.612624
6,0.0648,1.259828,0.787351,0.658783,0.643999,0.633184
7,0.0372,1.387508,0.779102,0.676564,0.66796,0.652626
8,0.009,1.47891,0.782768,0.666893,0.659677,0.643865
9,0.004,1.476238,0.790101,0.669126,0.67153,0.657683
10,0.0013,1.506292,0.785518,0.633076,0.654424,0.631881


[I 2025-03-15 12:09:42,754] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 1.6562808358868146e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8936,3.871224,0.175985,0.005174,0.021241,0.008309
2,3.8488,3.823352,0.176902,0.003538,0.02,0.006012
3,3.7913,3.74749,0.176902,0.003538,0.02,0.006012
4,3.6854,3.611477,0.176902,0.003538,0.02,0.006012
5,3.5255,3.39432,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 12:10:09,883] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.003914046980570814, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.064,2.078534,0.473877,0.109152,0.122389,0.101355
2,1.6722,1.486483,0.619615,0.271797,0.261435,0.246543
3,0.9647,1.172975,0.722273,0.435827,0.386751,0.387706
4,0.4949,1.201367,0.729606,0.514024,0.472611,0.481137
5,0.2141,1.25777,0.748854,0.600301,0.580631,0.573614
6,0.0836,1.392616,0.765353,0.665358,0.636853,0.63099
7,0.0378,1.486106,0.771769,0.662285,0.647073,0.63621
8,0.0177,1.508029,0.773602,0.65398,0.64125,0.632792
9,0.0088,1.618502,0.767186,0.672793,0.649645,0.639324
10,0.0033,1.53445,0.786434,0.648776,0.653066,0.635867


[I 2025-03-15 12:12:57,385] Trial 87 finished with value: 0.6573897883445775 and parameters: {'learning_rate': 0.003914046980570814, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 88 with params: {'learning_rate': 0.00390423012235986, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9851,2.06585,0.47846,0.117552,0.127337,0.102671
2,1.6266,1.49122,0.631531,0.28847,0.274051,0.261868
3,0.8957,1.129693,0.731439,0.41783,0.437301,0.418272
4,0.4447,1.093231,0.761687,0.535925,0.484816,0.498719
5,0.1691,1.285125,0.769936,0.623674,0.612341,0.596227
6,0.0767,1.239858,0.780935,0.693589,0.667016,0.655224
7,0.0288,1.413311,0.775435,0.687686,0.669544,0.653562
8,0.0182,1.512277,0.782768,0.667338,0.667085,0.639846
9,0.0095,1.451097,0.790101,0.693218,0.66821,0.661143
10,0.0037,1.480153,0.788268,0.698122,0.693451,0.672982


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-15 12:15:15,553] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 2.588158862083385e-05, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8829,3.848093,0.176902,0.003538,0.02,0.006012
2,3.8114,3.769159,0.176902,0.003538,0.02,0.006012
3,3.7169,3.646375,0.176902,0.003538,0.02,0.006012
4,3.5569,3.461992,0.176902,0.003538,0.02,0.006012
5,3.3884,3.273108,0.176902,0.003538,0.02,0.006012
6,3.2433,3.222997,0.176902,0.003538,0.02,0.006012
7,3.2245,3.194195,0.176902,0.003538,0.02,0.006012
8,3.1878,3.1465,0.176902,0.003538,0.02,0.006012
9,3.1564,3.121461,0.176902,0.003538,0.02,0.006012
10,3.1281,3.101004,0.180568,0.008591,0.021096,0.007886


[I 2025-03-15 12:16:54,401] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 0.000320356882928083, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4946,3.154827,0.303391,0.019332,0.056543,0.028233
2,3.0047,2.824184,0.31989,0.039078,0.064128,0.044375
3,2.7105,2.57608,0.376719,0.038553,0.076292,0.047291
4,2.4485,2.347697,0.420715,0.076116,0.096218,0.069848
5,2.2655,2.194136,0.454629,0.086635,0.110582,0.090053


[I 2025-03-15 12:17:20,424] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.003856222057193948, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0885,2.13258,0.448213,0.116635,0.11296,0.095529
2,1.7098,1.465687,0.628781,0.3068,0.270625,0.259277
3,1.0084,1.164509,0.72044,0.377886,0.369559,0.359557
4,0.5219,1.151132,0.739688,0.550076,0.494452,0.503346
5,0.2288,1.263115,0.747938,0.646447,0.585756,0.58849
6,0.088,1.300463,0.777269,0.66776,0.6385,0.63317
7,0.0399,1.43544,0.778185,0.663877,0.655721,0.633677
8,0.0201,1.471254,0.787351,0.711411,0.658534,0.66138
9,0.0141,1.465742,0.791017,0.691793,0.655772,0.660267
10,0.0043,1.531756,0.779102,0.699771,0.689954,0.677221


[I 2025-03-15 12:19:54,722] Trial 91 finished with value: 0.6809981464007638 and parameters: {'learning_rate': 0.003856222057193948, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 92 with params: {'learning_rate': 0.00015342701630464132, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6408,3.232568,0.176902,0.003538,0.02,0.006012
2,3.1152,2.987046,0.215399,0.043403,0.033515,0.024958
3,2.855,2.689023,0.373969,0.044855,0.076089,0.051687
4,2.5721,2.470685,0.3978,0.043395,0.083624,0.05592
5,2.3796,2.264113,0.456462,0.071984,0.106791,0.079763
6,2.1691,2.122752,0.471127,0.102499,0.115659,0.092907
7,2.0358,2.007316,0.497709,0.105764,0.129933,0.10591
8,1.9265,1.912912,0.513291,0.110456,0.140771,0.117962
9,1.8105,1.840487,0.523373,0.117461,0.14466,0.122067
10,1.7142,1.80143,0.550871,0.159642,0.17058,0.149063


[I 2025-03-15 12:20:43,113] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 0.004532930450411124, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6377,1.953583,0.510541,0.142632,0.159817,0.136634
2,1.4325,1.282195,0.692026,0.34864,0.332682,0.328948
3,0.7602,1.046047,0.751604,0.471171,0.46324,0.456059
4,0.3287,1.105213,0.766269,0.602228,0.568126,0.568684
5,0.1118,1.244353,0.772686,0.690254,0.635462,0.641414
6,0.0389,1.213087,0.793767,0.709826,0.669391,0.670252
7,0.0172,1.393168,0.781852,0.729448,0.648569,0.665687
8,0.0099,1.476218,0.768103,0.724603,0.658869,0.670816
9,0.0062,1.366259,0.791017,0.694072,0.690306,0.672892
10,0.003,1.396358,0.780018,0.693336,0.669771,0.667221


[I 2025-03-15 12:22:35,954] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.004548298751096117, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6371,1.938684,0.52154,0.151984,0.163939,0.140612
2,1.4557,1.277815,0.683776,0.337416,0.336804,0.328582
3,0.7714,1.097686,0.744271,0.454048,0.436651,0.432315
4,0.3597,1.06752,0.766269,0.583256,0.556417,0.559392
5,0.1262,1.198527,0.782768,0.68963,0.644704,0.640709
6,0.0385,1.315456,0.774519,0.735286,0.684471,0.68671
7,0.0188,1.412221,0.783685,0.699393,0.647078,0.654955
8,0.0118,1.442509,0.787351,0.728245,0.685412,0.69156
9,0.005,1.449329,0.790101,0.707409,0.686213,0.675738
10,0.0026,1.497111,0.791017,0.702138,0.670493,0.669638


[I 2025-03-15 12:25:21,546] Trial 94 finished with value: 0.6931891677150575 and parameters: {'learning_rate': 0.004548298751096117, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 95 with params: {'learning_rate': 0.004650492838771561, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9044,2.07429,0.474794,0.134626,0.130522,0.110013
2,1.592,1.392081,0.638863,0.287629,0.291486,0.279801
3,0.902,1.156446,0.726856,0.442214,0.431505,0.421229
4,0.4464,1.141316,0.747021,0.608351,0.543421,0.551175
5,0.1932,1.31056,0.753437,0.639648,0.603534,0.594593
6,0.0718,1.348736,0.778185,0.650606,0.631435,0.619631
7,0.0391,1.504243,0.75802,0.637478,0.607531,0.602052
8,0.0242,1.492725,0.778185,0.676839,0.647213,0.645218
9,0.011,1.502911,0.777269,0.682394,0.657574,0.655302
10,0.0053,1.535814,0.790101,0.683984,0.665812,0.655698


[I 2025-03-15 12:26:19,223] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.004847784354480863, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6829,2.007803,0.504125,0.16395,0.154414,0.139576
2,1.5311,1.415067,0.643446,0.294095,0.299413,0.280271
3,0.8195,1.114587,0.736939,0.473014,0.439094,0.441061
4,0.3541,1.195052,0.754354,0.581081,0.542071,0.546777
5,0.1366,1.285004,0.769936,0.644898,0.642171,0.617281
6,0.0451,1.299127,0.797434,0.709298,0.690653,0.679726
7,0.0152,1.398307,0.793767,0.705492,0.706656,0.689506
8,0.0066,1.524821,0.792851,0.727146,0.699943,0.695276
9,0.0027,1.524935,0.793767,0.673599,0.698555,0.667901
10,0.0015,1.549286,0.796517,0.687183,0.700055,0.677229


[I 2025-03-15 12:28:50,950] Trial 96 finished with value: 0.6849259043641451 and parameters: {'learning_rate': 0.004847784354480863, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 97 with params: {'learning_rate': 1.0626063505848356e-05, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8989,3.884492,0.153987,0.015179,0.028705,0.013766
2,3.871,3.856627,0.176902,0.004047,0.020134,0.006611
3,3.8408,3.820432,0.176902,0.003538,0.02,0.006012
4,3.7962,3.770151,0.176902,0.003538,0.02,0.006012
5,3.7402,3.690629,0.176902,0.003538,0.02,0.006012
6,3.6352,3.567652,0.176902,0.003538,0.02,0.006012
7,3.4912,3.415033,0.176902,0.003538,0.02,0.006012
8,3.3634,3.303254,0.176902,0.003538,0.02,0.006012
9,3.2871,3.245034,0.176902,0.003538,0.02,0.006012
10,3.2377,3.210472,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 12:30:31,056] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.002734841907382373, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.881,2.183909,0.449129,0.090017,0.113092,0.093836
2,1.7798,1.544962,0.604033,0.240625,0.24528,0.229856
3,1.1465,1.251027,0.708524,0.394296,0.366542,0.362066
4,0.6653,1.147003,0.734189,0.513193,0.4445,0.448264
5,0.3579,1.278567,0.736022,0.5598,0.49776,0.50253


[I 2025-03-15 12:30:55,738] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.004280705441119361, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.642,1.961913,0.514207,0.176706,0.159507,0.141376
2,1.4492,1.331709,0.690192,0.350688,0.334085,0.331369
3,0.773,1.075674,0.743355,0.4873,0.439878,0.442158
4,0.3581,1.086968,0.749771,0.602575,0.552238,0.558826
5,0.134,1.234903,0.766269,0.701454,0.622563,0.63034
6,0.0486,1.251355,0.786434,0.713862,0.662127,0.670328
7,0.0208,1.291411,0.791934,0.724632,0.685935,0.685374
8,0.0136,1.316908,0.797434,0.707941,0.69638,0.686783
9,0.005,1.39213,0.793767,0.739239,0.665682,0.682355
10,0.0017,1.388644,0.793767,0.719705,0.673821,0.681205


[I 2025-03-15 12:33:30,068] Trial 99 finished with value: 0.6856631190053506 and parameters: {'learning_rate': 0.004280705441119361, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 100 with params: {'learning_rate': 0.0035033529299495557, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0957,2.164339,0.450046,0.094639,0.114912,0.097672
2,1.7639,1.586758,0.603116,0.235886,0.243258,0.225904
3,1.0962,1.210357,0.706691,0.388056,0.373251,0.365318
4,0.5909,1.126809,0.725023,0.477469,0.4248,0.431774
5,0.2818,1.228962,0.740605,0.562578,0.547363,0.525788


[I 2025-03-15 12:33:57,907] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 0.0022268485190623313, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9298,2.15805,0.460128,0.113505,0.118388,0.103341
2,1.7303,1.485545,0.620532,0.24866,0.250574,0.237925
3,1.0796,1.174244,0.704858,0.379594,0.365057,0.35444
4,0.6189,1.117298,0.72044,0.490361,0.445631,0.452155
5,0.3117,1.127998,0.753437,0.603251,0.52874,0.547385


[I 2025-03-15 12:34:28,337] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.0033436171549278493, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0233,2.124911,0.463795,0.104888,0.118255,0.099937
2,1.6885,1.42943,0.653529,0.3086,0.292885,0.284531
3,0.9877,1.164928,0.721357,0.443018,0.402176,0.400991
4,0.513,1.22343,0.702108,0.52599,0.457483,0.471016
5,0.2542,1.134435,0.775435,0.62362,0.578579,0.583281
6,0.1096,1.299696,0.786434,0.662967,0.622266,0.628635
7,0.0343,1.392248,0.779102,0.647733,0.629164,0.624721
8,0.0142,1.444094,0.785518,0.721859,0.658897,0.67467
9,0.0063,1.497897,0.777269,0.694001,0.642808,0.650156
10,0.0036,1.547492,0.783685,0.687642,0.631657,0.645689


[I 2025-03-15 12:36:13,445] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 1.546855136785054e-05, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8923,3.871479,0.174152,0.00481,0.020496,0.007769
2,3.8521,3.831584,0.176902,0.003538,0.02,0.006012
3,3.8087,3.779127,0.176902,0.003538,0.02,0.006012
4,3.7436,3.705501,0.176902,0.003538,0.02,0.006012
5,3.664,3.595306,0.176902,0.003538,0.02,0.006012
6,3.5262,3.448095,0.176902,0.003538,0.02,0.006012
7,3.3733,3.303348,0.176902,0.003538,0.02,0.006012
8,3.2711,3.228378,0.176902,0.003538,0.02,0.006012
9,3.2315,3.195194,0.176902,0.003538,0.02,0.006012
10,3.1933,3.16473,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 12:37:03,941] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.004197409910072015, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8159,2.014834,0.491292,0.138844,0.13386,0.118957
2,1.6174,1.424157,0.633364,0.28919,0.278411,0.264394
3,0.9474,1.189071,0.705775,0.431559,0.3846,0.389931
4,0.482,1.22651,0.726856,0.536514,0.500608,0.501313
5,0.2198,1.266207,0.745188,0.593604,0.561784,0.556208
6,0.0957,1.37986,0.771769,0.642531,0.646551,0.6215
7,0.0405,1.494611,0.764436,0.611646,0.594372,0.586581
8,0.0174,1.577506,0.76352,0.639595,0.633412,0.615339
9,0.0074,1.587631,0.773602,0.636258,0.63692,0.620751
10,0.002,1.649019,0.778185,0.652775,0.641952,0.629749


[I 2025-03-15 12:39:46,231] Trial 104 finished with value: 0.642059306753886 and parameters: {'learning_rate': 0.004197409910072015, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 105 with params: {'learning_rate': 0.004215220912426694, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6806,1.990345,0.509624,0.137875,0.154627,0.125947
2,1.5125,1.333048,0.663611,0.338169,0.310196,0.299654
3,0.8217,1.101007,0.731439,0.43226,0.424548,0.417486
4,0.3874,1.227713,0.737855,0.546689,0.52019,0.519472
5,0.1563,1.343217,0.740605,0.631231,0.580401,0.579636
6,0.056,1.373064,0.772686,0.687473,0.651015,0.653671
7,0.0315,1.499089,0.773602,0.645093,0.643888,0.621709
8,0.0139,1.519152,0.780935,0.669466,0.668491,0.653077
9,0.0094,1.526918,0.780018,0.680066,0.647972,0.652308
10,0.0058,1.576657,0.781852,0.682197,0.662492,0.659215


[I 2025-03-15 12:42:17,361] Trial 105 finished with value: 0.6552525434638119 and parameters: {'learning_rate': 0.004215220912426694, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 106 with params: {'learning_rate': 0.0031609287359532384, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7288,2.040194,0.496792,0.114114,0.141862,0.117906
2,1.5979,1.433635,0.650779,0.314677,0.304968,0.278239
3,0.9488,1.111995,0.731439,0.436683,0.402809,0.403007
4,0.4687,1.142323,0.725023,0.560007,0.503518,0.513184
5,0.2233,1.18546,0.754354,0.639816,0.583208,0.581849
6,0.0889,1.30304,0.786434,0.709088,0.668396,0.669486
7,0.039,1.447788,0.782768,0.726309,0.662433,0.66797
8,0.0168,1.527739,0.776352,0.68315,0.637763,0.637794
9,0.0126,1.557517,0.773602,0.72147,0.689507,0.68482
10,0.0063,1.501494,0.783685,0.685019,0.662083,0.649674


[I 2025-03-15 12:44:54,114] Trial 106 finished with value: 0.6778141458002667 and parameters: {'learning_rate': 0.0031609287359532384, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 3}. Best is trial 52 with value: 0.7267722601596618.


Trial 107 with params: {'learning_rate': 2.633732831164714e-05, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8786,3.838881,0.176902,0.003538,0.02,0.006012
2,3.7826,3.704187,0.176902,0.003538,0.02,0.006012
3,3.5616,3.364728,0.176902,0.003538,0.02,0.006012
4,3.2572,3.193985,0.176902,0.003538,0.02,0.006012
5,3.2003,3.139699,0.176902,0.003538,0.02,0.006012
6,3.1329,3.094459,0.176902,0.003538,0.02,0.006012
7,3.0819,3.037927,0.179652,0.014944,0.020763,0.007527
8,3.0293,2.978152,0.297892,0.032761,0.051735,0.036516
9,2.9681,2.927577,0.31989,0.028822,0.057485,0.037313
10,2.9116,2.8913,0.359303,0.035461,0.068558,0.040793


[I 2025-03-15 12:45:42,766] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 0.000635895962404134, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3205,2.842177,0.310724,0.039662,0.065298,0.041704
2,2.4823,2.202261,0.468378,0.089473,0.116461,0.090511
3,1.9766,1.825645,0.537122,0.158169,0.153211,0.133329
4,1.5982,1.55368,0.614115,0.243948,0.221647,0.210189
5,1.2705,1.355827,0.662695,0.313413,0.293601,0.286221
6,0.9764,1.251087,0.692026,0.419404,0.348154,0.34983
7,0.7338,1.201743,0.709441,0.408657,0.367882,0.374768
8,0.5757,1.172596,0.719523,0.440372,0.433114,0.427913
9,0.4324,1.14623,0.727773,0.442678,0.425169,0.426155
10,0.3189,1.167365,0.729606,0.520086,0.488628,0.486513


[I 2025-03-15 12:46:30,145] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.0039797330867406425, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8005,1.980764,0.502291,0.144749,0.140133,0.118056
2,1.4949,1.361036,0.678277,0.339061,0.326876,0.320532
3,0.8076,1.111258,0.738772,0.430141,0.41362,0.407425
4,0.4077,1.045986,0.753437,0.566652,0.530978,0.527601
5,0.1578,1.202031,0.770852,0.63635,0.60294,0.598344
6,0.0661,1.226312,0.787351,0.667,0.641071,0.637976
7,0.0317,1.398365,0.779102,0.701398,0.652916,0.656674
8,0.0139,1.451972,0.784601,0.701306,0.657629,0.652839
9,0.0048,1.458474,0.786434,0.696655,0.647981,0.651955
10,0.006,1.400113,0.792851,0.691374,0.65724,0.660872


[I 2025-03-15 12:49:02,696] Trial 109 finished with value: 0.6534086786689987 and parameters: {'learning_rate': 0.0039797330867406425, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 110 with params: {'learning_rate': 0.0031966077070707455, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9332,2.137537,0.447296,0.119144,0.113675,0.099479
2,1.7053,1.482962,0.636114,0.274959,0.280252,0.265999
3,1.0308,1.202653,0.710357,0.429607,0.394136,0.392334
4,0.5492,1.12608,0.733272,0.482852,0.474298,0.466257
5,0.2484,1.199437,0.766269,0.607039,0.573605,0.576096
6,0.1294,1.334302,0.770852,0.638328,0.593337,0.598487
7,0.0626,1.413227,0.775435,0.680418,0.650744,0.647359
8,0.0288,1.558926,0.780935,0.690182,0.651297,0.653601
9,0.0123,1.641508,0.769019,0.684861,0.642017,0.644137
10,0.0055,1.607647,0.787351,0.694958,0.669027,0.662922


[I 2025-03-15 12:50:40,884] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 0.00015060199441375714, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6702,3.228593,0.176902,0.003538,0.02,0.006012
2,3.1723,3.065033,0.194317,0.027255,0.026461,0.016238
3,2.9696,2.820103,0.304308,0.030404,0.055527,0.032743
4,2.699,2.58078,0.384968,0.049166,0.079514,0.051497
5,2.5081,2.386416,0.427131,0.066061,0.094147,0.067125
6,2.2884,2.214138,0.462878,0.106697,0.109144,0.082625
7,2.1405,2.12461,0.466544,0.095275,0.113848,0.092532
8,2.0216,2.016278,0.486709,0.103121,0.129087,0.107174
9,1.8979,1.903198,0.513291,0.130642,0.139747,0.117914
10,1.8048,1.845197,0.536205,0.137765,0.157372,0.132823


[I 2025-03-15 12:51:34,900] Trial 111 pruned. 


Trial 112 with params: {'learning_rate': 0.004929907775622283, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6735,2.00603,0.495875,0.179061,0.157041,0.148058
2,1.4789,1.319825,0.690192,0.353091,0.338136,0.329992
3,0.7732,1.119451,0.72319,0.475376,0.434449,0.429869
4,0.3479,1.219243,0.738772,0.578502,0.529871,0.539632
5,0.1374,1.297285,0.765353,0.667045,0.62261,0.626533
6,0.038,1.431424,0.780018,0.69682,0.643062,0.646997
7,0.0181,1.43856,0.771769,0.692534,0.662063,0.657717
8,0.0086,1.496439,0.771769,0.697536,0.663363,0.662535
9,0.0046,1.481508,0.779102,0.711219,0.662557,0.669095
10,0.0025,1.54932,0.791017,0.685496,0.645218,0.651212


[I 2025-03-15 12:52:28,187] Trial 112 pruned. 


Trial 113 with params: {'learning_rate': 0.004061483725033018, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7214,1.942011,0.51604,0.137433,0.149644,0.124468
2,1.4871,1.312222,0.694775,0.33695,0.332303,0.320631
3,0.8221,1.091227,0.747021,0.482912,0.439931,0.444589
4,0.3933,1.200706,0.735105,0.57074,0.514761,0.524483
5,0.1586,1.179064,0.769936,0.6779,0.624928,0.62464
6,0.0651,1.318514,0.779102,0.681176,0.663587,0.648969
7,0.0316,1.354333,0.784601,0.70393,0.671729,0.671527
8,0.0169,1.406576,0.788268,0.7193,0.691689,0.681792
9,0.0104,1.484306,0.791017,0.73862,0.697771,0.698077
10,0.0032,1.489438,0.796517,0.704745,0.693471,0.683685


[I 2025-03-15 12:55:09,510] Trial 113 finished with value: 0.7011446725451269 and parameters: {'learning_rate': 0.004061483725033018, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 3}. Best is trial 52 with value: 0.7267722601596618.


Trial 114 with params: {'learning_rate': 4.0648800446916785e-05, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8611,3.795534,0.176902,0.003538,0.02,0.006012
2,3.6905,3.539721,0.176902,0.003538,0.02,0.006012
3,3.3635,3.219507,0.176902,0.003538,0.02,0.006012
4,3.1813,3.139223,0.176902,0.003538,0.02,0.006012
5,3.1484,3.080831,0.176902,0.003538,0.02,0.006012
6,3.0598,3.002483,0.206233,0.044283,0.027737,0.016487
7,2.9613,2.91519,0.31439,0.03113,0.056348,0.037105
8,2.8903,2.844948,0.359303,0.048001,0.068906,0.040404
9,2.814,2.776897,0.36297,0.039442,0.07032,0.044165
10,2.75,2.728281,0.374885,0.040957,0.075933,0.049179


[I 2025-03-15 12:56:03,032] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.003238288483797804, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7197,2.030674,0.506874,0.112599,0.143902,0.116878
2,1.578,1.407625,0.658112,0.317509,0.307834,0.294196
3,0.935,1.115717,0.740605,0.482092,0.440842,0.443985
4,0.4654,1.12689,0.734189,0.54168,0.511057,0.5141
5,0.2075,1.237049,0.748854,0.636362,0.604153,0.594746
6,0.0886,1.209947,0.764436,0.653315,0.612242,0.614894
7,0.0407,1.363364,0.785518,0.689328,0.666203,0.663905
8,0.0199,1.488827,0.771769,0.662099,0.624634,0.627782
9,0.0091,1.457122,0.787351,0.687205,0.637642,0.644325
10,0.0043,1.512643,0.780935,0.674911,0.649638,0.651921


[I 2025-03-15 12:57:46,869] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.004907131677862074, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6853,2.018257,0.485793,0.15322,0.143485,0.123114
2,1.5349,1.423707,0.643446,0.296863,0.303898,0.281091
3,0.8338,1.145708,0.735105,0.460649,0.431098,0.430071
4,0.3545,1.183252,0.745188,0.583277,0.523696,0.530694
5,0.135,1.337239,0.756187,0.590458,0.606332,0.580595
6,0.0474,1.412629,0.776352,0.694629,0.698793,0.680218
7,0.0164,1.460988,0.790101,0.679721,0.642939,0.644328
8,0.0056,1.551807,0.780935,0.700821,0.685378,0.674506
9,0.005,1.645296,0.780935,0.712923,0.648427,0.665534
10,0.0049,1.594569,0.786434,0.686398,0.661159,0.655038


[I 2025-03-15 13:00:30,495] Trial 116 finished with value: 0.6945000664465287 and parameters: {'learning_rate': 0.004907131677862074, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 117 with params: {'learning_rate': 0.004738128054108833, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6798,1.991922,0.506874,0.147832,0.149669,0.124334
2,1.536,1.413625,0.648946,0.310349,0.307471,0.287125
3,0.8148,1.15442,0.732356,0.44683,0.425018,0.417556
4,0.3568,1.161372,0.747938,0.590831,0.520777,0.536381
5,0.1275,1.216137,0.771769,0.647288,0.625402,0.610266
6,0.0527,1.304679,0.777269,0.671401,0.689203,0.663471
7,0.0206,1.414554,0.789184,0.74482,0.700096,0.702121
8,0.0091,1.431512,0.793767,0.727005,0.701014,0.697838
9,0.0043,1.444241,0.789184,0.696055,0.677377,0.671036
10,0.002,1.456639,0.797434,0.709336,0.683451,0.682992


[I 2025-03-15 13:03:18,518] Trial 117 finished with value: 0.6934940926503725 and parameters: {'learning_rate': 0.004738128054108833, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 118 with params: {'learning_rate': 0.002943564945194245, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0362,2.236091,0.432631,0.084693,0.106138,0.085345
2,1.8265,1.547273,0.59945,0.259535,0.232339,0.218514
3,1.182,1.244081,0.705775,0.394028,0.360455,0.362241
4,0.6803,1.214794,0.718607,0.462083,0.410164,0.411678
5,0.3651,1.26149,0.731439,0.494245,0.475469,0.471896
6,0.1851,1.392748,0.76077,0.636478,0.582077,0.580388
7,0.0844,1.456091,0.762603,0.634482,0.604754,0.603269
8,0.0416,1.643436,0.753437,0.663578,0.614868,0.619375
9,0.0327,1.672664,0.761687,0.658517,0.590621,0.603805
10,0.0192,1.675338,0.768103,0.64315,0.623063,0.610999


[I 2025-03-15 13:04:11,020] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.004591460732088911, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.872,1.991982,0.510541,0.140398,0.147152,0.123965
2,1.5177,1.434032,0.644363,0.325974,0.308014,0.305845
3,0.8359,1.151941,0.726856,0.383297,0.402872,0.385032
4,0.4137,1.075366,0.748854,0.570995,0.537062,0.533439
5,0.1655,1.307217,0.761687,0.664481,0.615314,0.610082
6,0.0716,1.26167,0.764436,0.610889,0.603624,0.585057
7,0.029,1.305153,0.791017,0.642709,0.663057,0.635607
8,0.0097,1.368286,0.7956,0.662514,0.65448,0.635873
9,0.0034,1.432155,0.794684,0.658067,0.659689,0.643045
10,0.0014,1.439364,0.806599,0.671678,0.683682,0.661894


[I 2025-03-15 13:05:56,100] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 0.0035106119485355, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0905,2.17997,0.442713,0.094292,0.113414,0.096902
2,1.7789,1.567888,0.6022,0.230633,0.24743,0.227545
3,1.1189,1.25026,0.705775,0.390554,0.363657,0.361491
4,0.611,1.21615,0.71769,0.446593,0.429254,0.421053
5,0.2987,1.256364,0.747021,0.53168,0.517164,0.514324
6,0.1433,1.333246,0.762603,0.583601,0.555415,0.554083
7,0.0757,1.514015,0.758937,0.59872,0.563392,0.562654
8,0.0344,1.544364,0.772686,0.618306,0.592514,0.5879
9,0.0154,1.619482,0.772686,0.638984,0.620662,0.610828
10,0.0126,1.624958,0.769936,0.617827,0.626075,0.600411


[I 2025-03-15 13:07:44,199] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 0.0035695651897627327, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.723,2.007843,0.512374,0.127885,0.145345,0.122865
2,1.5866,1.422759,0.652612,0.309443,0.311443,0.289601
3,0.945,1.126223,0.739688,0.438814,0.421511,0.42091
4,0.4706,1.06412,0.742438,0.599253,0.536221,0.547255
5,0.1923,1.206061,0.762603,0.646469,0.60753,0.602591
6,0.0747,1.243724,0.789184,0.693007,0.687254,0.672032
7,0.0334,1.360953,0.784601,0.674751,0.675957,0.6546
8,0.0159,1.388418,0.783685,0.72178,0.662765,0.669269
9,0.0116,1.411458,0.786434,0.719155,0.703093,0.694546
10,0.0026,1.40247,0.79835,0.721632,0.702812,0.698835


[I 2025-03-15 13:10:23,231] Trial 121 finished with value: 0.6916285117212065 and parameters: {'learning_rate': 0.0035695651897627327, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 3}. Best is trial 52 with value: 0.7267722601596618.


Trial 122 with params: {'learning_rate': 0.00469764093859318, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9404,2.097461,0.454629,0.118619,0.121559,0.110214
2,1.6574,1.500079,0.627864,0.272156,0.263573,0.251621
3,0.9947,1.227629,0.711274,0.396151,0.377307,0.374562
4,0.521,1.193797,0.742438,0.497797,0.472446,0.475779
5,0.261,1.270791,0.775435,0.679328,0.597,0.612721
6,0.1096,1.424334,0.766269,0.590997,0.600475,0.576245
7,0.0515,1.521821,0.765353,0.620304,0.600511,0.592546
8,0.0182,1.604445,0.778185,0.669972,0.644129,0.641761
9,0.0069,1.725453,0.780935,0.671494,0.641531,0.644305
10,0.005,1.791188,0.778185,0.630205,0.621058,0.609211


[I 2025-03-15 13:11:17,693] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.001926817763760169, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9878,2.245555,0.43813,0.107892,0.109814,0.094284
2,1.8435,1.625357,0.584785,0.244666,0.214436,0.201222
3,1.2313,1.252062,0.693859,0.370873,0.331615,0.327201
4,0.7401,1.167899,0.72044,0.482017,0.425895,0.432234
5,0.429,1.079069,0.756187,0.533436,0.494768,0.498456
6,0.2403,1.230895,0.75802,0.524039,0.505917,0.501411
7,0.1243,1.374172,0.766269,0.656308,0.567976,0.589777
8,0.0592,1.3783,0.772686,0.636561,0.588326,0.600535
9,0.0338,1.520149,0.75527,0.620445,0.580788,0.586082
10,0.0182,1.515929,0.781852,0.699762,0.634639,0.651442


[I 2025-03-15 13:13:12,027] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.004074973601518237, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.753,2.00543,0.500458,0.120301,0.142875,0.116385
2,1.5286,1.357188,0.668194,0.313659,0.306478,0.29965
3,0.8284,1.128559,0.736939,0.435325,0.415836,0.413659
4,0.3998,1.178771,0.732356,0.555636,0.493219,0.502308
5,0.1649,1.240141,0.764436,0.674633,0.598202,0.605788
6,0.0595,1.326842,0.769936,0.688343,0.63768,0.642026
7,0.0273,1.399514,0.780018,0.728264,0.676306,0.686061
8,0.0152,1.375541,0.778185,0.663854,0.665049,0.644246
9,0.0058,1.468482,0.784601,0.694747,0.664828,0.661335
10,0.0019,1.486282,0.786434,0.681016,0.650935,0.648146


[I 2025-03-15 13:15:02,630] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 2.4937864696713865e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8833,3.847871,0.176902,0.003538,0.02,0.006012
2,3.8039,3.747118,0.176902,0.003538,0.02,0.006012
3,3.6534,3.511106,0.176902,0.003538,0.02,0.006012
4,3.3555,3.243197,0.176902,0.003538,0.02,0.006012
5,3.2384,3.16944,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 13:15:28,440] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.00379601273664075, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1199,2.144792,0.445463,0.117235,0.112193,0.095917
2,1.7443,1.511239,0.597617,0.268957,0.255814,0.233503
3,1.0404,1.174353,0.729606,0.418227,0.378283,0.377776
4,0.5349,1.214794,0.71769,0.469051,0.434455,0.437548
5,0.2518,1.211829,0.771769,0.574807,0.553367,0.550276


[I 2025-03-15 13:15:54,837] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.0034719965068488196, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0836,2.149864,0.447296,0.119314,0.115272,0.100208
2,1.7361,1.453753,0.637947,0.290388,0.275368,0.261146
3,1.0623,1.19787,0.714024,0.414886,0.38494,0.378859
4,0.5728,1.14967,0.726856,0.487601,0.45484,0.452641
5,0.2637,1.297655,0.746104,0.634049,0.579532,0.578473


[I 2025-03-15 13:16:20,467] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 0.0047578345041226254, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6856,2.00159,0.504125,0.14656,0.150925,0.123536
2,1.5287,1.399137,0.656279,0.316678,0.31701,0.296856
3,0.8224,1.162702,0.736022,0.451967,0.422917,0.421741
4,0.3607,1.228713,0.756187,0.580049,0.521412,0.530644
5,0.1378,1.260476,0.768103,0.645955,0.623964,0.608154
6,0.0539,1.349005,0.784601,0.665828,0.639484,0.635252
7,0.0208,1.441345,0.777269,0.651042,0.635759,0.622594
8,0.0087,1.569692,0.785518,0.731979,0.66892,0.682132
9,0.0039,1.510578,0.784601,0.6823,0.671524,0.665379
10,0.001,1.567318,0.785518,0.68502,0.655724,0.660158


[I 2025-03-15 13:18:19,890] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.004711428259886046, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6783,1.98447,0.505958,0.152858,0.14961,0.122396
2,1.5318,1.418721,0.655362,0.305998,0.313348,0.294702
3,0.8189,1.149734,0.731439,0.466728,0.439892,0.438541
4,0.365,1.20476,0.762603,0.592562,0.549637,0.556097
5,0.1285,1.340381,0.774519,0.667329,0.614931,0.619621
6,0.0512,1.308969,0.792851,0.706598,0.672554,0.673386
7,0.0167,1.452961,0.791934,0.722799,0.685897,0.682076
8,0.0045,1.477692,0.794684,0.710248,0.684586,0.683152
9,0.0015,1.522763,0.79835,0.716954,0.686557,0.683933
10,0.0016,1.551457,0.80385,0.72758,0.703623,0.700845


[I 2025-03-15 13:20:53,012] Trial 129 finished with value: 0.6910576816832651 and parameters: {'learning_rate': 0.004711428259886046, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 130 with params: {'learning_rate': 0.004753627010019391, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6864,1.986397,0.516957,0.160714,0.150906,0.124986
2,1.5388,1.421042,0.63978,0.300066,0.293554,0.271748
3,0.838,1.161486,0.727773,0.46759,0.430007,0.427895
4,0.3951,1.196571,0.747938,0.580059,0.550959,0.551943
5,0.1472,1.257726,0.766269,0.631453,0.61971,0.605682
6,0.044,1.336576,0.784601,0.703267,0.676794,0.668051
7,0.0152,1.430009,0.785518,0.683483,0.677934,0.664347
8,0.0051,1.509709,0.787351,0.688983,0.672624,0.666245
9,0.0016,1.527092,0.793767,0.684433,0.674398,0.669766
10,0.001,1.538821,0.794684,0.675778,0.674711,0.666499


[I 2025-03-15 13:23:31,482] Trial 130 finished with value: 0.6704222621363223 and parameters: {'learning_rate': 0.004753627010019391, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 131 with params: {'learning_rate': 0.004897982520062548, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6864,2.011044,0.491292,0.148467,0.145265,0.120837
2,1.5388,1.419937,0.641613,0.29995,0.300867,0.280722
3,0.8357,1.103081,0.736022,0.455198,0.430822,0.433039
4,0.3705,1.16988,0.766269,0.621253,0.553175,0.561014
5,0.1387,1.304162,0.772686,0.628578,0.601227,0.594557
6,0.0468,1.299818,0.792851,0.728236,0.68221,0.684555
7,0.0187,1.35699,0.797434,0.69391,0.695847,0.679573
8,0.0068,1.464204,0.787351,0.724286,0.711727,0.69808
9,0.0044,1.481715,0.7956,0.704712,0.698044,0.687374
10,0.0013,1.507938,0.796517,0.70248,0.693298,0.68558


[I 2025-03-15 13:26:46,215] Trial 131 finished with value: 0.693619019112808 and parameters: {'learning_rate': 0.004897982520062548, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 132 with params: {'learning_rate': 0.004520763582796753, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.659,1.976843,0.504125,0.147602,0.154044,0.128763
2,1.5279,1.429574,0.64528,0.315807,0.30166,0.284875
3,0.8312,1.13462,0.738772,0.462401,0.440475,0.436962
4,0.3742,1.154601,0.761687,0.58799,0.537798,0.540638
5,0.1346,1.336228,0.76077,0.667644,0.628596,0.622606
6,0.0506,1.295795,0.784601,0.703489,0.693853,0.677134
7,0.0216,1.441612,0.783685,0.674378,0.667802,0.650421
8,0.0151,1.499919,0.780935,0.667992,0.660985,0.642883
9,0.0072,1.494494,0.784601,0.691671,0.664119,0.664708
10,0.003,1.546139,0.792851,0.717007,0.697518,0.69288


[I 2025-03-15 13:29:40,743] Trial 132 finished with value: 0.7020899247275694 and parameters: {'learning_rate': 0.004520763582796753, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 133 with params: {'learning_rate': 0.004823755329947578, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6921,2.006833,0.503208,0.130623,0.147935,0.122409
2,1.5537,1.392551,0.638863,0.303737,0.29912,0.279087
3,0.8432,1.151831,0.725023,0.457714,0.423809,0.419221
4,0.3914,1.227687,0.758937,0.601873,0.540757,0.549949
5,0.1409,1.334787,0.775435,0.652847,0.617017,0.612269
6,0.0505,1.397228,0.769019,0.65945,0.652805,0.631582
7,0.0188,1.451273,0.793767,0.682352,0.650999,0.649114
8,0.0063,1.537235,0.776352,0.665562,0.671812,0.652143
9,0.0085,1.631454,0.781852,0.68847,0.646925,0.646394
10,0.0081,1.54734,0.791017,0.666349,0.666252,0.647621


[I 2025-03-15 13:30:34,043] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 0.0011640256458428468, 'weight_decay': 0.008, 'adam_beta1': 0.97, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1705,2.580817,0.384051,0.065542,0.083027,0.057081
2,2.2695,2.045942,0.490376,0.094503,0.128622,0.100973
3,1.8265,1.705535,0.571952,0.183551,0.186345,0.173093
4,1.4142,1.437403,0.647113,0.316857,0.272401,0.272173
5,1.0545,1.319479,0.681027,0.331915,0.336095,0.325073
6,0.7506,1.217323,0.71494,0.404889,0.388004,0.382197
7,0.5008,1.297075,0.72319,0.463312,0.416732,0.425036
8,0.3317,1.262485,0.742438,0.521413,0.492879,0.497284
9,0.2129,1.367008,0.738772,0.562656,0.503728,0.516417
10,0.1283,1.421398,0.738772,0.54424,0.529507,0.519325


[I 2025-03-15 13:31:35,819] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.0012238162847232963, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0614,2.398365,0.405133,0.06324,0.092619,0.068354
2,2.0335,1.795902,0.549954,0.155825,0.166585,0.142803
3,1.4959,1.381565,0.662695,0.326942,0.287631,0.279623
4,0.9921,1.222537,0.681943,0.413441,0.360667,0.365548
5,0.662,1.120103,0.718607,0.399903,0.404702,0.391024
6,0.4471,1.147222,0.736939,0.524307,0.487862,0.488001
7,0.2721,1.20823,0.745188,0.590297,0.504641,0.524742
8,0.166,1.219245,0.764436,0.636132,0.599263,0.59787
9,0.1043,1.289393,0.751604,0.633117,0.543254,0.564504
10,0.0662,1.393831,0.749771,0.631375,0.585672,0.587301


[I 2025-03-15 13:33:15,711] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.0033504882036428604, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7199,2.003849,0.503208,0.127656,0.139005,0.115576
2,1.5859,1.430966,0.653529,0.309846,0.315827,0.29258
3,0.9499,1.149297,0.734189,0.454815,0.416661,0.42026
4,0.4739,1.145056,0.738772,0.572934,0.511388,0.523851
5,0.1989,1.197163,0.747021,0.630927,0.573542,0.573641
6,0.084,1.24619,0.775435,0.661441,0.642209,0.633934
7,0.0439,1.395258,0.765353,0.692321,0.635563,0.637189
8,0.0225,1.402625,0.780018,0.645977,0.61098,0.612628
9,0.0087,1.434947,0.779102,0.682952,0.652431,0.646799
10,0.0043,1.4574,0.790101,0.677162,0.661807,0.648349


[I 2025-03-15 13:36:07,833] Trial 136 finished with value: 0.6511716772476137 and parameters: {'learning_rate': 0.0033504882036428604, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 3}. Best is trial 52 with value: 0.7267722601596618.


Trial 137 with params: {'learning_rate': 0.004445317285689909, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6547,1.980084,0.508708,0.152609,0.151243,0.126119
2,1.5396,1.412765,0.643446,0.3201,0.308221,0.289712
3,0.8228,1.089722,0.737855,0.485448,0.431336,0.438874
4,0.3583,1.183989,0.761687,0.619973,0.565483,0.573945
5,0.128,1.304743,0.766269,0.705484,0.636754,0.640272
6,0.0596,1.284896,0.780935,0.64529,0.641731,0.629288
7,0.0267,1.340741,0.775435,0.638351,0.657067,0.630026
8,0.0079,1.459425,0.772686,0.64781,0.642378,0.626653
9,0.0036,1.444772,0.793767,0.680046,0.693398,0.675269
10,0.0015,1.503171,0.777269,0.658355,0.650626,0.64258


[I 2025-03-15 13:38:05,410] Trial 137 pruned. 


Trial 138 with params: {'learning_rate': 0.0048822691152501, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.68,1.99155,0.499542,0.190947,0.154772,0.142361
2,1.5006,1.335341,0.676444,0.327112,0.324666,0.316931
3,0.799,1.090295,0.739688,0.491487,0.444247,0.448222
4,0.3549,1.145492,0.751604,0.584379,0.52593,0.538644
5,0.1255,1.249681,0.765353,0.694328,0.646376,0.649759
6,0.0456,1.301615,0.784601,0.679479,0.665249,0.652624
7,0.0172,1.402165,0.791017,0.704978,0.694626,0.686496
8,0.0056,1.516682,0.796517,0.733455,0.69962,0.699423
9,0.0047,1.599119,0.788268,0.732701,0.691541,0.696753
10,0.0031,1.596283,0.786434,0.727317,0.686065,0.683401


[I 2025-03-15 13:40:53,414] Trial 138 finished with value: 0.7043117119408449 and parameters: {'learning_rate': 0.0048822691152501, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 139 with params: {'learning_rate': 0.0008666250526706238, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2433,2.674809,0.372136,0.036017,0.077195,0.048117
2,2.3258,2.118013,0.465628,0.0851,0.120347,0.09233
3,1.8602,1.709736,0.562786,0.183313,0.172454,0.158608
4,1.4559,1.458224,0.647113,0.276267,0.258747,0.248301
5,1.0933,1.257274,0.683776,0.349109,0.338654,0.328746


[I 2025-03-15 13:41:19,948] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.0017189701130963888, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9555,2.26596,0.428964,0.086186,0.103867,0.083655
2,1.8953,1.622896,0.606783,0.248343,0.22059,0.208375
3,1.2985,1.285862,0.684693,0.324248,0.329827,0.318552
4,0.813,1.211133,0.698442,0.40612,0.396418,0.388574
5,0.4831,1.11403,0.746104,0.488607,0.473254,0.46844


[I 2025-03-15 13:41:45,238] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 0.004767439501199274, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6787,1.993794,0.495875,0.146614,0.153754,0.128812
2,1.5119,1.39523,0.659945,0.305719,0.313341,0.294315
3,0.8133,1.113259,0.739688,0.462312,0.433434,0.428354
4,0.3499,1.198031,0.769019,0.667119,0.587819,0.608951
5,0.1312,1.130497,0.777269,0.653262,0.627839,0.622935
6,0.0515,1.245125,0.771769,0.680809,0.669116,0.658337
7,0.0217,1.274372,0.7956,0.705839,0.707306,0.680638
8,0.0081,1.346732,0.791934,0.717186,0.701689,0.686986
9,0.0023,1.359791,0.797434,0.721726,0.711903,0.696662
10,0.0012,1.376477,0.79835,0.720854,0.705068,0.695997


[I 2025-03-15 13:44:31,155] Trial 141 finished with value: 0.6952039454937433 and parameters: {'learning_rate': 0.004767439501199274, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 142 with params: {'learning_rate': 0.0012956365603625942, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1024,2.464328,0.398717,0.056998,0.089231,0.062532
2,2.0835,1.818496,0.55637,0.151127,0.173908,0.149359
3,1.531,1.42143,0.64528,0.282288,0.262483,0.255062
4,1.0301,1.233655,0.687443,0.386877,0.364322,0.358408
5,0.6804,1.143866,0.72044,0.477685,0.414276,0.416773


[I 2025-03-15 13:44:56,327] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 0.0029047471604170534, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8851,2.144973,0.452796,0.106523,0.11567,0.100297
2,1.7343,1.506395,0.618698,0.257347,0.263938,0.247627
3,1.0847,1.204619,0.700275,0.380263,0.376708,0.359523
4,0.5887,1.129436,0.742438,0.500609,0.480026,0.480663
5,0.2976,1.172237,0.752521,0.58917,0.542894,0.544875
6,0.1669,1.285778,0.753437,0.598476,0.563161,0.557442
7,0.0721,1.320376,0.787351,0.689026,0.659771,0.659176
8,0.0269,1.491351,0.785518,0.678395,0.65236,0.650188
9,0.0155,1.536168,0.779102,0.732121,0.685373,0.687667
10,0.0087,1.4608,0.794684,0.678276,0.686665,0.664952


[I 2025-03-15 13:47:38,397] Trial 143 finished with value: 0.6979762879425699 and parameters: {'learning_rate': 0.0029047471604170534, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 144 with params: {'learning_rate': 0.004676923200409123, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6592,1.958894,0.500458,0.155006,0.148938,0.120524
2,1.5023,1.365657,0.664528,0.311836,0.317931,0.296661
3,0.7875,1.069689,0.754354,0.490528,0.461661,0.463575
4,0.3401,1.207649,0.748854,0.609731,0.546647,0.558418
5,0.1304,1.23435,0.769936,0.693977,0.641476,0.637496
6,0.0496,1.295451,0.780935,0.670694,0.679373,0.652475
7,0.0171,1.389618,0.780935,0.727563,0.69237,0.692723
8,0.0116,1.491501,0.791017,0.739125,0.693714,0.69548
9,0.0068,1.482652,0.790101,0.687113,0.680543,0.664523
10,0.0027,1.512568,0.791017,0.70348,0.682173,0.675404


[I 2025-03-15 13:50:10,213] Trial 144 finished with value: 0.6812557659271745 and parameters: {'learning_rate': 0.004676923200409123, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 145 with params: {'learning_rate': 0.004513425291421247, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6415,1.952242,0.507791,0.149678,0.159122,0.135202
2,1.4744,1.295576,0.693859,0.345817,0.333028,0.327244
3,0.7737,1.108346,0.745188,0.471133,0.456623,0.446958
4,0.3557,1.077726,0.767186,0.594775,0.568636,0.567781
5,0.1125,1.243895,0.778185,0.699209,0.633805,0.641592
6,0.0397,1.337571,0.775435,0.699722,0.650057,0.654165
7,0.0161,1.366057,0.791017,0.71336,0.693197,0.681259
8,0.0092,1.39335,0.783685,0.715743,0.707106,0.693705
9,0.0029,1.439134,0.804766,0.742537,0.730034,0.716631
10,0.0008,1.466125,0.799267,0.754693,0.7187,0.718769


[I 2025-03-15 13:52:43,655] Trial 145 finished with value: 0.6975152011737452 and parameters: {'learning_rate': 0.004513425291421247, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 146 with params: {'learning_rate': 0.004223151447544888, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7552,1.989293,0.499542,0.136096,0.140691,0.118252
2,1.4931,1.331377,0.671861,0.339679,0.324898,0.310594
3,0.8186,1.093586,0.737855,0.459618,0.432651,0.433543
4,0.3828,1.116469,0.750687,0.599101,0.525926,0.533745
5,0.1528,1.324748,0.751604,0.647931,0.576233,0.591429
6,0.0652,1.30087,0.768103,0.655773,0.643185,0.635029
7,0.0323,1.438065,0.769019,0.675409,0.63966,0.640909
8,0.0136,1.502699,0.781852,0.685186,0.643128,0.643514
9,0.0068,1.501673,0.783685,0.697482,0.674784,0.665365
10,0.0019,1.463219,0.793767,0.695891,0.653634,0.660883


[I 2025-03-15 13:54:27,609] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 0.00492644744141174, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6831,2.018019,0.490376,0.18533,0.1471,0.137089
2,1.5245,1.360306,0.663611,0.326302,0.315293,0.306624
3,0.8034,1.112676,0.731439,0.452593,0.429595,0.426348
4,0.3589,1.204011,0.751604,0.565204,0.520273,0.530482
5,0.1249,1.266916,0.767186,0.656436,0.616561,0.619973
6,0.0491,1.370619,0.767186,0.694688,0.694121,0.678836
7,0.0201,1.413708,0.786434,0.669249,0.678361,0.659017
8,0.0051,1.507979,0.782768,0.665203,0.678254,0.656947
9,0.0029,1.542431,0.787351,0.683285,0.68191,0.668944
10,0.0007,1.556633,0.785518,0.654045,0.659917,0.645828


[I 2025-03-15 13:56:48,671] Trial 147 finished with value: 0.6466895421288966 and parameters: {'learning_rate': 0.00492644744141174, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 52 with value: 0.7267722601596618.


Trial 148 with params: {'learning_rate': 0.0029855296665933758, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8786,2.127618,0.461045,0.113446,0.122024,0.106041
2,1.6802,1.417544,0.649863,0.307873,0.289397,0.276222
3,1.0013,1.183799,0.712191,0.41368,0.39368,0.384346
4,0.5476,1.117893,0.738772,0.510903,0.480885,0.480604
5,0.2549,1.175429,0.776352,0.644997,0.591157,0.601605
6,0.1237,1.247634,0.765353,0.648757,0.591919,0.597088
7,0.0553,1.434878,0.779102,0.665201,0.630377,0.631732
8,0.025,1.460705,0.780935,0.688188,0.634103,0.64546
9,0.0116,1.52963,0.772686,0.665478,0.672656,0.649388
10,0.0058,1.483609,0.779102,0.658384,0.647598,0.63362


[I 2025-03-15 13:57:36,579] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.003869196915802795, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7119,2.02969,0.484876,0.127003,0.143698,0.121388
2,1.5955,1.359704,0.648946,0.285585,0.296562,0.275373
3,0.878,1.08587,0.735105,0.450182,0.429813,0.42847
4,0.4114,1.183112,0.72319,0.561487,0.483697,0.4981
5,0.1942,1.166628,0.771769,0.632997,0.593689,0.58926
6,0.0723,1.338467,0.780018,0.698036,0.66299,0.658903
7,0.0394,1.347932,0.773602,0.644865,0.663434,0.642289
8,0.0195,1.378293,0.791017,0.683047,0.664363,0.655907
9,0.006,1.458098,0.787351,0.701418,0.645802,0.654225
10,0.0029,1.443788,0.789184,0.724586,0.686557,0.682209


[I 2025-03-15 13:59:12,383] Trial 149 pruned. 


In [49]:
print(best_trial)

BestRun(run_id='52', objective=0.7267722601596618, hyperparameters={'learning_rate': 0.004185238693319757, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4}, run_summary=None)


In [50]:
base.reset_seed()

In [51]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill-embedd_fine_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-distill-embedd_fine_hp-search", remove_unused_columns=False, epochs=num_epochs, batch_size=batch_size)

In [52]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "adam_beta1" : trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
        "lambda_param": trial.suggest_float("lambda_param",0,1,step=.1),
        "temperature": trial.suggest_float("temperature", 2,7, step=.5)
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [53]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [54]:
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [55]:
best_trial2 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Distill-embedd",
    n_trials=150
)

[I 2025-03-15 13:59:12,992] A new study created in memory with name: Distill-embedd


Trial 0 with params: {'learning_rate': 0.0001025350969016849, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3894,2.202489,0.176902,0.003538,0.02,0.006012
2,2.1304,2.065815,0.176902,0.003538,0.02,0.006012
3,2.0565,1.989177,0.180568,0.006305,0.021096,0.00771
4,1.9315,1.870954,0.312557,0.035147,0.056676,0.035178
5,1.8335,1.757392,0.36022,0.039369,0.06972,0.044458
6,1.7096,1.66159,0.404216,0.041021,0.085248,0.054684
7,1.6263,1.575806,0.414299,0.063855,0.08803,0.057975
8,1.5491,1.512766,0.430797,0.081987,0.094931,0.067834
9,1.489,1.455442,0.44363,0.072119,0.100824,0.075004
10,1.4352,1.417142,0.468378,0.089458,0.111196,0.084425


[I 2025-03-15 14:00:49,585] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 1.4347159517201392e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4731,2.458151,0.177819,0.00623,0.023331,0.009797
2,2.4516,2.432497,0.176902,0.003538,0.02,0.006012
3,2.4212,2.39687,0.176902,0.003538,0.02,0.006012
4,2.3786,2.343468,0.176902,0.003538,0.02,0.006012
5,2.3128,2.259168,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 14:01:16,252] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8618,1.485708,0.430797,0.111155,0.104109,0.084489
2,1.2816,1.17267,0.555454,0.1514,0.169847,0.144013
3,0.9722,0.91426,0.666361,0.297432,0.27589,0.265339
4,0.6831,0.786386,0.709441,0.367065,0.34556,0.344629
5,0.4892,0.700136,0.747021,0.416194,0.399861,0.394614
6,0.3448,0.659061,0.758937,0.454261,0.454501,0.446266
7,0.2544,0.644431,0.767186,0.517221,0.480389,0.486325
8,0.1973,0.636394,0.774519,0.555528,0.523085,0.521258
9,0.1529,0.620636,0.780018,0.577217,0.545448,0.546351
10,0.1269,0.623981,0.784601,0.646912,0.610344,0.616158


[I 2025-03-15 14:02:51,929] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0001464895513280072, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3169,2.125195,0.176902,0.003538,0.02,0.006012
2,2.071,2.001395,0.182401,0.007181,0.021644,0.008419
3,1.9309,1.828014,0.35472,0.035112,0.06745,0.040874
4,1.7514,1.673813,0.385885,0.039418,0.079172,0.050958
5,1.6395,1.562948,0.413382,0.051211,0.088847,0.06022
6,1.5127,1.464053,0.450962,0.08863,0.103637,0.077341
7,1.4322,1.411605,0.459212,0.096985,0.108396,0.084729
8,1.3592,1.352764,0.486709,0.118173,0.128537,0.107034
9,1.2967,1.284528,0.513291,0.116262,0.136954,0.113434
10,1.2395,1.256072,0.527956,0.124188,0.147479,0.12285


[I 2025-03-15 14:04:32,461] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2985,2.077205,0.176902,0.003538,0.02,0.006012
2,1.9993,1.881088,0.298808,0.026574,0.054023,0.030104
3,1.7831,1.670143,0.399633,0.044973,0.083796,0.05554
4,1.6026,1.519385,0.426214,0.076287,0.093089,0.065802
5,1.4804,1.424409,0.461962,0.081553,0.107889,0.082592
6,1.3693,1.343786,0.485793,0.108546,0.122746,0.098458
7,1.2982,1.287345,0.512374,0.127759,0.137108,0.112458
8,1.2351,1.238673,0.535289,0.133837,0.150653,0.128526
9,1.177,1.202634,0.544455,0.149305,0.155278,0.133526
10,1.1148,1.17078,0.567369,0.154624,0.177191,0.155271


[I 2025-03-15 14:05:27,046] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.00043625993625605574, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1979,1.901623,0.327223,0.041509,0.065431,0.046525
2,1.6987,1.532492,0.428048,0.070825,0.09399,0.066353
3,1.418,1.323038,0.494959,0.124186,0.126179,0.100438
4,1.2274,1.178333,0.550871,0.140218,0.158353,0.13608
5,1.0799,1.090724,0.586618,0.177639,0.182309,0.158465


[I 2025-03-15 14:05:50,468] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 6.639623079859457e-05, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4302,2.338326,0.176902,0.003538,0.02,0.006012
2,2.2071,2.111092,0.176902,0.003538,0.02,0.006012
3,2.1019,2.056268,0.176902,0.003538,0.02,0.006012
4,2.0336,1.978929,0.188818,0.01302,0.023326,0.011062
5,1.954,1.88267,0.35472,0.035144,0.067437,0.041902


[I 2025-03-15 14:06:14,032] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 1.2382649697023537e-05, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4745,2.461145,0.195234,0.011041,0.032696,0.014556
2,2.4559,2.437945,0.176902,0.003541,0.02,0.006017
3,2.4276,2.404687,0.176902,0.003538,0.02,0.006012
4,2.3877,2.353479,0.176902,0.003538,0.02,0.006012
5,2.3215,2.26642,0.176902,0.003538,0.02,0.006012
6,2.2217,2.167534,0.176902,0.003538,0.02,0.006012
7,2.1458,2.117796,0.176902,0.003538,0.02,0.006012
8,2.1221,2.101029,0.176902,0.003538,0.02,0.006012
9,2.1154,2.090697,0.176902,0.003538,0.02,0.006012
10,2.1016,2.081689,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 14:07:06,190] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 0.00029891977384598987, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2561,2.052189,0.176902,0.003538,0.02,0.006012
2,1.9706,1.848681,0.339138,0.033885,0.07006,0.043666
3,1.7794,1.690228,0.387718,0.042175,0.079096,0.050271
4,1.6341,1.570246,0.405133,0.058072,0.087114,0.061321
5,1.5322,1.479182,0.44363,0.062561,0.101668,0.074931
6,1.425,1.398753,0.472961,0.092506,0.115969,0.093555
7,1.3596,1.340736,0.504125,0.121438,0.13368,0.108663
8,1.2894,1.283037,0.506874,0.114238,0.133569,0.109431
9,1.2294,1.229365,0.544455,0.124972,0.155529,0.130587
10,1.1625,1.184098,0.552704,0.154707,0.162456,0.139365


[I 2025-03-15 14:07:54,804] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.00041087915453240814, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1546,1.918806,0.338222,0.037732,0.073021,0.047211
2,1.704,1.543781,0.433547,0.068082,0.096519,0.068296
3,1.4357,1.334446,0.495875,0.125799,0.126199,0.100312
4,1.2479,1.208546,0.543538,0.134548,0.153057,0.131418
5,1.1075,1.106753,0.588451,0.159368,0.18363,0.159682
6,0.9613,1.029469,0.614115,0.177138,0.214381,0.189106
7,0.8414,0.936546,0.656279,0.277081,0.25819,0.243711
8,0.7454,0.894053,0.67736,0.35435,0.283619,0.283525
9,0.6616,0.846961,0.698442,0.362977,0.319035,0.319129
10,0.5769,0.815312,0.705775,0.353596,0.328026,0.32836


[I 2025-03-15 14:08:50,992] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.002041934417684722, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8051,1.411025,0.474794,0.103228,0.117587,0.091054
2,1.2104,1.106493,0.574702,0.189085,0.188193,0.167106
3,0.8669,0.855493,0.702108,0.354888,0.332738,0.325233
4,0.5934,0.754495,0.72044,0.37381,0.370299,0.365404
5,0.4047,0.672618,0.746104,0.43585,0.411658,0.408043
6,0.2847,0.643429,0.769019,0.519029,0.481988,0.489242
7,0.2008,0.628652,0.773602,0.564916,0.500692,0.510964
8,0.1563,0.623971,0.780935,0.606242,0.555811,0.561906
9,0.1264,0.613615,0.784601,0.687664,0.600664,0.621136
10,0.1084,0.608617,0.796517,0.704132,0.648891,0.658141


[I 2025-03-15 14:10:24,073] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.00318176128710325, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7296,1.318856,0.505958,0.127074,0.139372,0.1194
2,1.0865,0.958251,0.662695,0.280676,0.279669,0.26225
3,0.7202,0.771777,0.722273,0.351224,0.36123,0.340492
4,0.4631,0.672191,0.752521,0.451519,0.42942,0.428945
5,0.2998,0.629721,0.780018,0.544039,0.491663,0.497509
6,0.2147,0.621503,0.773602,0.561164,0.515624,0.526484
7,0.1537,0.594934,0.791934,0.625928,0.590273,0.595973
8,0.1171,0.587144,0.799267,0.667175,0.626258,0.632786
9,0.0952,0.57732,0.805683,0.681461,0.644334,0.651446
10,0.0854,0.574426,0.79835,0.673846,0.647031,0.648555


[I 2025-03-15 14:12:04,315] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.0030035251452626105, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8154,1.389825,0.466544,0.097522,0.121316,0.096488
2,1.1653,1.021462,0.621448,0.226121,0.234212,0.215552
3,0.7927,0.804152,0.710357,0.336059,0.349452,0.332558
4,0.5201,0.716044,0.730522,0.419888,0.400134,0.397563
5,0.3432,0.650338,0.76352,0.46454,0.464729,0.4544
6,0.2434,0.622352,0.784601,0.541371,0.515621,0.515357
7,0.174,0.60952,0.782768,0.606293,0.546768,0.558657
8,0.1307,0.597439,0.791934,0.642522,0.589884,0.598413
9,0.1073,0.59007,0.796517,0.720491,0.672797,0.679664
10,0.0934,0.589844,0.789184,0.68849,0.643747,0.649837


[I 2025-03-15 14:14:29,584] Trial 12 finished with value: 0.7051451091498009 and parameters: {'learning_rate': 0.0030035251452626105, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 12 with value: 0.7051451091498009.


Trial 13 with params: {'learning_rate': 0.0013883069209569172, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9994,1.604733,0.405133,0.062413,0.089568,0.064715
2,1.4008,1.245684,0.525206,0.106446,0.145775,0.119227
3,1.1085,1.019114,0.613199,0.210959,0.208639,0.190526
4,0.8356,0.885469,0.687443,0.327226,0.296183,0.290215
5,0.6334,0.768014,0.719523,0.346412,0.342772,0.336816
6,0.4656,0.716361,0.746104,0.447292,0.410104,0.417939
7,0.3429,0.693633,0.754354,0.482376,0.432454,0.440981
8,0.2591,0.676243,0.764436,0.47993,0.487771,0.476529
9,0.2013,0.656633,0.772686,0.537595,0.507504,0.504204
10,0.1677,0.648565,0.771769,0.570961,0.52407,0.531128


[I 2025-03-15 14:15:18,036] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.003879925621399434, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8098,1.357471,0.485793,0.120274,0.128046,0.106721
2,1.1245,1.04099,0.626948,0.265376,0.249519,0.235209
3,0.7497,0.789703,0.711274,0.342253,0.343761,0.334128
4,0.467,0.692216,0.743355,0.409197,0.412662,0.40086
5,0.303,0.653585,0.766269,0.473714,0.467925,0.461152
6,0.2065,0.628643,0.780018,0.535715,0.535215,0.527012
7,0.1454,0.616885,0.785518,0.668187,0.606896,0.620604
8,0.1142,0.605244,0.796517,0.684334,0.652191,0.656338
9,0.097,0.595214,0.793767,0.681428,0.63993,0.646564
10,0.0879,0.595557,0.792851,0.725266,0.657315,0.675645


[I 2025-03-15 14:17:38,015] Trial 14 finished with value: 0.6861447089350974 and parameters: {'learning_rate': 0.003879925621399434, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 12 with value: 0.7051451091498009.


Trial 15 with params: {'learning_rate': 0.0038319136984624577, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8006,1.344745,0.48121,0.114098,0.130461,0.105237
2,1.0982,0.967423,0.655362,0.266019,0.276026,0.256778
3,0.6913,0.744961,0.738772,0.363389,0.387448,0.36824
4,0.4301,0.6621,0.759853,0.449553,0.457785,0.448188
5,0.271,0.627575,0.776352,0.540257,0.505567,0.508187
6,0.1851,0.610329,0.785518,0.636545,0.563263,0.579181
7,0.1331,0.594672,0.791934,0.66028,0.600699,0.617608
8,0.104,0.583576,0.79835,0.680967,0.63939,0.648273
9,0.0917,0.580786,0.80385,0.727603,0.659694,0.675961
10,0.0847,0.575229,0.7956,0.704408,0.633486,0.652305


[I 2025-03-15 14:18:26,061] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.0014669389537906592, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9808,1.566221,0.412466,0.063289,0.092766,0.068653
2,1.3687,1.216759,0.534372,0.10644,0.150329,0.122092
3,1.0653,0.988802,0.632447,0.25539,0.230381,0.216325
4,0.7801,0.851533,0.688359,0.347588,0.312159,0.313523
5,0.5812,0.74414,0.730522,0.394056,0.368952,0.360445
6,0.4199,0.691274,0.748854,0.458702,0.424419,0.427169
7,0.3053,0.666732,0.75802,0.521501,0.448897,0.459502
8,0.2368,0.659234,0.773602,0.555554,0.514897,0.517432
9,0.1858,0.644989,0.777269,0.589685,0.534392,0.543595
10,0.1557,0.633547,0.770852,0.601943,0.544091,0.554533


[I 2025-03-15 14:19:15,118] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.0029594436387712733, 'weight_decay': 0.002, 'adam_beta1': 0.97, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8112,1.397183,0.472044,0.103178,0.11864,0.095934
2,1.2327,1.107592,0.593951,0.196147,0.201604,0.178076
3,0.9139,0.907333,0.665445,0.276167,0.278146,0.257974
4,0.6597,0.805607,0.705775,0.354059,0.337519,0.330669
5,0.4731,0.732983,0.731439,0.390345,0.392948,0.38315
6,0.3317,0.708051,0.746104,0.456103,0.443695,0.437794
7,0.2369,0.658205,0.773602,0.504013,0.492114,0.490094
8,0.1814,0.652173,0.779102,0.558324,0.551264,0.540898
9,0.1397,0.648989,0.782768,0.61955,0.568764,0.580236
10,0.1156,0.622481,0.790101,0.673082,0.637042,0.643919


[I 2025-03-15 14:20:01,398] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.001397101869569915, 'weight_decay': 0.001, 'adam_beta1': 0.98, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0352,1.664286,0.406049,0.072034,0.089779,0.066189
2,1.5018,1.374813,0.48396,0.103434,0.123544,0.100791
3,1.2784,1.211578,0.537122,0.12148,0.151895,0.125852
4,1.0712,1.05115,0.60495,0.205319,0.204199,0.181678
5,0.8812,0.926231,0.654445,0.260363,0.255581,0.239606
6,0.7164,0.843227,0.690192,0.295112,0.311771,0.296907
7,0.5743,0.801138,0.714024,0.382877,0.361489,0.356354
8,0.4595,0.742949,0.734189,0.377611,0.383182,0.372779
9,0.3662,0.728008,0.740605,0.400246,0.406114,0.395828
10,0.2939,0.707205,0.747021,0.451664,0.451041,0.4388


[I 2025-03-15 14:20:48,078] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.0023474026351250226, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8493,1.413127,0.458295,0.096021,0.114988,0.091658
2,1.199,1.098362,0.587534,0.203463,0.203868,0.189213
3,0.8312,0.826389,0.710357,0.341398,0.33722,0.328145
4,0.5538,0.711687,0.737855,0.402924,0.393699,0.389081
5,0.3712,0.646104,0.769936,0.513601,0.482841,0.485794
6,0.2622,0.630439,0.769936,0.504292,0.47463,0.47886
7,0.1872,0.613522,0.784601,0.606153,0.537466,0.554839
8,0.1452,0.602887,0.791017,0.617832,0.591178,0.591759
9,0.1152,0.578913,0.796517,0.671412,0.616342,0.630793
10,0.1008,0.584264,0.796517,0.682361,0.639557,0.646555


[I 2025-03-15 14:22:25,801] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 0.004738885014068501, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7094,1.278108,0.513291,0.133326,0.145151,0.122341
2,1.0462,0.93459,0.657195,0.261694,0.275117,0.259723
3,0.6809,0.734334,0.727773,0.355733,0.363026,0.352062
4,0.4192,0.663369,0.76352,0.444933,0.447589,0.436156
5,0.2716,0.625515,0.780018,0.544785,0.516549,0.514052
6,0.1825,0.586843,0.796517,0.667396,0.61965,0.628723
7,0.1348,0.579627,0.804766,0.695718,0.637578,0.646698
8,0.1065,0.573405,0.8011,0.662453,0.629888,0.628971
9,0.0915,0.571754,0.799267,0.671609,0.638512,0.638983
10,0.0849,0.567673,0.804766,0.690744,0.654648,0.655762


[I 2025-03-15 14:24:56,602] Trial 20 finished with value: 0.7094094274937417 and parameters: {'learning_rate': 0.004738885014068501, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 5.5}. Best is trial 20 with value: 0.7094094274937417.


Trial 21 with params: {'learning_rate': 0.002385624444818277, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9222,1.48834,0.425298,0.084767,0.098509,0.077853
2,1.2918,1.148517,0.571036,0.121025,0.173788,0.140319
3,0.9583,0.936828,0.658112,0.25424,0.265607,0.251141
4,0.6672,0.786254,0.716774,0.381507,0.354479,0.352444
5,0.4666,0.709764,0.739688,0.407598,0.402485,0.392426
6,0.3392,0.663204,0.770852,0.476698,0.471994,0.466174
7,0.2421,0.65135,0.766269,0.495816,0.483952,0.479103
8,0.1852,0.640649,0.776352,0.551077,0.531192,0.525789
9,0.1449,0.626828,0.781852,0.618967,0.575986,0.5804
10,0.1195,0.627857,0.781852,0.624526,0.584345,0.588219


[I 2025-03-15 14:25:47,532] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.003617279293760323, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7734,1.348047,0.484876,0.091132,0.122784,0.097003
2,1.1575,1.053043,0.623281,0.218684,0.247149,0.22012
3,0.8044,0.822814,0.703025,0.323356,0.335589,0.318541
4,0.5292,0.725371,0.739688,0.409013,0.400323,0.385143
5,0.3419,0.674187,0.759853,0.512953,0.471362,0.470491
6,0.2337,0.635674,0.774519,0.532298,0.514841,0.509383
7,0.1621,0.61242,0.789184,0.595694,0.548207,0.551293
8,0.1287,0.605585,0.787351,0.653021,0.606038,0.61321
9,0.1052,0.603445,0.790101,0.649432,0.622714,0.624453
10,0.0915,0.603358,0.789184,0.661125,0.629079,0.630401


[I 2025-03-15 14:26:36,861] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.0029065997051752517, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.7000000000000001, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8189,1.386194,0.48396,0.109668,0.127595,0.102511
2,1.1649,1.025991,0.629698,0.228031,0.236707,0.216833
3,0.8046,0.80529,0.705775,0.342682,0.337186,0.326563
4,0.5286,0.709873,0.735105,0.399072,0.392426,0.384733
5,0.3516,0.652625,0.75802,0.49034,0.466023,0.461258
6,0.2454,0.625991,0.769019,0.558218,0.506308,0.516717
7,0.1802,0.622346,0.785518,0.58386,0.515723,0.52828
8,0.1464,0.611342,0.787351,0.642007,0.600435,0.608032
9,0.1143,0.592918,0.799267,0.661636,0.628066,0.634701
10,0.0997,0.600283,0.792851,0.679144,0.631375,0.639319


[I 2025-03-15 14:29:09,653] Trial 23 finished with value: 0.6759528144849816 and parameters: {'learning_rate': 0.0029065997051752517, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.7000000000000001, 'temperature': 6.0}. Best is trial 20 with value: 0.7094094274937417.


Trial 24 with params: {'learning_rate': 0.001903640916105879, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9097,1.496305,0.428048,0.101666,0.10043,0.082006
2,1.2891,1.160764,0.557287,0.145508,0.167227,0.145102
3,0.9504,0.915012,0.671861,0.319289,0.290552,0.281021
4,0.6634,0.784792,0.71494,0.373509,0.356092,0.35103
5,0.4649,0.683627,0.75527,0.463111,0.426212,0.428199
6,0.3249,0.659256,0.758937,0.46626,0.445963,0.445185
7,0.2363,0.645063,0.766269,0.55881,0.486138,0.499965
8,0.1816,0.629681,0.780935,0.584246,0.547342,0.54782
9,0.1414,0.616611,0.784601,0.630223,0.567488,0.582439
10,0.1167,0.602148,0.786434,0.631151,0.580752,0.590145


[I 2025-03-15 14:30:51,075] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.003092578467138645, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8192,1.39674,0.466544,0.097431,0.115266,0.091802
2,1.2084,1.087356,0.591201,0.215972,0.207757,0.18772
3,0.8673,0.872586,0.676444,0.305972,0.300176,0.289646
4,0.6037,0.757633,0.72044,0.384,0.366936,0.360261
5,0.4103,0.682943,0.753437,0.485549,0.433121,0.434046
6,0.286,0.65652,0.769019,0.514195,0.500761,0.498609
7,0.1973,0.622751,0.778185,0.584317,0.52873,0.537391
8,0.1508,0.618121,0.779102,0.615135,0.558171,0.568973
9,0.1198,0.61362,0.784601,0.652454,0.604068,0.613031
10,0.101,0.612352,0.788268,0.680014,0.634635,0.646129


[I 2025-03-15 14:32:41,018] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.0035740170600873423, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8738,1.456337,0.446379,0.107491,0.109001,0.085761
2,1.2272,1.114098,0.582951,0.184661,0.201155,0.175215
3,0.8898,0.879841,0.687443,0.327006,0.312753,0.297831
4,0.6002,0.763027,0.721357,0.378032,0.371233,0.359883
5,0.4147,0.703864,0.741522,0.442507,0.43285,0.419185
6,0.2861,0.668928,0.762603,0.498403,0.505557,0.489501
7,0.2023,0.628921,0.769936,0.554398,0.522781,0.526113
8,0.1569,0.62423,0.780935,0.625052,0.605326,0.601657
9,0.1259,0.619338,0.786434,0.631681,0.6123,0.611265
10,0.1063,0.60841,0.790101,0.693405,0.643057,0.651774


[I 2025-03-15 14:35:21,493] Trial 26 finished with value: 0.6954977054119237 and parameters: {'learning_rate': 0.0035740170600873423, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 20 with value: 0.7094094274937417.


Trial 27 with params: {'learning_rate': 0.0035007024888498452, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8667,1.445504,0.450962,0.1081,0.111195,0.086525
2,1.2298,1.123817,0.585701,0.207512,0.199509,0.178526
3,0.887,0.877625,0.684693,0.327345,0.31651,0.303742
4,0.5983,0.752534,0.727773,0.368413,0.374774,0.362997
5,0.4129,0.685642,0.747938,0.450386,0.44287,0.430956
6,0.285,0.653103,0.75802,0.506075,0.492279,0.487203
7,0.2016,0.628213,0.769936,0.528078,0.50455,0.504264
8,0.155,0.613851,0.778185,0.590271,0.564998,0.56397
9,0.1223,0.61653,0.786434,0.652496,0.612179,0.621797
10,0.1039,0.606116,0.789184,0.682837,0.619992,0.637002


[I 2025-03-15 14:37:14,118] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.003094589973066247, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8341,1.422365,0.452796,0.11926,0.111954,0.090628
2,1.2073,1.09615,0.583868,0.167481,0.191749,0.167299
3,0.871,0.868706,0.676444,0.306354,0.298327,0.288985
4,0.5897,0.743124,0.726856,0.373934,0.372666,0.360541
5,0.4061,0.679627,0.753437,0.461084,0.443502,0.432733
6,0.2861,0.652999,0.768103,0.504801,0.506592,0.494087
7,0.2014,0.619187,0.776352,0.585048,0.54076,0.546051
8,0.1561,0.615658,0.782768,0.614612,0.581133,0.585997
9,0.1256,0.61207,0.785518,0.659814,0.619579,0.626967
10,0.1065,0.607061,0.794684,0.671803,0.64391,0.646333


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-15 14:40:32,799] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 0.00043967826704493964, 'weight_decay': 0.001, 'adam_beta1': 0.99, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2075,2.023371,0.251146,0.035557,0.041293,0.026025
2,1.8748,1.740484,0.371219,0.036262,0.074305,0.046641
3,1.6643,1.583791,0.406966,0.059228,0.087225,0.060966
4,1.5252,1.462788,0.463795,0.086989,0.111244,0.087051
5,1.4173,1.366637,0.486709,0.120935,0.123013,0.101266
6,1.3079,1.288459,0.514207,0.116426,0.138642,0.112212
7,1.223,1.218463,0.542621,0.125097,0.156188,0.131118
8,1.1458,1.160476,0.566453,0.163255,0.17021,0.146057
9,1.0685,1.104788,0.590284,0.158967,0.191435,0.164737
10,0.9947,1.055804,0.600367,0.184966,0.199617,0.180499


[I 2025-03-15 14:42:28,067] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.004957413564976425, 'weight_decay': 0.005, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7591,1.339408,0.491292,0.090304,0.127232,0.099716
2,1.1728,1.053079,0.598533,0.185723,0.209016,0.184302
3,0.8711,0.921822,0.67736,0.314653,0.316449,0.306342
4,0.6387,0.816138,0.715857,0.358734,0.360319,0.348139
5,0.4805,0.751644,0.738772,0.395558,0.406589,0.387644
6,0.3612,0.704305,0.751604,0.447198,0.435126,0.425928
7,0.2705,0.701872,0.750687,0.477407,0.465634,0.457273
8,0.2174,0.682468,0.76352,0.516146,0.499973,0.499704
9,0.1744,0.654386,0.775435,0.58888,0.558496,0.562554
10,0.146,0.651148,0.776352,0.613041,0.58936,0.587633


[I 2025-03-15 14:45:01,729] Trial 30 finished with value: 0.6946613139298718 and parameters: {'learning_rate': 0.004957413564976425, 'weight_decay': 0.005, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}. Best is trial 20 with value: 0.7094094274937417.


Trial 31 with params: {'learning_rate': 0.0027019466079757064, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8426,1.461573,0.448213,0.10021,0.109092,0.085389
2,1.263,1.126324,0.570119,0.124735,0.175956,0.141302
3,0.9482,0.934565,0.663611,0.292733,0.271653,0.256961
4,0.6839,0.804812,0.702108,0.352872,0.339123,0.33452
5,0.4832,0.711004,0.743355,0.409906,0.400143,0.389364
6,0.337,0.655512,0.759853,0.499327,0.46857,0.466454
7,0.2338,0.632908,0.771769,0.508481,0.481263,0.479863
8,0.1785,0.621318,0.794684,0.626732,0.578169,0.587291
9,0.1401,0.617801,0.782768,0.629993,0.574252,0.588257
10,0.1161,0.609572,0.789184,0.691722,0.615408,0.634483


[I 2025-03-15 14:46:43,434] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.003800740056483792, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8432,1.441494,0.464711,0.084814,0.114426,0.087448
2,1.2652,1.154711,0.560037,0.120559,0.166584,0.136224
3,1.0017,1.007616,0.631531,0.248994,0.242387,0.227143
4,0.7709,0.885318,0.67736,0.350415,0.310424,0.300324
5,0.6044,0.827109,0.711274,0.350458,0.364279,0.347969
6,0.4621,0.764307,0.712191,0.368215,0.380736,0.364769
7,0.3501,0.737232,0.731439,0.421022,0.42425,0.410782
8,0.2796,0.707663,0.751604,0.492527,0.480272,0.476683
9,0.2222,0.695508,0.756187,0.48367,0.493374,0.479149
10,0.183,0.685476,0.762603,0.543815,0.521122,0.520293


[I 2025-03-15 14:47:30,816] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 0.0009890942390504422, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0953,1.756102,0.371219,0.03942,0.07582,0.050538
2,1.5984,1.491077,0.452796,0.086029,0.10803,0.085895
3,1.3781,1.291932,0.506874,0.132323,0.136677,0.11167
4,1.2011,1.160038,0.557287,0.143836,0.166802,0.140454
5,1.0314,1.027172,0.604033,0.166732,0.200728,0.174523
6,0.8665,0.934489,0.661778,0.266923,0.256111,0.243289
7,0.7249,0.865363,0.683776,0.325558,0.301485,0.294982
8,0.6071,0.816289,0.711274,0.351498,0.344344,0.335487
9,0.5014,0.775538,0.734189,0.371926,0.390384,0.376234
10,0.4118,0.747303,0.729606,0.400261,0.401914,0.391569


[I 2025-03-15 14:49:09,648] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.0017772731668437099, 'weight_decay': 0.005, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9623,1.599724,0.421632,0.065507,0.09463,0.07035
2,1.4297,1.322806,0.48396,0.082173,0.121559,0.090146
3,1.2204,1.171549,0.554537,0.146168,0.165642,0.140551
4,1.0109,1.039328,0.615949,0.213043,0.228617,0.210751
5,0.8491,0.922188,0.651696,0.252952,0.270589,0.251064


[I 2025-03-15 14:49:33,321] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.004409161417631827, 'weight_decay': 0.006, 'adam_beta1': 0.99, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8352,1.429048,0.452796,0.102924,0.111605,0.090325
2,1.2529,1.122939,0.571952,0.1554,0.178954,0.155999
3,0.953,0.959122,0.646196,0.276378,0.261718,0.24682
4,0.732,0.849767,0.679193,0.30666,0.30793,0.294367
5,0.5665,0.804727,0.712191,0.361808,0.367674,0.350889


[I 2025-03-15 14:49:58,691] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.0008045466606684004, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.107,1.780767,0.379468,0.037741,0.077696,0.049887
2,1.5964,1.448093,0.459212,0.09031,0.108209,0.082272
3,1.3527,1.266645,0.512374,0.144442,0.140809,0.115336
4,1.1466,1.110074,0.5967,0.160004,0.193627,0.164131
5,0.9612,0.968565,0.637947,0.19155,0.226328,0.20025
6,0.796,0.872975,0.678277,0.31754,0.283037,0.270061
7,0.6537,0.838753,0.699358,0.369247,0.317856,0.318411
8,0.5398,0.768539,0.731439,0.389951,0.360113,0.360077
9,0.4355,0.723603,0.742438,0.427615,0.398344,0.399103
10,0.3494,0.713277,0.745188,0.435929,0.420777,0.414438


[I 2025-03-15 14:51:34,907] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 2.197945691935017e-05, 'weight_decay': 0.007, 'adam_beta1': 0.97, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4679,2.445916,0.176902,0.003538,0.02,0.006012
2,2.4307,2.400389,0.176902,0.003538,0.02,0.006012
3,2.3731,2.325145,0.176902,0.003538,0.02,0.006012
4,2.274,2.203958,0.176902,0.003538,0.02,0.006012
5,2.1693,2.112636,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 14:51:59,325] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.003662945698360812, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7965,1.407434,0.464711,0.100545,0.113625,0.090021
2,1.2388,1.107777,0.576535,0.171742,0.188978,0.16472
3,0.9468,0.961234,0.648029,0.28579,0.26578,0.247415
4,0.7264,0.864042,0.68286,0.335432,0.321325,0.306353
5,0.5715,0.792384,0.721357,0.381411,0.376475,0.36521
6,0.4386,0.727427,0.737855,0.401143,0.408185,0.397354
7,0.3364,0.704885,0.750687,0.423324,0.438394,0.422665
8,0.2682,0.680948,0.76352,0.463043,0.474896,0.461338
9,0.2131,0.663109,0.774519,0.511646,0.521349,0.506456
10,0.1764,0.65212,0.777269,0.578362,0.556045,0.55407


[I 2025-03-15 14:52:49,902] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.0034932489254266962, 'weight_decay': 0.007, 'adam_beta1': 0.99, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8458,1.447577,0.471127,0.105133,0.114537,0.088208
2,1.2811,1.173082,0.544455,0.125379,0.159987,0.134642
3,1.0202,1.011137,0.637947,0.247761,0.258446,0.239159
4,0.7819,0.888355,0.668194,0.338794,0.309012,0.298156
5,0.6231,0.820415,0.704858,0.344613,0.3493,0.332247


[I 2025-03-15 14:53:13,376] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 1.1139092500128487e-05, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4756,2.463527,0.167736,0.013305,0.030125,0.014216
2,2.46,2.444364,0.176902,0.003601,0.02,0.006103
3,2.4377,2.419961,0.176902,0.003538,0.02,0.006012
4,2.4108,2.386893,0.176902,0.003538,0.02,0.006012
5,2.3693,2.337356,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 14:53:40,613] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.0020532439756133026, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8776,1.498193,0.432631,0.089916,0.103161,0.083707
2,1.2928,1.12947,0.572869,0.125325,0.177405,0.144755
3,0.9622,0.927608,0.666361,0.311668,0.284327,0.273884
4,0.6778,0.784414,0.716774,0.357164,0.348985,0.34564
5,0.4855,0.701672,0.746104,0.389822,0.399539,0.385996
6,0.3486,0.669524,0.759853,0.483659,0.454842,0.448926
7,0.2458,0.64897,0.768103,0.524215,0.480076,0.483957
8,0.1903,0.634391,0.774519,0.589306,0.528546,0.535428
9,0.1475,0.621839,0.788268,0.604703,0.562328,0.565304
10,0.1243,0.606767,0.789184,0.661906,0.608272,0.616966


[I 2025-03-15 14:56:06,202] Trial 41 finished with value: 0.6663398512416704 and parameters: {'learning_rate': 0.0020532439756133026, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 7.0}. Best is trial 20 with value: 0.7094094274937417.


Trial 42 with params: {'learning_rate': 4.712098624605705e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4458,2.382897,0.176902,0.003538,0.02,0.006012
2,2.2617,2.116382,0.176902,0.003538,0.02,0.006012
3,2.1092,2.067204,0.176902,0.003538,0.02,0.006012
4,2.0516,1.999857,0.182401,0.014642,0.021585,0.008642
5,1.9815,1.918544,0.307058,0.028884,0.054126,0.036247
6,1.8837,1.846795,0.355637,0.040542,0.068661,0.043034
7,1.8225,1.787996,0.366636,0.041615,0.072141,0.046853
8,1.7708,1.738282,0.376719,0.03985,0.075745,0.048882
9,1.7281,1.69641,0.377635,0.039522,0.07636,0.049532
10,1.6839,1.661928,0.395967,0.041602,0.082317,0.053017


[I 2025-03-15 14:56:54,653] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.003838555887827034, 'weight_decay': 0.001, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9005,1.48293,0.439963,0.09004,0.108113,0.087997
2,1.3064,1.189444,0.536205,0.123888,0.155431,0.127106
3,1.0321,1.013791,0.623281,0.203908,0.223003,0.20141
4,0.787,0.871133,0.692942,0.35099,0.320319,0.31107
5,0.6016,0.819277,0.706691,0.351017,0.354776,0.339773


[I 2025-03-15 14:57:20,121] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.0037530996496574413, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8251,1.36193,0.476627,0.140231,0.124919,0.099462
2,1.1361,1.027807,0.630614,0.23475,0.256224,0.23084
3,0.7641,0.783014,0.715857,0.342977,0.354063,0.342057
4,0.4778,0.692678,0.746104,0.447003,0.423181,0.41955
5,0.3053,0.660648,0.766269,0.491553,0.492073,0.482533
6,0.2191,0.641561,0.773602,0.593495,0.536573,0.544903
7,0.1586,0.625852,0.782768,0.621632,0.586741,0.590908
8,0.1216,0.610422,0.790101,0.661348,0.635433,0.63925
9,0.0993,0.601132,0.7956,0.660382,0.635176,0.637899
10,0.0886,0.602716,0.792851,0.684863,0.64687,0.654273


[I 2025-03-15 14:59:00,799] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 0.004648270387466142, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8128,1.334984,0.487626,0.104688,0.129262,0.104601
2,1.1429,1.036721,0.626031,0.252162,0.245013,0.226867
3,0.7671,0.799167,0.700275,0.32789,0.334866,0.317171
4,0.484,0.71346,0.737855,0.411131,0.413095,0.399594
5,0.3057,0.65633,0.769936,0.521192,0.503046,0.500373
6,0.2137,0.633615,0.777269,0.5863,0.565278,0.560936
7,0.1523,0.606736,0.797434,0.671138,0.647004,0.645294
8,0.1249,0.605656,0.790101,0.678268,0.661448,0.656361
9,0.1023,0.595148,0.8011,0.741187,0.682559,0.691403
10,0.0901,0.586871,0.8011,0.716615,0.674947,0.678777


[I 2025-03-15 15:01:40,138] Trial 45 finished with value: 0.7320126206223214 and parameters: {'learning_rate': 0.004648270387466142, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 45 with value: 0.7320126206223214.


Trial 46 with params: {'learning_rate': 0.0035448851910578347, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7441,1.33315,0.491292,0.113177,0.132779,0.109117
2,1.1276,1.005327,0.637947,0.238448,0.257519,0.236541
3,0.7539,0.776458,0.71769,0.348733,0.352861,0.339909
4,0.4788,0.688281,0.745188,0.42459,0.417344,0.405866
5,0.3124,0.65505,0.76077,0.538567,0.491899,0.495178
6,0.2193,0.622379,0.780935,0.581532,0.543308,0.544887
7,0.1549,0.602657,0.784601,0.673855,0.606081,0.619944
8,0.1185,0.602148,0.789184,0.66218,0.63228,0.636015
9,0.0986,0.592034,0.794684,0.658668,0.631264,0.632639
10,0.0881,0.58986,0.789184,0.660492,0.629128,0.630938


[I 2025-03-15 15:02:30,756] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.004673032009859981, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7745,1.33118,0.501375,0.11153,0.134578,0.105642
2,1.1434,1.001728,0.632447,0.243519,0.24651,0.222432
3,0.7803,0.826693,0.699358,0.321921,0.342127,0.316925
4,0.5277,0.747359,0.727773,0.387935,0.391003,0.375411
5,0.3614,0.692275,0.756187,0.470975,0.46765,0.454062
6,0.2563,0.651935,0.771769,0.548745,0.522906,0.518546
7,0.1799,0.631797,0.778185,0.56411,0.562621,0.551691
8,0.1396,0.607597,0.787351,0.6095,0.597679,0.59095
9,0.1107,0.598224,0.790101,0.663008,0.628205,0.632342
10,0.096,0.590515,0.793767,0.663834,0.635973,0.637956


[I 2025-03-15 15:04:23,712] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.0038364943678061707, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8215,1.359702,0.484876,0.117583,0.127437,0.105503
2,1.1367,1.015231,0.63703,0.231748,0.261343,0.233691
3,0.7658,0.793541,0.71494,0.337829,0.342059,0.331469
4,0.4873,0.702704,0.736022,0.405022,0.401857,0.394399
5,0.3163,0.665214,0.754354,0.505952,0.473155,0.470155
6,0.2236,0.636725,0.766269,0.536235,0.513872,0.514565
7,0.1603,0.625835,0.775435,0.621719,0.580345,0.589249
8,0.1261,0.611022,0.780018,0.637027,0.62357,0.61555
9,0.1032,0.610698,0.780018,0.662315,0.623658,0.633382
10,0.0915,0.606235,0.778185,0.6635,0.633969,0.637906


[I 2025-03-15 15:05:14,161] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 2.4721217192981437e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4649,2.43837,0.176902,0.003538,0.02,0.006012
2,2.4106,2.359408,0.176902,0.003538,0.02,0.006012
3,2.2866,2.175885,0.176902,0.003538,0.02,0.006012
4,2.1334,2.099452,0.176902,0.003538,0.02,0.006012
5,2.1108,2.077706,0.176902,0.003538,0.02,0.006012
6,2.0791,2.053593,0.176902,0.003538,0.02,0.006012
7,2.0499,2.016109,0.176902,0.003538,0.02,0.006012
8,2.0081,1.970839,0.305225,0.030234,0.053837,0.036897
9,1.9652,1.932521,0.315307,0.028756,0.056445,0.036759
10,1.9231,1.906269,0.355637,0.049042,0.06758,0.04061


[I 2025-03-15 15:06:57,397] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 0.0013476897900883385, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9818,1.589841,0.414299,0.059823,0.092939,0.067434
2,1.3934,1.242633,0.528873,0.110884,0.146586,0.122059
3,1.1087,1.02687,0.615032,0.246678,0.204919,0.184565
4,0.8417,0.896296,0.67736,0.315747,0.280802,0.273461
5,0.6461,0.773126,0.716774,0.354398,0.342427,0.33396


[I 2025-03-15 15:07:20,867] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.0017090005775926974, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9468,1.544785,0.414299,0.071152,0.093322,0.071546
2,1.3481,1.190519,0.56462,0.129593,0.165838,0.136434
3,1.0377,0.985156,0.632447,0.246705,0.232723,0.218994
4,0.7493,0.839487,0.706691,0.341349,0.331585,0.327078
5,0.5512,0.728026,0.739688,0.367426,0.376143,0.362154
6,0.3955,0.688904,0.757104,0.449855,0.429751,0.427424
7,0.2838,0.660225,0.770852,0.494645,0.475659,0.470703
8,0.2211,0.652041,0.764436,0.515943,0.502256,0.498023
9,0.1691,0.629878,0.777269,0.580093,0.531629,0.541478
10,0.1415,0.632106,0.771769,0.619977,0.554822,0.574096


[I 2025-03-15 15:08:10,423] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 0.004587968919031273, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7212,1.298823,0.525206,0.111495,0.147128,0.121336
2,1.0728,0.956768,0.659028,0.291751,0.278062,0.262839
3,0.6917,0.738947,0.733272,0.369091,0.375884,0.364893
4,0.4221,0.665222,0.752521,0.469248,0.434945,0.437324
5,0.2678,0.623832,0.780018,0.544246,0.508457,0.502061
6,0.177,0.58716,0.802016,0.667429,0.625238,0.63074
7,0.1276,0.580456,0.796517,0.664917,0.640458,0.6379
8,0.1025,0.569975,0.797434,0.671756,0.655891,0.649668
9,0.0889,0.562281,0.80385,0.660213,0.656612,0.648068
10,0.0822,0.571598,0.800183,0.664576,0.653848,0.645661


[I 2025-03-15 15:10:59,417] Trial 52 finished with value: 0.6907807793869513 and parameters: {'learning_rate': 0.004587968919031273, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}. Best is trial 45 with value: 0.7320126206223214.


Trial 53 with params: {'learning_rate': 0.0025672741927678052, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8259,1.422539,0.458295,0.11382,0.11331,0.089647
2,1.2167,1.077929,0.590284,0.208274,0.189347,0.167199
3,0.8713,0.848594,0.692026,0.312221,0.322632,0.307006
4,0.5921,0.751817,0.72594,0.380239,0.377041,0.368849
5,0.4044,0.672012,0.754354,0.468593,0.430321,0.433126
6,0.2857,0.652258,0.767186,0.51466,0.488608,0.487707
7,0.2049,0.631476,0.769936,0.593139,0.5055,0.520184
8,0.1564,0.617374,0.788268,0.627296,0.57445,0.58517
9,0.1215,0.616261,0.783685,0.651875,0.590365,0.605299
10,0.1039,0.60495,0.788268,0.693857,0.638058,0.647384


[I 2025-03-15 15:12:34,229] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 0.002464615509766315, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8588,1.461853,0.447296,0.113328,0.109504,0.086652
2,1.2386,1.133413,0.575619,0.191922,0.189154,0.169419
3,0.8979,0.881514,0.687443,0.315784,0.312739,0.302363
4,0.6115,0.754241,0.733272,0.38662,0.383698,0.375466
5,0.4143,0.678285,0.756187,0.503734,0.455777,0.463375
6,0.2943,0.671539,0.757104,0.513717,0.461024,0.469198
7,0.2174,0.638633,0.766269,0.605866,0.501588,0.523448
8,0.1654,0.620364,0.773602,0.57779,0.53989,0.54599
9,0.128,0.613851,0.779102,0.65394,0.597473,0.608733
10,0.1082,0.616065,0.776352,0.656618,0.604015,0.612971


[I 2025-03-15 15:14:55,209] Trial 54 finished with value: 0.6659930332809211 and parameters: {'learning_rate': 0.002464615509766315, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 45 with value: 0.7320126206223214.


Trial 55 with params: {'learning_rate': 1.6150211615119213e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.9, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4711,2.45436,0.175985,0.004578,0.020434,0.007429
2,2.4455,2.423026,0.176902,0.003538,0.02,0.006012
3,2.4066,2.374637,0.176902,0.003538,0.02,0.006012
4,2.3438,2.293059,0.176902,0.003538,0.02,0.006012
5,2.2479,2.174895,0.176902,0.003538,0.02,0.006012
6,2.145,2.112541,0.176902,0.003538,0.02,0.006012
7,2.1144,2.094259,0.176902,0.003538,0.02,0.006012
8,2.1024,2.081251,0.176902,0.003538,0.02,0.006012
9,2.096,2.069665,0.176902,0.003538,0.02,0.006012
10,2.0796,2.056357,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 15:16:31,279] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 1.2880316899047556e-05, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4729,2.459012,0.197067,0.008595,0.030348,0.012812
2,2.453,2.433936,0.176902,0.003538,0.02,0.006012
3,2.4219,2.396493,0.176902,0.003538,0.02,0.006012
4,2.3752,2.335296,0.176902,0.003538,0.02,0.006012
5,2.2967,2.23229,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 15:16:58,200] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 0.004400409967846641, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.843,1.328025,0.491292,0.101467,0.130089,0.103798
2,1.118,1.030297,0.625115,0.264836,0.261974,0.245415
3,0.7362,0.767996,0.71769,0.358911,0.361649,0.351858
4,0.4424,0.675893,0.75527,0.459593,0.436868,0.432425
5,0.2807,0.64813,0.762603,0.54391,0.508684,0.511169
6,0.1938,0.616009,0.788268,0.623776,0.593122,0.596774
7,0.1358,0.595596,0.793767,0.693166,0.636145,0.650672
8,0.1074,0.604265,0.786434,0.688797,0.626126,0.640396
9,0.0938,0.590052,0.7956,0.690345,0.654723,0.659495
10,0.0871,0.590743,0.791017,0.705718,0.665869,0.670816


[I 2025-03-15 15:19:35,686] Trial 57 finished with value: 0.6900100934865595 and parameters: {'learning_rate': 0.004400409967846641, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 5.5}. Best is trial 45 with value: 0.7320126206223214.


Trial 58 with params: {'learning_rate': 0.000441291210029716, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.156,1.879937,0.313474,0.03359,0.059163,0.038696
2,1.6981,1.542778,0.439047,0.066597,0.098743,0.069406
3,1.442,1.347636,0.494042,0.099392,0.126544,0.096056
4,1.2662,1.207841,0.551787,0.131805,0.165463,0.138443
5,1.1112,1.085747,0.593034,0.169577,0.187375,0.163663
6,0.9593,1.00537,0.631531,0.235259,0.222944,0.199294
7,0.8416,0.941247,0.648029,0.302449,0.245959,0.23366
8,0.7468,0.875956,0.687443,0.358355,0.29519,0.297079
9,0.6543,0.840701,0.695692,0.374047,0.315138,0.321043
10,0.5656,0.802623,0.713107,0.390358,0.3406,0.34454


[I 2025-03-15 15:20:25,328] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 3.781332541286505e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4525,2.410115,0.176902,0.003538,0.02,0.006012
2,2.3532,2.259087,0.176902,0.003538,0.02,0.006012
3,2.1756,2.105868,0.176902,0.003538,0.02,0.006012
4,2.0993,2.071731,0.176902,0.003538,0.02,0.006012
5,2.0786,2.033278,0.176902,0.003538,0.02,0.006012
6,2.0174,1.97194,0.219982,0.025849,0.031637,0.020345
7,1.9453,1.912029,0.320807,0.028213,0.057948,0.037124
8,1.8901,1.859449,0.35472,0.028,0.067341,0.039079
9,1.8485,1.819711,0.363886,0.039742,0.070573,0.043045
10,1.806,1.783968,0.373052,0.041501,0.074351,0.048065


[I 2025-03-15 15:21:17,266] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.003149701938384368, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8438,1.433009,0.451879,0.099137,0.110909,0.086144
2,1.2355,1.096252,0.583868,0.127229,0.182643,0.146304
3,0.9043,0.907922,0.666361,0.282645,0.289214,0.274668
4,0.6344,0.763691,0.726856,0.35864,0.371863,0.3566
5,0.4403,0.716037,0.740605,0.419608,0.417166,0.400962
6,0.31,0.66798,0.766269,0.488519,0.483523,0.474872
7,0.2192,0.64203,0.780018,0.55496,0.506104,0.510345
8,0.1714,0.626977,0.777269,0.594349,0.552074,0.554749
9,0.1376,0.627112,0.780935,0.638032,0.572473,0.587273
10,0.1133,0.617581,0.791934,0.682124,0.63842,0.646352


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-15 15:24:31,299] Trial 60 finished with value: 0.6655456580930779 and parameters: {'learning_rate': 0.003149701938384368, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 6.0}. Best is trial 45 with value: 0.7320126206223214.


Trial 61 with params: {'learning_rate': 0.000533323769958065, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1667,1.918484,0.315307,0.038671,0.067742,0.042884
2,1.675,1.501873,0.448213,0.081984,0.103052,0.075319
3,1.4056,1.300093,0.502291,0.120497,0.132035,0.104076
4,1.2097,1.147138,0.572869,0.156334,0.175284,0.148877
5,1.0382,1.024522,0.609533,0.191363,0.202409,0.178439


[I 2025-03-15 15:24:55,833] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.004928600655696639, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7058,1.288716,0.514207,0.119771,0.145264,0.120581
2,1.0207,0.895162,0.67461,0.313775,0.302582,0.293087
3,0.6411,0.715737,0.745188,0.396304,0.400879,0.387371
4,0.3862,0.642929,0.780935,0.533559,0.504064,0.50402
5,0.2462,0.607377,0.774519,0.535799,0.516748,0.514503
6,0.1649,0.584562,0.792851,0.642313,0.615525,0.613194
7,0.1252,0.570358,0.799267,0.678748,0.633118,0.63727
8,0.0999,0.55646,0.807516,0.736464,0.671902,0.686779
9,0.0866,0.55419,0.811182,0.76231,0.685989,0.705058
10,0.0822,0.549865,0.810266,0.737098,0.678605,0.691179


[I 2025-03-15 15:27:20,685] Trial 62 finished with value: 0.7066002657322474 and parameters: {'learning_rate': 0.004928600655696639, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 45 with value: 0.7320126206223214.


Trial 63 with params: {'learning_rate': 0.00037157711478318914, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2061,1.943506,0.24473,0.041237,0.040743,0.032599
2,1.7492,1.579185,0.420715,0.065904,0.090733,0.060251
3,1.4713,1.372152,0.47846,0.099979,0.117941,0.090457
4,1.2929,1.23169,0.541705,0.130709,0.156533,0.134158
5,1.1528,1.126668,0.570119,0.145655,0.173503,0.147996
6,1.0032,1.062391,0.606783,0.171267,0.208566,0.183123
7,0.8883,0.975787,0.64528,0.243378,0.244373,0.221933
8,0.7974,0.931155,0.661778,0.317824,0.26561,0.260009
9,0.7138,0.886179,0.670027,0.322963,0.282843,0.279007
10,0.6331,0.854694,0.683776,0.356157,0.302596,0.304837


[I 2025-03-15 15:28:05,535] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 0.0020621214501137524, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.859,1.463029,0.442713,0.108189,0.108702,0.090669
2,1.262,1.119591,0.578368,0.150567,0.176877,0.146928
3,0.9321,0.882136,0.68561,0.314983,0.303341,0.292426
4,0.6412,0.768247,0.718607,0.380263,0.361053,0.359632
5,0.4537,0.68552,0.754354,0.41655,0.413756,0.407348
6,0.3244,0.659679,0.761687,0.468011,0.457254,0.453389
7,0.2279,0.642429,0.759853,0.504611,0.474882,0.478338
8,0.1759,0.629115,0.778185,0.553083,0.524963,0.525863
9,0.138,0.622065,0.780018,0.591633,0.55431,0.558888
10,0.1174,0.619101,0.780935,0.671644,0.608489,0.618105


[I 2025-03-15 15:30:29,361] Trial 64 finished with value: 0.6671423386206206 and parameters: {'learning_rate': 0.0020621214501137524, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 7.0}. Best is trial 45 with value: 0.7320126206223214.


Trial 65 with params: {'learning_rate': 0.003878338405671498, 'weight_decay': 0.005, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8549,1.413785,0.453712,0.130297,0.115002,0.095349
2,1.2188,1.072455,0.590284,0.172244,0.201,0.174428
3,0.8686,0.892245,0.681943,0.303873,0.316845,0.29618
4,0.6055,0.777465,0.715857,0.393864,0.371519,0.362184
5,0.4296,0.718854,0.744271,0.418966,0.426481,0.410289
6,0.2954,0.671894,0.751604,0.505413,0.478959,0.478838
7,0.2135,0.643031,0.769936,0.52565,0.507498,0.502541
8,0.1624,0.639924,0.779102,0.524037,0.540383,0.524345
9,0.1263,0.619109,0.786434,0.611873,0.579959,0.585855
10,0.1073,0.615352,0.786434,0.649287,0.629754,0.630183


[I 2025-03-15 15:31:15,516] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 0.003530032079716137, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8021,1.361467,0.472044,0.110861,0.124652,0.093487
2,1.1217,0.98258,0.647113,0.265913,0.263304,0.250148
3,0.7169,0.748469,0.72594,0.366994,0.367202,0.3515
4,0.4599,0.6654,0.75527,0.458034,0.434182,0.430548
5,0.288,0.644913,0.772686,0.543801,0.506333,0.506767
6,0.2152,0.631662,0.779102,0.625946,0.559696,0.574121
7,0.1522,0.610334,0.790101,0.668118,0.598074,0.615939
8,0.1129,0.585645,0.792851,0.685995,0.65424,0.655455
9,0.094,0.58065,0.802933,0.716601,0.654349,0.668871
10,0.0873,0.584309,0.79835,0.710379,0.653782,0.665625


[I 2025-03-15 15:33:39,149] Trial 66 finished with value: 0.7117440225729829 and parameters: {'learning_rate': 0.003530032079716137, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 6.5}. Best is trial 45 with value: 0.7320126206223214.


Trial 67 with params: {'learning_rate': 0.0012625784070723338, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9827,1.5705,0.409716,0.066549,0.091895,0.068017
2,1.3774,1.271206,0.514207,0.144227,0.140801,0.120161
3,1.0906,1.003149,0.633364,0.227763,0.224732,0.204694
4,0.799,0.864003,0.688359,0.361422,0.308707,0.309442
5,0.6028,0.758773,0.725023,0.37164,0.352576,0.347982
6,0.4449,0.714241,0.747938,0.460093,0.41805,0.42129
7,0.3255,0.665079,0.759853,0.499412,0.462106,0.466762
8,0.258,0.660106,0.771769,0.529078,0.502418,0.505623
9,0.1988,0.644157,0.780935,0.593051,0.525562,0.539561
10,0.1647,0.635595,0.770852,0.582058,0.53016,0.543525


[I 2025-03-15 15:35:21,163] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.003499622469201884, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7934,1.358854,0.470211,0.106068,0.123047,0.093803
2,1.1186,0.985442,0.644363,0.272064,0.263818,0.251207
3,0.7216,0.764285,0.715857,0.34248,0.355485,0.340161
4,0.4591,0.679755,0.747938,0.44297,0.424031,0.420917
5,0.2942,0.651357,0.765353,0.51205,0.498795,0.489732
6,0.2177,0.627069,0.786434,0.614127,0.549273,0.562368
7,0.1513,0.601017,0.794684,0.655487,0.595712,0.611313
8,0.1141,0.593547,0.79835,0.701884,0.635731,0.648269
9,0.0957,0.577637,0.806599,0.72304,0.669204,0.677437
10,0.0865,0.583155,0.799267,0.697355,0.649447,0.656327


[I 2025-03-15 15:37:59,149] Trial 68 finished with value: 0.7028378183047864 and parameters: {'learning_rate': 0.003499622469201884, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 7.0}. Best is trial 45 with value: 0.7320126206223214.


Trial 69 with params: {'learning_rate': 0.004376596522685909, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8398,1.326746,0.495875,0.115448,0.132003,0.105225
2,1.1056,1.024113,0.627864,0.25821,0.258171,0.240808
3,0.7116,0.759631,0.724106,0.360021,0.372878,0.359514
4,0.4329,0.665629,0.76077,0.458492,0.447717,0.44118
5,0.2717,0.625985,0.777269,0.566969,0.533326,0.533303
6,0.1891,0.601452,0.794684,0.641822,0.602875,0.604884
7,0.1341,0.601255,0.784601,0.649686,0.614913,0.615791
8,0.105,0.590199,0.7956,0.702707,0.661281,0.671229
9,0.0902,0.579257,0.8011,0.725248,0.665316,0.673959
10,0.0843,0.57731,0.802933,0.738551,0.689067,0.698857


[I 2025-03-15 15:40:23,158] Trial 69 finished with value: 0.7095874397194529 and parameters: {'learning_rate': 0.004376596522685909, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 6.0}. Best is trial 45 with value: 0.7320126206223214.


Trial 70 with params: {'learning_rate': 0.002158956684094103, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8723,1.419793,0.460128,0.096593,0.11043,0.088472
2,1.2168,1.112397,0.593951,0.226221,0.212564,0.188014
3,0.8548,0.844213,0.696609,0.357003,0.332184,0.322453
4,0.5789,0.720844,0.736939,0.383556,0.382006,0.374204
5,0.3894,0.655081,0.762603,0.492591,0.447288,0.448571
6,0.2773,0.635315,0.765353,0.513329,0.464,0.471811
7,0.1973,0.622607,0.773602,0.575013,0.50626,0.520006
8,0.1594,0.617421,0.781852,0.567294,0.549461,0.546115
9,0.127,0.601259,0.779102,0.644971,0.585507,0.597912
10,0.1059,0.602553,0.781852,0.659482,0.600079,0.614424


[I 2025-03-15 15:42:00,372] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.0033307566191298076, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9298,1.445969,0.44088,0.090741,0.106208,0.085062
2,1.2264,1.092935,0.587534,0.127687,0.180555,0.14603
3,0.8519,0.848243,0.692942,0.343933,0.317463,0.313803
4,0.5514,0.728513,0.729606,0.398612,0.390032,0.379495
5,0.363,0.662586,0.758937,0.45349,0.452795,0.44503
6,0.2472,0.641097,0.768103,0.561937,0.50406,0.510531
7,0.1837,0.62413,0.781852,0.583533,0.559902,0.561288
8,0.1404,0.611504,0.780935,0.621065,0.589229,0.590244
9,0.1115,0.597973,0.793767,0.650475,0.630328,0.627085
10,0.0963,0.597847,0.788268,0.697356,0.658954,0.66462


[I 2025-03-15 15:44:26,291] Trial 71 finished with value: 0.6955486664955663 and parameters: {'learning_rate': 0.0033307566191298076, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 6.0}. Best is trial 45 with value: 0.7320126206223214.


Trial 72 with params: {'learning_rate': 0.0022385767461064153, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9034,1.46595,0.446379,0.08925,0.106284,0.082891
2,1.2698,1.133482,0.572869,0.143045,0.180836,0.1522
3,0.9307,0.918037,0.667278,0.307507,0.281157,0.274415
4,0.6484,0.785238,0.714024,0.390513,0.357671,0.356196
5,0.4472,0.69202,0.749771,0.445699,0.414989,0.412854
6,0.3134,0.653927,0.761687,0.478342,0.462107,0.459821
7,0.2221,0.642999,0.76352,0.526458,0.482484,0.48662
8,0.1752,0.628272,0.772686,0.594784,0.540546,0.549186
9,0.1349,0.620334,0.779102,0.630973,0.566819,0.581362
10,0.1133,0.617766,0.782768,0.625151,0.586528,0.591887


[I 2025-03-15 15:46:05,704] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 0.004238208676864204, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8286,1.341494,0.48396,0.098864,0.128131,0.099506
2,1.1134,1.045075,0.615949,0.265529,0.249993,0.229538
3,0.7192,0.763822,0.72319,0.366768,0.366647,0.352194
4,0.4368,0.666421,0.756187,0.479734,0.442823,0.441307
5,0.2751,0.638232,0.775435,0.556139,0.507299,0.513575
6,0.1918,0.624248,0.783685,0.644337,0.60185,0.604579
7,0.1394,0.610807,0.786434,0.673536,0.614681,0.625442
8,0.1093,0.594679,0.792851,0.67329,0.637211,0.63724
9,0.0951,0.597065,0.790101,0.692268,0.645271,0.653936
10,0.0855,0.590403,0.797434,0.727203,0.677125,0.686661


[I 2025-03-15 15:48:28,723] Trial 73 finished with value: 0.7105283542943962 and parameters: {'learning_rate': 0.004238208676864204, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}. Best is trial 45 with value: 0.7320126206223214.


Trial 74 with params: {'learning_rate': 0.00440439862986655, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8346,1.315893,0.491292,0.135296,0.134017,0.110594
2,1.0676,0.957989,0.653529,0.286513,0.288345,0.274928
3,0.6604,0.729861,0.740605,0.406723,0.390918,0.382407
4,0.4056,0.641982,0.770852,0.515633,0.499047,0.491411
5,0.252,0.623916,0.781852,0.584802,0.527919,0.539275
6,0.1737,0.591825,0.79835,0.671747,0.615447,0.625951
7,0.1233,0.591996,0.793767,0.675574,0.637186,0.641389
8,0.0988,0.582542,0.799267,0.711758,0.660398,0.667696
9,0.0859,0.570798,0.805683,0.712843,0.660218,0.670464
10,0.0805,0.575172,0.799267,0.751245,0.681707,0.697466


[I 2025-03-15 15:50:57,644] Trial 74 finished with value: 0.7057379419825852 and parameters: {'learning_rate': 0.00440439862986655, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 6.5}. Best is trial 45 with value: 0.7320126206223214.


Trial 75 with params: {'learning_rate': 0.0010359165968121112, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.024,1.627544,0.391384,0.067575,0.084549,0.059806
2,1.4178,1.289871,0.509624,0.119966,0.135059,0.114282
3,1.1305,1.024871,0.622365,0.212244,0.210836,0.192186
4,0.8531,0.903152,0.652612,0.333793,0.265872,0.264
5,0.6689,0.790542,0.71769,0.369793,0.336676,0.333697


[I 2025-03-15 15:51:21,467] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 0.0035828357597675275, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8064,1.38321,0.470211,0.1217,0.126143,0.100677
2,1.143,1.06287,0.616865,0.230405,0.236058,0.218606
3,0.7555,0.783148,0.71494,0.367802,0.355838,0.343293
4,0.4822,0.6891,0.745188,0.44567,0.418037,0.415959
5,0.3044,0.654279,0.76352,0.537865,0.490753,0.497548
6,0.2175,0.628217,0.780018,0.633322,0.569738,0.58174
7,0.1562,0.612371,0.790101,0.664374,0.599209,0.61121
8,0.1179,0.603401,0.791934,0.680549,0.639501,0.647163
9,0.0975,0.590425,0.797434,0.701085,0.642429,0.657123
10,0.0899,0.593065,0.791017,0.699652,0.631608,0.65016


[I 2025-03-15 15:53:52,932] Trial 76 finished with value: 0.7154635146418629 and parameters: {'learning_rate': 0.0035828357597675275, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 6.5}. Best is trial 45 with value: 0.7320126206223214.


Trial 77 with params: {'learning_rate': 0.0017972114965369359, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9376,1.513256,0.415215,0.084618,0.094824,0.074392
2,1.3023,1.202771,0.549954,0.15954,0.167542,0.146793
3,0.9764,0.940104,0.662695,0.285181,0.277005,0.262669
4,0.6789,0.800636,0.707608,0.368872,0.342757,0.342628
5,0.4852,0.691508,0.749771,0.417313,0.401091,0.397282
6,0.3406,0.674254,0.759853,0.467242,0.441592,0.441027
7,0.2488,0.646898,0.76352,0.516467,0.485255,0.489141
8,0.1923,0.635308,0.769019,0.561555,0.519679,0.52583
9,0.1516,0.624402,0.775435,0.582726,0.539637,0.54737
10,0.1257,0.61456,0.781852,0.610438,0.552667,0.566185


[I 2025-03-15 15:55:26,845] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 0.0009184416817413834, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0701,1.70878,0.385885,0.065524,0.083488,0.057473
2,1.5174,1.3853,0.474794,0.103273,0.117545,0.091343
3,1.2656,1.17792,0.558203,0.167212,0.166578,0.148407
4,1.0241,0.993996,0.648029,0.25285,0.234912,0.214311
5,0.8257,0.875997,0.678277,0.28226,0.282063,0.267934
6,0.6651,0.795038,0.715857,0.358442,0.336316,0.330309
7,0.5209,0.75803,0.731439,0.407316,0.367127,0.370156
8,0.4188,0.707849,0.753437,0.418464,0.417916,0.407177
9,0.3267,0.677563,0.761687,0.45945,0.433465,0.436129
10,0.2619,0.668576,0.753437,0.48564,0.461028,0.462024


[I 2025-03-15 15:57:12,595] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.00427966697656173, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7305,1.317291,0.499542,0.116636,0.141136,0.112552
2,1.0199,0.908424,0.68286,0.30851,0.313851,0.297167
3,0.6412,0.735005,0.730522,0.410771,0.391717,0.385309
4,0.3927,0.650823,0.765353,0.53898,0.484389,0.497389
5,0.2437,0.621708,0.784601,0.573949,0.543917,0.541429
6,0.1649,0.599521,0.791017,0.657896,0.592957,0.611247
7,0.1233,0.585899,0.799267,0.696303,0.64598,0.655582
8,0.097,0.569173,0.802933,0.727029,0.663701,0.680753
9,0.0835,0.561366,0.807516,0.764828,0.680798,0.704799
10,0.0794,0.567284,0.802933,0.742186,0.6661,0.683651


[I 2025-03-15 15:59:49,302] Trial 79 finished with value: 0.7424099914579557 and parameters: {'learning_rate': 0.00427966697656173, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 80 with params: {'learning_rate': 0.0015785690559416458, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.949,1.526392,0.409716,0.081051,0.093449,0.07387
2,1.3046,1.173148,0.560037,0.139714,0.1712,0.145562
3,0.9802,0.90929,0.678277,0.349275,0.287224,0.285764
4,0.6785,0.783298,0.706691,0.390524,0.338536,0.339779
5,0.4911,0.685118,0.749771,0.421412,0.403839,0.400514
6,0.3534,0.67493,0.754354,0.484622,0.434343,0.442445
7,0.257,0.631975,0.769936,0.507054,0.468238,0.474741
8,0.203,0.626739,0.783685,0.598821,0.540746,0.55286
9,0.1563,0.61592,0.781852,0.662647,0.58756,0.609285
10,0.1295,0.600927,0.791017,0.651763,0.602543,0.611792


[I 2025-03-15 16:00:41,458] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.0029201386483594387, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8655,1.391577,0.469294,0.099596,0.120771,0.095768
2,1.1673,1.009897,0.638863,0.254374,0.247151,0.227621
3,0.7763,0.811397,0.713107,0.354146,0.354336,0.339685
4,0.5075,0.690632,0.752521,0.433271,0.419604,0.413785
5,0.3264,0.645705,0.773602,0.52236,0.496041,0.49299
6,0.2315,0.629101,0.784601,0.585298,0.545809,0.547734
7,0.1606,0.624862,0.785518,0.650888,0.56432,0.589053
8,0.1232,0.612411,0.790101,0.661281,0.6077,0.619081
9,0.1021,0.597016,0.792851,0.664413,0.615814,0.626305
10,0.0914,0.601726,0.791934,0.660321,0.625701,0.629744


[I 2025-03-15 16:01:31,121] Trial 81 pruned. 


Trial 82 with params: {'learning_rate': 0.0033349454725830215, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7877,1.357278,0.476627,0.088719,0.124524,0.094272
2,1.1197,1.027816,0.623281,0.25807,0.249928,0.233491
3,0.7278,0.780638,0.716774,0.33453,0.35117,0.332377
4,0.469,0.672979,0.753437,0.483032,0.448338,0.452481
5,0.3008,0.658548,0.764436,0.515796,0.497111,0.487795
6,0.2104,0.617844,0.783685,0.60905,0.569907,0.5739
7,0.1464,0.590773,0.789184,0.658645,0.582467,0.601367
8,0.1119,0.583727,0.793767,0.681245,0.626883,0.639024
9,0.0971,0.578946,0.80385,0.698674,0.63306,0.644227
10,0.0869,0.577249,0.79835,0.70156,0.639783,0.651951


[I 2025-03-15 16:04:05,512] Trial 82 finished with value: 0.6978760420984877 and parameters: {'learning_rate': 0.0033349454725830215, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 6.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 83 with params: {'learning_rate': 0.0036943205908849964, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9261,1.380589,0.471127,0.09704,0.116694,0.090177
2,1.1535,1.014488,0.624198,0.256988,0.233071,0.217294
3,0.7642,0.797586,0.710357,0.348581,0.347865,0.332744
4,0.4771,0.706484,0.744271,0.433519,0.428881,0.420564
5,0.3043,0.657494,0.762603,0.513907,0.482685,0.481826
6,0.2147,0.637692,0.768103,0.540634,0.537503,0.526816
7,0.151,0.618458,0.786434,0.649596,0.608843,0.615489
8,0.1166,0.603098,0.789184,0.654928,0.631797,0.631163
9,0.0965,0.590896,0.793767,0.663989,0.636084,0.636479
10,0.0867,0.592873,0.794684,0.695161,0.652404,0.659397


[I 2025-03-15 16:05:51,634] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 0.0038631096474548445, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7903,1.340894,0.47846,0.114534,0.128625,0.10586
2,1.088,0.979997,0.660862,0.320277,0.279753,0.273688
3,0.7001,0.756489,0.72319,0.383947,0.370046,0.362044
4,0.4407,0.667741,0.756187,0.502875,0.452635,0.462171
5,0.2779,0.643998,0.771769,0.501647,0.511106,0.499321
6,0.1954,0.596548,0.787351,0.582991,0.551309,0.554911
7,0.1442,0.591026,0.792851,0.652026,0.619442,0.625387
8,0.1102,0.577683,0.800183,0.667665,0.643183,0.641548
9,0.0933,0.569125,0.813932,0.687894,0.652112,0.656328
10,0.0849,0.564971,0.806599,0.72314,0.668988,0.677472


[I 2025-03-15 16:07:48,033] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.0034066097496086275, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9832,1.382398,0.466544,0.100365,0.114782,0.088976
2,1.1635,1.030318,0.618698,0.252575,0.234388,0.214415
3,0.7608,0.783105,0.72594,0.366966,0.366897,0.34871
4,0.4797,0.677627,0.752521,0.439209,0.431832,0.426829
5,0.3139,0.628761,0.773602,0.521766,0.507125,0.499866
6,0.2159,0.610501,0.781852,0.566462,0.537973,0.538945
7,0.1517,0.596595,0.788268,0.626893,0.592223,0.596176
8,0.1182,0.58847,0.790101,0.643439,0.610052,0.614636
9,0.0976,0.572503,0.793767,0.664778,0.62887,0.637153
10,0.0873,0.56999,0.799267,0.7052,0.668033,0.674571


[I 2025-03-15 16:10:25,027] Trial 85 finished with value: 0.7161360138102446 and parameters: {'learning_rate': 0.0034066097496086275, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 86 with params: {'learning_rate': 0.00024696163656226093, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2586,2.04108,0.176902,0.003538,0.02,0.006012
2,1.8805,1.719975,0.390467,0.040206,0.081004,0.051907
3,1.6213,1.507819,0.43538,0.065987,0.096395,0.067105
4,1.4469,1.378782,0.476627,0.123623,0.117601,0.093441
5,1.3278,1.286973,0.508708,0.112932,0.132462,0.10858


[I 2025-03-15 16:10:50,702] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.0024999237575739435, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8562,1.407444,0.466544,0.099932,0.115708,0.093788
2,1.209,1.086267,0.586618,0.210716,0.205676,0.191139
3,0.8354,0.817841,0.706691,0.344514,0.334685,0.328361
4,0.5483,0.725791,0.736939,0.397548,0.395813,0.387056
5,0.3765,0.659061,0.766269,0.529216,0.481012,0.488572
6,0.2641,0.624294,0.781852,0.542474,0.499129,0.502246
7,0.1891,0.615197,0.777269,0.578967,0.525173,0.536697
8,0.1505,0.604566,0.784601,0.594231,0.582226,0.575506
9,0.1192,0.581223,0.796517,0.677979,0.618172,0.633026
10,0.1014,0.591777,0.791017,0.675871,0.629662,0.637166


[I 2025-03-15 16:13:31,441] Trial 87 finished with value: 0.6725783904281619 and parameters: {'learning_rate': 0.0024999237575739435, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 88 with params: {'learning_rate': 0.004040536817439322, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8236,1.363505,0.477544,0.099015,0.125078,0.098437
2,1.145,1.044063,0.623281,0.227249,0.238625,0.215738
3,0.7672,0.785913,0.71494,0.347555,0.350027,0.337822
4,0.4739,0.687034,0.744271,0.385372,0.39903,0.38412
5,0.297,0.653854,0.762603,0.526717,0.494222,0.492202
6,0.2004,0.616729,0.782768,0.639961,0.583797,0.591336
7,0.1399,0.612031,0.778185,0.639719,0.592255,0.599921
8,0.1149,0.597166,0.783685,0.68437,0.623373,0.637693
9,0.0962,0.588562,0.791934,0.722297,0.645652,0.6645
10,0.0871,0.590476,0.796517,0.735663,0.665981,0.681603


[I 2025-03-15 16:15:10,532] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 0.004031022345931651, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7479,1.330114,0.496792,0.135347,0.137772,0.113953
2,1.0477,0.908244,0.679193,0.320609,0.300215,0.289036
3,0.6481,0.714948,0.736939,0.410681,0.391361,0.381876
4,0.4095,0.64808,0.769019,0.531892,0.505529,0.504334
5,0.2495,0.615564,0.781852,0.576351,0.532258,0.535214
6,0.1804,0.59021,0.791934,0.614327,0.577725,0.58186
7,0.123,0.59131,0.790101,0.669975,0.619115,0.631545
8,0.0981,0.574727,0.80385,0.723284,0.664462,0.679409
9,0.0848,0.567229,0.807516,0.753297,0.680355,0.699449
10,0.0803,0.565041,0.804766,0.730094,0.658914,0.67508


[I 2025-03-15 16:17:40,583] Trial 89 finished with value: 0.7213461979619734 and parameters: {'learning_rate': 0.004031022345931651, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 6.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 90 with params: {'learning_rate': 0.0019976534404814463, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9286,1.478066,0.428048,0.114267,0.101148,0.082733
2,1.2603,1.136293,0.570119,0.200756,0.180625,0.159831
3,0.9088,0.865595,0.692942,0.350671,0.316667,0.309511
4,0.6108,0.752763,0.71494,0.35937,0.35557,0.346724
5,0.4202,0.669513,0.754354,0.463268,0.434475,0.435958
6,0.2986,0.664406,0.751604,0.505169,0.457027,0.464356
7,0.2152,0.626777,0.784601,0.60732,0.527185,0.541431
8,0.1678,0.607192,0.774519,0.596358,0.551486,0.55783
9,0.1321,0.619055,0.783685,0.628811,0.584201,0.593128
10,0.111,0.602052,0.790101,0.64537,0.609197,0.614583


[I 2025-03-15 16:18:30,940] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.0046173057607050766, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7583,1.302818,0.519707,0.123322,0.151699,0.122656
2,1.043,0.932903,0.668194,0.307462,0.306,0.290477
3,0.6336,0.71022,0.747938,0.419141,0.403589,0.393836
4,0.3802,0.643618,0.767186,0.51633,0.499536,0.494544
5,0.2296,0.59284,0.793767,0.609787,0.569548,0.573536
6,0.151,0.574246,0.8011,0.657346,0.625978,0.629566
7,0.1142,0.570423,0.802933,0.740665,0.67128,0.691153
8,0.0932,0.553984,0.806599,0.73005,0.671239,0.683681
9,0.0829,0.550557,0.811182,0.745305,0.674504,0.692398
10,0.0791,0.552833,0.805683,0.761735,0.688792,0.710225


[I 2025-03-15 16:21:05,734] Trial 91 finished with value: 0.7168629658444057 and parameters: {'learning_rate': 0.0046173057607050766, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 7.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 92 with params: {'learning_rate': 0.003831341644287877, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7787,1.332111,0.503208,0.110815,0.138366,0.111162
2,1.093,0.957198,0.654445,0.278815,0.272961,0.253384
3,0.6831,0.747018,0.737855,0.374571,0.380545,0.369129
4,0.4234,0.64636,0.774519,0.520074,0.487854,0.490691
5,0.2667,0.634924,0.783685,0.536736,0.514686,0.510366
6,0.1933,0.590613,0.791934,0.612322,0.538244,0.557344
7,0.1415,0.587973,0.802016,0.672616,0.636621,0.643799
8,0.1079,0.574777,0.808433,0.6956,0.65717,0.663733
9,0.0907,0.56668,0.802933,0.669583,0.63063,0.639107
10,0.0829,0.562049,0.808433,0.716993,0.658306,0.675155


[I 2025-03-15 16:23:41,842] Trial 92 finished with value: 0.7002475050059774 and parameters: {'learning_rate': 0.003831341644287877, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 93 with params: {'learning_rate': 0.0018400474384852972, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9505,1.524551,0.401467,0.07741,0.091225,0.071773
2,1.2818,1.142864,0.565536,0.132078,0.176177,0.146489
3,0.9395,0.878917,0.692942,0.326698,0.305146,0.297887
4,0.6374,0.777595,0.716774,0.392864,0.360848,0.360513
5,0.4494,0.670968,0.767186,0.462305,0.454938,0.449997
6,0.3128,0.657871,0.758937,0.487215,0.462763,0.46612
7,0.2242,0.624392,0.778185,0.553435,0.511515,0.518333
8,0.1749,0.614153,0.775435,0.575932,0.545059,0.545416
9,0.1363,0.605142,0.786434,0.647757,0.586737,0.602061
10,0.1143,0.604058,0.792851,0.663181,0.619698,0.629557


[I 2025-03-15 16:24:33,651] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.004794522211574071, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7682,1.316478,0.498625,0.111862,0.14006,0.114457
2,1.069,0.977722,0.649863,0.294088,0.281285,0.271751
3,0.6675,0.74762,0.722273,0.385857,0.375647,0.366093
4,0.3972,0.642505,0.772686,0.493128,0.489793,0.480857
5,0.2474,0.610444,0.782768,0.534739,0.531727,0.517312
6,0.1709,0.585167,0.8011,0.682232,0.632888,0.644878
7,0.1292,0.577429,0.807516,0.725422,0.674466,0.684967
8,0.1024,0.567181,0.802933,0.743113,0.688017,0.698262
9,0.0879,0.559731,0.808433,0.751635,0.676769,0.6977
10,0.0834,0.559574,0.802016,0.748435,0.673673,0.693071


[I 2025-03-15 16:27:03,710] Trial 94 finished with value: 0.729398736483693 and parameters: {'learning_rate': 0.004794522211574071, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 95 with params: {'learning_rate': 0.004756025487608138, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.762,1.310553,0.511457,0.114239,0.149187,0.12122
2,1.0397,0.96782,0.659028,0.307851,0.2974,0.283792
3,0.6314,0.726873,0.749771,0.398159,0.399165,0.388972
4,0.377,0.652759,0.767186,0.515014,0.490173,0.488515
5,0.2315,0.590059,0.792851,0.579849,0.548815,0.547763
6,0.1533,0.560553,0.802016,0.691142,0.643051,0.648579
7,0.1119,0.554511,0.811182,0.716923,0.666366,0.675148
8,0.0929,0.553929,0.808433,0.754777,0.672159,0.692142
9,0.0828,0.543979,0.815765,0.750477,0.680728,0.699551
10,0.078,0.543628,0.813016,0.760714,0.675244,0.698456


[I 2025-03-15 16:29:38,927] Trial 95 finished with value: 0.7210372604703523 and parameters: {'learning_rate': 0.004756025487608138, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 5.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 96 with params: {'learning_rate': 0.003916333903012831, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7434,1.318738,0.489459,0.126394,0.135453,0.108916
2,1.0452,0.90965,0.670944,0.293414,0.289987,0.273311
3,0.6582,0.733853,0.736022,0.414313,0.385086,0.377214
4,0.4105,0.653938,0.771769,0.535812,0.503988,0.504496
5,0.2534,0.619505,0.779102,0.556328,0.526664,0.523433
6,0.1717,0.595053,0.783685,0.643552,0.590311,0.605614
7,0.125,0.582321,0.7956,0.681584,0.623963,0.639646
8,0.0995,0.571841,0.792851,0.695143,0.653126,0.662411
9,0.0878,0.565711,0.792851,0.686157,0.644027,0.654414
10,0.0819,0.565988,0.800183,0.748017,0.681195,0.700923


[I 2025-03-15 16:32:09,909] Trial 96 finished with value: 0.694047731504514 and parameters: {'learning_rate': 0.003916333903012831, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 97 with params: {'learning_rate': 0.004015748553559552, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7756,1.330914,0.490376,0.110994,0.126204,0.099788
2,1.1074,1.014368,0.629698,0.249762,0.26093,0.236821
3,0.7274,0.762703,0.725023,0.359385,0.370583,0.356504
4,0.4457,0.665662,0.758937,0.465777,0.443301,0.438871
5,0.2813,0.619912,0.778185,0.527899,0.498202,0.499734
6,0.1931,0.588591,0.791017,0.647789,0.579107,0.593008
7,0.1399,0.588621,0.797434,0.70747,0.627372,0.648904
8,0.109,0.57778,0.805683,0.717372,0.655066,0.670172
9,0.0922,0.56608,0.810266,0.716672,0.662533,0.673395
10,0.0841,0.5679,0.807516,0.700218,0.659185,0.665042


[I 2025-03-15 16:34:42,850] Trial 97 finished with value: 0.7023655838138257 and parameters: {'learning_rate': 0.004015748553559552, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 98 with params: {'learning_rate': 0.0035619732926833146, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9605,1.368062,0.480293,0.097196,0.119626,0.092203
2,1.1462,0.996839,0.632447,0.245817,0.243832,0.226725
3,0.752,0.781647,0.709441,0.331448,0.344862,0.331956
4,0.4749,0.680214,0.759853,0.450313,0.434322,0.431713
5,0.3104,0.650711,0.773602,0.504416,0.502096,0.490081
6,0.2168,0.60632,0.786434,0.550526,0.535783,0.531109
7,0.1525,0.600367,0.794684,0.635256,0.610021,0.604463
8,0.1175,0.588929,0.79835,0.642106,0.621453,0.61939
9,0.0967,0.576342,0.800183,0.678197,0.649371,0.647222
10,0.0882,0.584074,0.800183,0.694645,0.651791,0.655992


[I 2025-03-15 16:36:25,422] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.0021434225857968666, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8757,1.418422,0.456462,0.103586,0.110933,0.090399
2,1.2115,1.083898,0.592117,0.191829,0.208028,0.183722
3,0.845,0.822661,0.706691,0.345048,0.334259,0.326322
4,0.563,0.730378,0.730522,0.379756,0.381803,0.37374
5,0.3887,0.654303,0.761687,0.471938,0.437863,0.44233
6,0.2733,0.645455,0.770852,0.515309,0.479382,0.482836
7,0.1954,0.632031,0.765353,0.584007,0.503919,0.521866
8,0.1519,0.614887,0.777269,0.604852,0.556386,0.567064
9,0.1213,0.605071,0.785518,0.674951,0.596005,0.613497
10,0.1029,0.59802,0.788268,0.666143,0.628475,0.632452


[I 2025-03-15 16:38:04,542] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.003091581936339246, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8969,1.416253,0.457379,0.09424,0.110758,0.087971
2,1.2033,1.087934,0.600367,0.218998,0.213939,0.195291
3,0.8324,0.841706,0.708524,0.323306,0.335076,0.323134
4,0.5424,0.721394,0.746104,0.404233,0.40337,0.38865
5,0.3578,0.661495,0.759853,0.487894,0.455022,0.454293
6,0.2508,0.627107,0.783685,0.551969,0.511461,0.51439
7,0.1806,0.620717,0.782768,0.628424,0.558041,0.568669
8,0.138,0.607973,0.780935,0.613552,0.566311,0.576119
9,0.1113,0.601278,0.7956,0.659959,0.61069,0.620445
10,0.0962,0.604123,0.789184,0.66998,0.615866,0.629793


[I 2025-03-15 16:38:55,478] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 0.004072412448526985, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7429,1.317513,0.504125,0.132194,0.140739,0.117344
2,1.0363,0.922278,0.672777,0.335227,0.303236,0.293672
3,0.6427,0.737177,0.735105,0.399561,0.398015,0.385375
4,0.4059,0.644349,0.772686,0.52387,0.502893,0.501761
5,0.2433,0.627718,0.774519,0.569658,0.528587,0.527347
6,0.1765,0.593078,0.791017,0.629456,0.595571,0.597437
7,0.1204,0.587726,0.796517,0.678033,0.626826,0.637423
8,0.0963,0.584916,0.8011,0.730764,0.659013,0.675031
9,0.0835,0.571516,0.802016,0.756752,0.669899,0.69093
10,0.0792,0.577869,0.799267,0.759117,0.685763,0.706017


[I 2025-03-15 16:41:31,377] Trial 101 finished with value: 0.720708938951212 and parameters: {'learning_rate': 0.004072412448526985, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 6.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 102 with params: {'learning_rate': 0.002974123900951366, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7939,1.363765,0.47846,0.091978,0.123633,0.096108
2,1.1378,1.018665,0.615949,0.263232,0.242234,0.228191
3,0.7428,0.769847,0.718607,0.36662,0.358351,0.34717
4,0.472,0.676514,0.747938,0.452358,0.417617,0.4192
5,0.3064,0.617683,0.782768,0.538938,0.515639,0.512941
6,0.2156,0.607689,0.786434,0.598954,0.56286,0.565715
7,0.1501,0.58609,0.791934,0.657673,0.579374,0.593048
8,0.1217,0.573047,0.807516,0.727091,0.650797,0.668733
9,0.0986,0.557745,0.813932,0.731986,0.662113,0.679737
10,0.0872,0.566876,0.804766,0.761172,0.666197,0.694351


[I 2025-03-15 16:44:02,898] Trial 102 finished with value: 0.7192544457853266 and parameters: {'learning_rate': 0.002974123900951366, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 7.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 103 with params: {'learning_rate': 0.0020317522690098543, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9017,1.472691,0.43538,0.133685,0.103269,0.08494
2,1.2504,1.140537,0.568286,0.186522,0.184054,0.163264
3,0.8956,0.850353,0.694775,0.322315,0.31441,0.307354
4,0.6076,0.748655,0.725023,0.377164,0.365573,0.360585
5,0.4205,0.666497,0.769019,0.505098,0.465056,0.467897
6,0.2945,0.660994,0.767186,0.511348,0.473312,0.477911
7,0.2196,0.624554,0.780018,0.536403,0.504501,0.503789
8,0.1723,0.608204,0.780935,0.588879,0.567548,0.566211
9,0.1334,0.608288,0.784601,0.626142,0.567752,0.583632
10,0.1107,0.602047,0.782768,0.636265,0.591224,0.601775


[I 2025-03-15 16:44:56,790] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.00437038253082406, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7416,1.31616,0.513291,0.129081,0.147055,0.123439
2,1.0361,0.946612,0.664528,0.309353,0.299999,0.285919
3,0.6565,0.730143,0.738772,0.39287,0.392694,0.38175
4,0.4082,0.653559,0.76352,0.50971,0.471244,0.473052
5,0.2479,0.624656,0.781852,0.617703,0.552033,0.558829
6,0.1657,0.586905,0.8011,0.620449,0.600398,0.596694
7,0.1186,0.584392,0.791017,0.696433,0.630281,0.643896
8,0.0954,0.577258,0.799267,0.735234,0.670688,0.687297
9,0.0838,0.572222,0.807516,0.761057,0.685073,0.708864
10,0.0795,0.573705,0.802933,0.730636,0.672662,0.685458


[I 2025-03-15 16:47:40,616] Trial 104 finished with value: 0.7250945646978804 and parameters: {'learning_rate': 0.00437038253082406, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 5.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 105 with params: {'learning_rate': 0.004299610816003975, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7397,1.352949,0.511457,0.14119,0.144302,0.123825
2,1.0472,0.932067,0.673694,0.303029,0.309869,0.294753
3,0.6648,0.721703,0.735105,0.396916,0.381369,0.374735
4,0.4077,0.652209,0.768103,0.521316,0.488226,0.493022
5,0.2527,0.61969,0.780018,0.599006,0.537842,0.548143
6,0.1715,0.611463,0.782768,0.627002,0.587075,0.592417
7,0.1286,0.594786,0.791934,0.687887,0.643427,0.646621
8,0.1011,0.588992,0.800183,0.682947,0.657566,0.655186
9,0.086,0.580256,0.800183,0.692546,0.65006,0.655123
10,0.0809,0.582114,0.79835,0.709008,0.671621,0.673508


[I 2025-03-15 16:50:18,264] Trial 105 finished with value: 0.7006659265685178 and parameters: {'learning_rate': 0.004299610816003975, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 106 with params: {'learning_rate': 0.004967055493307966, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9694,1.355722,0.48396,0.119807,0.130381,0.110792
2,1.0933,1.022526,0.633364,0.307746,0.267914,0.261187
3,0.6915,0.74944,0.730522,0.392249,0.381201,0.375762
4,0.4085,0.651951,0.764436,0.499561,0.470979,0.476293
5,0.2585,0.633347,0.785518,0.561519,0.524763,0.526518
6,0.1863,0.607381,0.793767,0.625882,0.596483,0.597989
7,0.1328,0.587934,0.800183,0.651885,0.612726,0.614128
8,0.1038,0.579447,0.802933,0.700522,0.654008,0.661202
9,0.0902,0.573306,0.804766,0.718211,0.667284,0.677613
10,0.0845,0.570861,0.805683,0.739089,0.664002,0.682264


[I 2025-03-15 16:52:57,005] Trial 106 finished with value: 0.711656832116688 and parameters: {'learning_rate': 0.004967055493307966, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 4.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 107 with params: {'learning_rate': 0.00042255438774755895, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2112,1.939461,0.254812,0.044064,0.045071,0.036395
2,1.7386,1.571577,0.426214,0.054019,0.092671,0.061453
3,1.4653,1.367562,0.48121,0.100855,0.122707,0.096334
4,1.2836,1.216164,0.543538,0.129472,0.158861,0.134323
5,1.1309,1.10355,0.578368,0.154439,0.177036,0.149817
6,0.9782,1.022666,0.619615,0.193353,0.215127,0.18992
7,0.854,0.936986,0.651696,0.275584,0.249469,0.238465
8,0.7536,0.886914,0.692942,0.359557,0.298425,0.299337
9,0.6632,0.83975,0.699358,0.321891,0.313488,0.307675
10,0.5736,0.807862,0.706691,0.384317,0.331263,0.339837


[I 2025-03-15 16:54:54,539] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 0.0049204390526703815, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9636,1.350412,0.485793,0.115311,0.132352,0.109909
2,1.0792,0.960502,0.662695,0.316194,0.29192,0.286021
3,0.6618,0.735442,0.744271,0.406609,0.397045,0.385946
4,0.3958,0.651254,0.769019,0.535188,0.502641,0.505192
5,0.2533,0.621108,0.787351,0.54786,0.53916,0.534074
6,0.1752,0.596268,0.8011,0.625334,0.590605,0.596153
7,0.1243,0.584663,0.804766,0.687933,0.64554,0.648927
8,0.1026,0.572933,0.813932,0.724733,0.659903,0.673119
9,0.0892,0.565565,0.817599,0.738594,0.666372,0.687882
10,0.0847,0.565495,0.820348,0.770951,0.714919,0.725615


[I 2025-03-15 16:57:29,969] Trial 108 finished with value: 0.7353084909693063 and parameters: {'learning_rate': 0.0049204390526703815, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 6.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 109 with params: {'learning_rate': 0.0023543513023025157, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8811,1.408278,0.459212,0.096401,0.111661,0.089059
2,1.2098,1.122343,0.594867,0.204003,0.215501,0.191601
3,0.8424,0.834504,0.705775,0.345692,0.331635,0.324151
4,0.5628,0.732518,0.734189,0.392468,0.397886,0.385177
5,0.3796,0.647749,0.768103,0.479048,0.468401,0.459004
6,0.2713,0.64578,0.774519,0.528744,0.496776,0.498291
7,0.1973,0.626005,0.769936,0.561494,0.490608,0.505796
8,0.152,0.619325,0.778185,0.559558,0.554923,0.545918
9,0.1216,0.595023,0.789184,0.652795,0.614786,0.621274
10,0.103,0.607676,0.785518,0.653193,0.620706,0.625934


[I 2025-03-15 16:58:28,227] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.002062317199659096, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9034,1.479528,0.428048,0.114993,0.101969,0.084948
2,1.2582,1.127781,0.573786,0.191227,0.181504,0.161658
3,0.9039,0.861562,0.690192,0.330536,0.31997,0.309307
4,0.6094,0.752112,0.722273,0.374998,0.362609,0.358084
5,0.4209,0.667596,0.75802,0.460953,0.429067,0.430937
6,0.293,0.650559,0.769019,0.518157,0.475064,0.47821
7,0.213,0.62541,0.780018,0.574453,0.510332,0.522319
8,0.1676,0.619319,0.783685,0.593016,0.566841,0.568854
9,0.1302,0.608189,0.791017,0.641709,0.574891,0.59454
10,0.11,0.601466,0.7956,0.699796,0.645515,0.657495


[I 2025-03-15 17:01:03,470] Trial 110 finished with value: 0.6882977817795447 and parameters: {'learning_rate': 0.002062317199659096, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 7.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 111 with params: {'learning_rate': 0.0020948770962470205, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8789,1.450981,0.448213,0.091157,0.105678,0.081788
2,1.2417,1.134064,0.573786,0.178319,0.184946,0.159096
3,0.8898,0.852827,0.692942,0.337743,0.314179,0.304979
4,0.5968,0.749028,0.727773,0.379262,0.365953,0.362807
5,0.412,0.663945,0.766269,0.478511,0.460691,0.457206
6,0.2894,0.641527,0.762603,0.49045,0.451104,0.454568
7,0.2097,0.628154,0.776352,0.589693,0.521732,0.539381
8,0.1613,0.616621,0.785518,0.5727,0.556805,0.554047
9,0.1264,0.595751,0.788268,0.666529,0.593733,0.609159
10,0.1056,0.592645,0.786434,0.686466,0.6155,0.633846


[I 2025-03-15 17:01:54,701] Trial 111 pruned. 


Trial 112 with params: {'learning_rate': 0.0036719686740773663, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8345,1.344175,0.493126,0.102158,0.132737,0.103354
2,1.1102,0.961457,0.647113,0.263248,0.264406,0.24431
3,0.7205,0.754643,0.730522,0.362166,0.365164,0.350794
4,0.4542,0.678641,0.75802,0.420162,0.424417,0.415971
5,0.2875,0.635953,0.779102,0.54866,0.510194,0.510879
6,0.2021,0.622681,0.780018,0.606086,0.55895,0.567631
7,0.1448,0.609691,0.791934,0.681122,0.636043,0.645837
8,0.1106,0.590054,0.797434,0.669873,0.651392,0.647455
9,0.0935,0.576324,0.806599,0.688171,0.668597,0.665076
10,0.0848,0.580346,0.80385,0.690024,0.664258,0.665067


[I 2025-03-15 17:04:30,902] Trial 112 finished with value: 0.7116720248876972 and parameters: {'learning_rate': 0.0036719686740773663, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 113 with params: {'learning_rate': 0.004835876788416951, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7621,1.314895,0.505041,0.111575,0.147051,0.119716
2,1.049,0.970222,0.662695,0.319531,0.298715,0.288095
3,0.6403,0.731068,0.744271,0.425529,0.402081,0.396506
4,0.3886,0.636643,0.767186,0.505227,0.487029,0.487739
5,0.233,0.604196,0.788268,0.649745,0.58067,0.593136
6,0.1558,0.576274,0.8011,0.676814,0.635112,0.6418
7,0.1184,0.563025,0.807516,0.688082,0.631036,0.646004
8,0.0954,0.560464,0.808433,0.76852,0.682951,0.706539
9,0.0833,0.553006,0.816682,0.772341,0.688004,0.7121
10,0.0789,0.555706,0.810266,0.76584,0.688469,0.707884


[I 2025-03-15 17:07:12,463] Trial 113 finished with value: 0.7030470005390426 and parameters: {'learning_rate': 0.004835876788416951, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 114 with params: {'learning_rate': 0.0008469561848146573, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0705,1.69038,0.367553,0.073922,0.076296,0.05144
2,1.4804,1.362026,0.474794,0.109533,0.117665,0.095232
3,1.2081,1.121034,0.575619,0.154394,0.177124,0.154727
4,0.9526,0.953763,0.648946,0.271078,0.24516,0.232827
5,0.7641,0.857736,0.698442,0.342059,0.311188,0.307577


[I 2025-03-15 17:07:39,256] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.00014963473671761275, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3183,2.13098,0.176902,0.003538,0.02,0.006012
2,2.0896,2.036236,0.203483,0.011737,0.027945,0.01432
3,1.9822,1.88217,0.290559,0.01842,0.050542,0.026653
4,1.8307,1.765115,0.36297,0.040547,0.07183,0.045735
5,1.7281,1.652626,0.387718,0.044607,0.080117,0.053904


[I 2025-03-15 17:08:04,413] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.00425364252462216, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7326,1.294374,0.503208,0.117377,0.140295,0.115254
2,1.0262,0.88694,0.689276,0.29731,0.314009,0.298127
3,0.6357,0.730076,0.741522,0.398416,0.40177,0.385521
4,0.3975,0.659171,0.76077,0.545292,0.496931,0.50636
5,0.2495,0.608184,0.773602,0.542649,0.519626,0.51495
6,0.1683,0.59192,0.789184,0.650141,0.582589,0.59333
7,0.1213,0.57765,0.802016,0.695039,0.638898,0.653704
8,0.0966,0.57058,0.802933,0.73277,0.653809,0.676119
9,0.0839,0.56879,0.802016,0.775015,0.664257,0.697259
10,0.0807,0.571381,0.8011,0.742542,0.65807,0.682264


[I 2025-03-15 17:10:48,100] Trial 116 finished with value: 0.7220803864748833 and parameters: {'learning_rate': 0.00425364252462216, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 5.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 117 with params: {'learning_rate': 0.004374181490113054, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7411,1.321119,0.506874,0.126845,0.144613,0.12045
2,1.0329,0.951924,0.665445,0.286592,0.295887,0.27594
3,0.6492,0.723357,0.733272,0.414455,0.386056,0.380779
4,0.4013,0.646486,0.759853,0.504618,0.472473,0.469389
5,0.2425,0.604931,0.787351,0.571148,0.538775,0.541633
6,0.159,0.573967,0.804766,0.651045,0.60268,0.613363
7,0.1178,0.57279,0.802016,0.707997,0.651161,0.663614
8,0.0952,0.562063,0.804766,0.728729,0.677933,0.688351
9,0.0832,0.557355,0.805683,0.726253,0.668345,0.681348
10,0.0797,0.556633,0.806599,0.73015,0.677625,0.686695


[I 2025-03-15 17:13:22,507] Trial 117 finished with value: 0.7275893860777765 and parameters: {'learning_rate': 0.004374181490113054, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 4.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 118 with params: {'learning_rate': 0.002807199074165172, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8601,1.398084,0.469294,0.100275,0.120499,0.095998
2,1.1732,1.010944,0.635197,0.230175,0.246966,0.225581
3,0.782,0.807547,0.71494,0.328887,0.352146,0.333545
4,0.5149,0.69922,0.740605,0.408236,0.399993,0.391878
5,0.3372,0.65056,0.771769,0.505255,0.494711,0.488227
6,0.2387,0.631136,0.771769,0.53199,0.51225,0.506567
7,0.1656,0.615823,0.790101,0.6443,0.58978,0.603938
8,0.1251,0.605242,0.787351,0.663232,0.627003,0.629183
9,0.1055,0.584245,0.79835,0.687952,0.647098,0.654742
10,0.0921,0.596392,0.7956,0.735923,0.683825,0.693142


[I 2025-03-15 17:16:01,570] Trial 118 finished with value: 0.7176121112690046 and parameters: {'learning_rate': 0.002807199074165172, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 4.0}. Best is trial 79 with value: 0.7424099914579557.


Trial 119 with params: {'learning_rate': 0.004906394310012523, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9591,1.33853,0.486709,0.111409,0.13352,0.105298
2,1.0697,0.948489,0.658112,0.298892,0.280265,0.271163
3,0.651,0.717742,0.750687,0.40287,0.400219,0.389411
4,0.3953,0.643541,0.780935,0.529024,0.498829,0.500399
5,0.2486,0.607817,0.790101,0.538795,0.542537,0.528493
6,0.1705,0.581978,0.810266,0.629994,0.612895,0.608062
7,0.1252,0.576429,0.805683,0.684262,0.643015,0.647052
8,0.1017,0.578843,0.815765,0.699314,0.676545,0.673433
9,0.0879,0.560467,0.821265,0.719234,0.680803,0.687259
10,0.0821,0.560102,0.817599,0.722043,0.681361,0.687108


[I 2025-03-15 17:18:46,408] Trial 119 finished with value: 0.7105926283602771 and parameters: {'learning_rate': 0.004906394310012523, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 4.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 120 with params: {'learning_rate': 0.0031602431037892243, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8857,1.387483,0.476627,0.098805,0.123665,0.095712
2,1.1574,0.999774,0.638863,0.248448,0.252933,0.232449
3,0.7602,0.782973,0.72044,0.345717,0.353092,0.338262
4,0.4858,0.697159,0.750687,0.441701,0.419066,0.416662
5,0.3145,0.649712,0.769019,0.501941,0.487795,0.478206
6,0.218,0.611421,0.789184,0.559455,0.53988,0.537541
7,0.1528,0.600038,0.790101,0.59987,0.555124,0.564362
8,0.1213,0.586776,0.80385,0.692397,0.659872,0.660514
9,0.1013,0.574688,0.812099,0.683997,0.666308,0.665793
10,0.0877,0.577564,0.809349,0.714943,0.675964,0.684018


[I 2025-03-15 17:20:36,925] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 0.0005694536738201405, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1473,1.86349,0.331806,0.03299,0.070509,0.042745
2,1.6364,1.482149,0.451879,0.084826,0.104683,0.077076
3,1.3711,1.276289,0.514207,0.12256,0.139308,0.114598
4,1.1736,1.121038,0.590284,0.179077,0.187112,0.164632
5,1.0021,1.003317,0.626031,0.239393,0.213967,0.195481
6,0.8348,0.915669,0.660862,0.262437,0.261659,0.250082
7,0.7024,0.846344,0.679193,0.347339,0.293952,0.294746
8,0.5974,0.79914,0.724106,0.364069,0.346225,0.344513
9,0.5065,0.757155,0.739688,0.410908,0.375365,0.379486
10,0.4209,0.728464,0.747938,0.45846,0.406231,0.415083


[I 2025-03-15 17:22:26,631] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 0.004275505205052455, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7296,1.324486,0.494959,0.119671,0.140318,0.113611
2,1.023,0.910524,0.67736,0.298021,0.304238,0.288663
3,0.6424,0.728886,0.737855,0.414686,0.401511,0.395426
4,0.3942,0.653248,0.769019,0.51237,0.486502,0.489451
5,0.2427,0.625598,0.782768,0.549413,0.532601,0.529488
6,0.1662,0.587348,0.794684,0.671892,0.594986,0.617123
7,0.1223,0.582243,0.8011,0.67535,0.625147,0.637295
8,0.0956,0.568749,0.805683,0.706499,0.661724,0.670599
9,0.0837,0.570776,0.804766,0.760205,0.669352,0.691783
10,0.0788,0.56922,0.802016,0.745483,0.670414,0.691991


[I 2025-03-15 17:25:01,527] Trial 122 finished with value: 0.7324932077809855 and parameters: {'learning_rate': 0.004275505205052455, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 4.5}. Best is trial 79 with value: 0.7424099914579557.


Trial 123 with params: {'learning_rate': 0.001007837258506956, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0509,1.655172,0.380385,0.057687,0.080881,0.056303
2,1.454,1.296074,0.512374,0.110082,0.13767,0.112737
3,1.1732,1.085483,0.583868,0.191553,0.183108,0.158277
4,0.9121,0.925941,0.667278,0.31893,0.268266,0.263925
5,0.7097,0.801876,0.715857,0.342318,0.329745,0.321586


[I 2025-03-15 17:25:27,467] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.0013184757444496644, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0089,1.587261,0.407883,0.062318,0.090738,0.06579
2,1.3847,1.251932,0.51604,0.113205,0.13973,0.119067
3,1.083,1.001277,0.63428,0.253969,0.224862,0.212713
4,0.7986,0.864665,0.690192,0.360306,0.30971,0.3092
5,0.6038,0.750665,0.727773,0.375372,0.356475,0.35225
6,0.4397,0.696917,0.751604,0.486073,0.416767,0.429673
7,0.3276,0.656886,0.76077,0.487664,0.439879,0.447046
8,0.2499,0.651785,0.766269,0.492614,0.47173,0.472375
9,0.1972,0.636928,0.774519,0.552558,0.506917,0.513863
10,0.1628,0.623197,0.782768,0.587632,0.557633,0.561191


[I 2025-03-15 17:26:30,141] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 0.0019491399054133297, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9419,1.486933,0.419798,0.094618,0.097859,0.078225
2,1.2653,1.141269,0.567369,0.165626,0.180685,0.157868
3,0.9148,0.864764,0.697525,0.348003,0.320987,0.316487
4,0.6109,0.747023,0.727773,0.376459,0.366955,0.360713
5,0.4237,0.672444,0.76352,0.459014,0.429483,0.430986
6,0.2999,0.653018,0.762603,0.484997,0.466824,0.463965
7,0.2153,0.632736,0.771769,0.595008,0.508323,0.526359
8,0.1687,0.610145,0.781852,0.586567,0.546481,0.551909
9,0.1314,0.605098,0.786434,0.626335,0.566584,0.580701
10,0.1118,0.598204,0.792851,0.644995,0.601338,0.608895


[I 2025-03-15 17:27:19,189] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.004849961480952609, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7635,1.30453,0.513291,0.114579,0.150862,0.12333
2,1.04,0.960396,0.662695,0.322456,0.305102,0.293303
3,0.6296,0.718547,0.745188,0.422736,0.407704,0.399885
4,0.377,0.634844,0.775435,0.536798,0.515722,0.512276
5,0.2286,0.598484,0.781852,0.623979,0.564911,0.57442
6,0.1505,0.580513,0.810266,0.69997,0.657849,0.665089
7,0.1085,0.564287,0.812099,0.715535,0.679755,0.687231
8,0.091,0.556537,0.814849,0.711887,0.684142,0.688976
9,0.0822,0.560902,0.815765,0.743853,0.690694,0.703204
10,0.0789,0.556055,0.814849,0.759225,0.700614,0.715429


[I 2025-03-15 17:29:53,517] Trial 126 finished with value: 0.7476729369943443 and parameters: {'learning_rate': 0.004849961480952609, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 5.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 127 with params: {'learning_rate': 0.0032736811386574775, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8979,1.384645,0.469294,0.077457,0.121042,0.09207
2,1.157,1.005277,0.63703,0.245571,0.254982,0.230203
3,0.761,0.80739,0.710357,0.328742,0.351401,0.332021
4,0.4825,0.68588,0.756187,0.450924,0.449711,0.443516
5,0.3078,0.642227,0.780935,0.534476,0.507565,0.505427
6,0.2167,0.620187,0.779102,0.584732,0.535043,0.540785
7,0.1565,0.603357,0.781852,0.602129,0.550344,0.560786
8,0.1185,0.593454,0.796517,0.674311,0.625801,0.628974
9,0.0973,0.576932,0.805683,0.679187,0.647539,0.651438
10,0.0881,0.583475,0.800183,0.677374,0.647514,0.648727


[I 2025-03-15 17:32:58,755] Trial 127 finished with value: 0.6755324651297934 and parameters: {'learning_rate': 0.0032736811386574775, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 5.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 128 with params: {'learning_rate': 0.004200396026318684, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.738,1.286914,0.51604,0.136261,0.144575,0.122331
2,1.0462,0.927193,0.671861,0.289476,0.295355,0.283958
3,0.6598,0.725337,0.737855,0.395137,0.393419,0.380126
4,0.4102,0.666967,0.758937,0.521349,0.471736,0.478995
5,0.2575,0.632057,0.777269,0.55621,0.524065,0.517733
6,0.1782,0.602199,0.778185,0.583478,0.558626,0.557092
7,0.1324,0.602905,0.783685,0.642768,0.588402,0.596499
8,0.1042,0.588657,0.797434,0.697311,0.646952,0.655481
9,0.0889,0.578053,0.799267,0.703906,0.65795,0.663771
10,0.0833,0.58301,0.792851,0.697642,0.644891,0.654246


[I 2025-03-15 17:35:34,456] Trial 128 finished with value: 0.697392341833907 and parameters: {'learning_rate': 0.004200396026318684, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 4.5}. Best is trial 126 with value: 0.7476729369943443.


Trial 129 with params: {'learning_rate': 0.0024255908232954817, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.845,1.40281,0.461045,0.09976,0.117134,0.095205
2,1.1878,1.052409,0.6022,0.226857,0.215244,0.19694
3,0.8119,0.833837,0.696609,0.352852,0.34116,0.330462
4,0.5382,0.714391,0.740605,0.388821,0.398424,0.388992
5,0.3604,0.643358,0.766269,0.499937,0.471773,0.471265
6,0.2509,0.630455,0.768103,0.56783,0.494951,0.506424
7,0.1789,0.620088,0.786434,0.626938,0.548973,0.565057
8,0.1409,0.598807,0.783685,0.623906,0.576292,0.585618
9,0.1134,0.586088,0.791934,0.665608,0.616761,0.628163
10,0.098,0.58984,0.794684,0.680818,0.638179,0.643041


[I 2025-03-15 17:37:11,750] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.003937527978011733, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7476,1.315119,0.493126,0.115256,0.134108,0.106471
2,1.0468,0.926037,0.67461,0.313003,0.293148,0.27836
3,0.6619,0.732247,0.741522,0.397721,0.393264,0.382218
4,0.4143,0.631599,0.777269,0.536255,0.502495,0.505312
5,0.2522,0.617768,0.778185,0.533391,0.515369,0.512064
6,0.1744,0.588449,0.788268,0.673542,0.622119,0.632873
7,0.125,0.578752,0.791017,0.681911,0.62414,0.634944
8,0.0978,0.578791,0.791934,0.67877,0.637219,0.644148
9,0.0853,0.566532,0.7956,0.685607,0.641617,0.651468
10,0.0793,0.567905,0.802933,0.747776,0.682728,0.698938


[I 2025-03-15 17:39:43,951] Trial 130 finished with value: 0.6989304706447627 and parameters: {'learning_rate': 0.003937527978011733, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 5.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 131 with params: {'learning_rate': 0.0038750790099444845, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7663,1.326659,0.501375,0.107481,0.140019,0.107651
2,1.0776,0.956341,0.653529,0.271752,0.279587,0.258122
3,0.6647,0.728309,0.742438,0.374403,0.386215,0.374562
4,0.4069,0.636242,0.771769,0.501874,0.489512,0.484096
5,0.2525,0.596313,0.785518,0.571673,0.527595,0.531031
6,0.1804,0.57186,0.79835,0.654326,0.598021,0.607481
7,0.1315,0.579349,0.79835,0.678473,0.619592,0.634877
8,0.1021,0.561767,0.802016,0.687359,0.643925,0.652561
9,0.0868,0.554788,0.805683,0.713344,0.644378,0.662309
10,0.079,0.556952,0.805683,0.744896,0.678021,0.696497


[I 2025-03-15 17:42:16,207] Trial 131 finished with value: 0.701490851060924 and parameters: {'learning_rate': 0.0038750790099444845, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}. Best is trial 126 with value: 0.7476729369943443.


Trial 132 with params: {'learning_rate': 0.00027627219320570554, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2724,2.061897,0.176902,0.003538,0.02,0.006012
2,1.9965,1.88168,0.320807,0.0457,0.061968,0.048339
3,1.8108,1.712462,0.369386,0.041304,0.074297,0.04942
4,1.6552,1.595068,0.398717,0.057116,0.084411,0.058723
5,1.5576,1.500033,0.439047,0.069207,0.099734,0.0735


[I 2025-03-15 17:42:51,114] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 0.0032019790815294167, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7873,1.348073,0.48396,0.103204,0.126878,0.096301
2,1.1127,1.031138,0.629698,0.273782,0.260984,0.242923
3,0.7318,0.777706,0.713107,0.353001,0.354712,0.339595
4,0.4685,0.662254,0.761687,0.48444,0.456005,0.456745
5,0.3021,0.640458,0.772686,0.54786,0.507311,0.505528
6,0.2122,0.614645,0.788268,0.612408,0.556036,0.565188
7,0.145,0.582765,0.79835,0.649757,0.601122,0.610514
8,0.1118,0.580464,0.80385,0.732118,0.65272,0.669652
9,0.0947,0.579415,0.80385,0.737662,0.644509,0.669205
10,0.0855,0.573709,0.806599,0.78579,0.684732,0.713985


[I 2025-03-15 17:45:32,103] Trial 133 finished with value: 0.7301586726738547 and parameters: {'learning_rate': 0.0032019790815294167, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 6.5}. Best is trial 126 with value: 0.7476729369943443.


Trial 134 with params: {'learning_rate': 0.004527933144668462, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8186,1.370037,0.472044,0.118827,0.120197,0.10214
2,1.1791,1.053145,0.608616,0.207618,0.224473,0.202995
3,0.8159,0.849578,0.687443,0.303833,0.323899,0.302152
4,0.5367,0.734285,0.734189,0.406314,0.39204,0.382237
5,0.347,0.687691,0.756187,0.483127,0.471418,0.464071
6,0.242,0.662062,0.770852,0.533428,0.533736,0.525353
7,0.1737,0.637154,0.775435,0.614716,0.584398,0.585923
8,0.141,0.616605,0.788268,0.655122,0.632795,0.631524
9,0.1149,0.606284,0.786434,0.663709,0.626202,0.630057
10,0.1,0.596628,0.791934,0.68348,0.645397,0.651041


[I 2025-03-15 17:48:14,298] Trial 134 finished with value: 0.6873021807086122 and parameters: {'learning_rate': 0.004527933144668462, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 3.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 135 with params: {'learning_rate': 0.00449409980188597, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6979,1.31568,0.501375,0.120467,0.14937,0.123327
2,1.024,0.906293,0.68286,0.315896,0.304713,0.29392
3,0.6203,0.7092,0.759853,0.450245,0.425873,0.411737
4,0.3868,0.655151,0.758937,0.496523,0.468246,0.470035
5,0.2353,0.59801,0.781852,0.566994,0.533298,0.534667
6,0.1572,0.580796,0.790101,0.656238,0.594585,0.610452
7,0.1204,0.573751,0.799267,0.685865,0.617105,0.640343
8,0.095,0.562521,0.804766,0.686933,0.650706,0.657979
9,0.0832,0.554815,0.80385,0.693659,0.639416,0.654426
10,0.0784,0.558674,0.807516,0.724139,0.678341,0.688841


[I 2025-03-15 17:51:03,419] Trial 135 finished with value: 0.6901205240234904 and parameters: {'learning_rate': 0.00449409980188597, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 6.5}. Best is trial 126 with value: 0.7476729369943443.


Trial 136 with params: {'learning_rate': 0.004201692416972031, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7365,1.32015,0.494042,0.136069,0.136775,0.10994
2,1.0299,0.908052,0.679193,0.32959,0.313092,0.300548
3,0.64,0.731265,0.748854,0.424886,0.409876,0.398701
4,0.4001,0.658315,0.76352,0.520376,0.477201,0.485602
5,0.248,0.608425,0.785518,0.559391,0.528967,0.526022
6,0.1648,0.584756,0.793767,0.654537,0.608797,0.616844
7,0.1197,0.576082,0.79835,0.675283,0.641973,0.645744
8,0.0959,0.563288,0.8011,0.692281,0.644347,0.656646
9,0.0848,0.56028,0.808433,0.712078,0.648192,0.66635
10,0.0803,0.556942,0.808433,0.729914,0.669296,0.68524


[I 2025-03-15 17:54:08,369] Trial 136 finished with value: 0.683671158576542 and parameters: {'learning_rate': 0.004201692416972031, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 6.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 137 with params: {'learning_rate': 0.004408460453494676, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7946,1.339176,0.492209,0.105487,0.130892,0.105207
2,1.1362,1.032659,0.637947,0.25899,0.261232,0.238385
3,0.766,0.797311,0.711274,0.365523,0.343711,0.32493
4,0.4805,0.69932,0.744271,0.425297,0.419587,0.412616
5,0.2988,0.640698,0.772686,0.532257,0.492454,0.494532
6,0.2098,0.624112,0.785518,0.569998,0.563147,0.556932
7,0.1487,0.60743,0.797434,0.675626,0.632428,0.638508
8,0.1166,0.595308,0.7956,0.657792,0.628829,0.627556
9,0.0979,0.581864,0.796517,0.704153,0.660691,0.664544
10,0.0875,0.582604,0.799267,0.741639,0.678205,0.690887


[I 2025-03-15 17:56:47,138] Trial 137 finished with value: 0.7417669257954287 and parameters: {'learning_rate': 0.004408460453494676, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 3.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 138 with params: {'learning_rate': 0.002795362273254196, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8301,1.404263,0.462878,0.10038,0.114373,0.091451
2,1.1966,1.061002,0.594867,0.210192,0.201544,0.185171
3,0.8354,0.821788,0.699358,0.341246,0.329142,0.323262
4,0.5461,0.720273,0.727773,0.405172,0.386348,0.381676
5,0.3676,0.656987,0.769019,0.473372,0.458246,0.455325
6,0.2646,0.640058,0.776352,0.547131,0.50598,0.511251
7,0.1883,0.61748,0.779102,0.592492,0.537182,0.550729
8,0.1432,0.601957,0.785518,0.674616,0.616622,0.630807
9,0.1139,0.600157,0.790101,0.69792,0.639602,0.654145
10,0.0993,0.598623,0.784601,0.701724,0.633136,0.648663


[I 2025-03-15 17:59:50,665] Trial 138 finished with value: 0.6953013174744442 and parameters: {'learning_rate': 0.002795362273254196, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 3.5}. Best is trial 126 with value: 0.7476729369943443.


Trial 139 with params: {'learning_rate': 0.0027928593493966357, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8633,1.462391,0.441797,0.122117,0.107432,0.086244
2,1.2254,1.123711,0.575619,0.193335,0.187187,0.167532
3,0.8757,0.862405,0.686526,0.305767,0.311653,0.300131
4,0.5863,0.729642,0.736939,0.389159,0.383202,0.376439
5,0.3927,0.665704,0.757104,0.479987,0.452885,0.454177
6,0.2732,0.653056,0.764436,0.518355,0.486462,0.487337
7,0.1995,0.620622,0.776352,0.569427,0.507732,0.517791
8,0.1548,0.611992,0.782768,0.60401,0.566325,0.572276
9,0.1211,0.605647,0.790101,0.659228,0.594384,0.611324
10,0.1038,0.602428,0.791017,0.693308,0.631502,0.645435


[I 2025-03-15 18:01:01,373] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.004399796580936986, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7408,1.318506,0.511457,0.128451,0.148169,0.12204
2,1.023,0.918916,0.670944,0.30737,0.312054,0.293017
3,0.635,0.733381,0.736939,0.381181,0.391882,0.372949
4,0.4005,0.655449,0.756187,0.504385,0.483937,0.483162
5,0.2428,0.619083,0.775435,0.558326,0.526958,0.52501
6,0.1598,0.580759,0.79835,0.685773,0.619322,0.637213
7,0.1228,0.580189,0.799267,0.701731,0.643226,0.656542
8,0.0979,0.573403,0.806599,0.730797,0.667509,0.686723
9,0.0835,0.563599,0.811182,0.756813,0.670998,0.69653
10,0.0791,0.567377,0.811182,0.767691,0.687237,0.710875


[I 2025-03-15 18:03:43,884] Trial 140 finished with value: 0.72876051310766 and parameters: {'learning_rate': 0.004399796580936986, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 141 with params: {'learning_rate': 0.002557776611254075, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8498,1.39795,0.462878,0.09658,0.116407,0.093168
2,1.1893,1.04913,0.609533,0.227633,0.228143,0.209587
3,0.8083,0.829663,0.705775,0.354,0.339675,0.328414
4,0.5356,0.711849,0.743355,0.407754,0.404169,0.398611
5,0.3523,0.654924,0.75802,0.501159,0.464441,0.465618
6,0.2463,0.627101,0.774519,0.530799,0.495554,0.499434
7,0.1786,0.614927,0.779102,0.56961,0.515583,0.524195
8,0.1379,0.593622,0.790101,0.605788,0.594849,0.587641
9,0.1117,0.579196,0.791017,0.672245,0.622796,0.630924
10,0.0947,0.590353,0.786434,0.6531,0.614036,0.621718


[I 2025-03-15 18:04:34,526] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 0.00409811721191459, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7416,1.310626,0.505958,0.135647,0.144109,0.122206
2,1.0355,0.917908,0.676444,0.334486,0.308018,0.29833
3,0.6397,0.740398,0.736022,0.396149,0.3992,0.384566
4,0.402,0.651039,0.769936,0.548621,0.499758,0.506946
5,0.2432,0.611063,0.779102,0.558028,0.528215,0.527357
6,0.1777,0.602125,0.794684,0.62528,0.598909,0.599576
7,0.1221,0.58672,0.796517,0.666707,0.609938,0.622052
8,0.0973,0.582586,0.802933,0.740738,0.665647,0.680724
9,0.0856,0.572191,0.800183,0.738373,0.666253,0.684087
10,0.0809,0.578744,0.802016,0.743304,0.682834,0.696658


[I 2025-03-15 18:07:41,614] Trial 142 finished with value: 0.7035912106647132 and parameters: {'learning_rate': 0.00409811721191459, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 143 with params: {'learning_rate': 0.004960271950947583, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7695,1.311555,0.517874,0.110497,0.147742,0.113484
2,1.0613,0.97846,0.644363,0.290223,0.277473,0.265531
3,0.6568,0.729114,0.740605,0.438244,0.404743,0.400543
4,0.3885,0.639579,0.771769,0.542652,0.492891,0.499767
5,0.2404,0.593991,0.790101,0.602125,0.570587,0.570446
6,0.1583,0.578161,0.797434,0.688347,0.642208,0.649291
7,0.1142,0.569801,0.8011,0.690059,0.645559,0.653907
8,0.095,0.564486,0.80385,0.772693,0.705465,0.722338
9,0.084,0.548717,0.821265,0.796438,0.713787,0.7393
10,0.0797,0.549887,0.819432,0.79891,0.711915,0.739474


[I 2025-03-15 18:10:16,603] Trial 143 finished with value: 0.7405107809949696 and parameters: {'learning_rate': 0.004960271950947583, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 5.5}. Best is trial 126 with value: 0.7476729369943443.


Trial 144 with params: {'learning_rate': 0.004864923655922129, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7549,1.317716,0.486709,0.113361,0.134495,0.109389
2,1.0799,0.961027,0.666361,0.285799,0.291977,0.278093
3,0.7125,0.76809,0.716774,0.35577,0.366959,0.354279
4,0.4541,0.683163,0.751604,0.455359,0.443827,0.438409
5,0.2926,0.653862,0.769019,0.5346,0.516047,0.514426
6,0.1982,0.618155,0.778185,0.600004,0.55047,0.554686
7,0.1401,0.604455,0.782768,0.651084,0.609278,0.617742
8,0.1117,0.588482,0.794684,0.668343,0.634727,0.638342
9,0.0956,0.580637,0.797434,0.688432,0.645291,0.654619
10,0.0864,0.591066,0.791934,0.722662,0.67153,0.681409


[I 2025-03-15 18:12:54,164] Trial 144 finished with value: 0.7267923105502754 and parameters: {'learning_rate': 0.004864923655922129, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 126 with value: 0.7476729369943443.


Trial 145 with params: {'learning_rate': 0.002359450595486672, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8141,1.408904,0.471127,0.092755,0.116976,0.091466
2,1.2137,1.117163,0.567369,0.186855,0.186038,0.165028
3,0.8774,0.856995,0.692026,0.316659,0.318561,0.307138
4,0.589,0.737667,0.728689,0.36891,0.363919,0.361033
5,0.4013,0.650019,0.759853,0.451944,0.426425,0.42716


[I 2025-03-15 18:13:19,276] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.0008166629074090491, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.08,1.734495,0.385885,0.044556,0.083046,0.055395
2,1.5488,1.423293,0.461045,0.10414,0.109487,0.084078
3,1.2976,1.198879,0.543538,0.161892,0.155546,0.134065
4,1.0724,1.035796,0.628781,0.22462,0.219271,0.192805
5,0.876,0.909008,0.671861,0.282033,0.271551,0.261422


[I 2025-03-15 18:13:44,156] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 0.0015278093079990453, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8959,1.524228,0.419798,0.071164,0.095921,0.074797
2,1.3386,1.190346,0.563703,0.125923,0.16745,0.139273
3,1.0439,0.994748,0.632447,0.243818,0.2308,0.217006
4,0.7701,0.835386,0.692026,0.354827,0.316465,0.3193
5,0.5706,0.73365,0.736939,0.361506,0.370391,0.359232
6,0.4088,0.706766,0.740605,0.432464,0.414758,0.412547
7,0.2946,0.670356,0.75802,0.481213,0.456174,0.455728
8,0.228,0.663596,0.762603,0.485854,0.491728,0.477335
9,0.1779,0.630918,0.773602,0.582241,0.541143,0.54733
10,0.1458,0.621025,0.781852,0.59042,0.562532,0.564187


[I 2025-03-15 18:14:39,253] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.0035743722892889324, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7306,1.332466,0.496792,0.111809,0.137176,0.115544
2,1.1092,1.012473,0.633364,0.240945,0.251073,0.233405
3,0.7419,0.790973,0.707608,0.352872,0.350676,0.33728
4,0.4727,0.678512,0.750687,0.474042,0.422351,0.427781
5,0.3062,0.630988,0.774519,0.526285,0.493095,0.491358
6,0.2152,0.618997,0.772686,0.569703,0.521625,0.527311
7,0.1523,0.604175,0.785518,0.639156,0.577941,0.589856
8,0.1166,0.590123,0.794684,0.680377,0.647333,0.650796
9,0.0979,0.572954,0.800183,0.685154,0.656027,0.660779
10,0.0866,0.5763,0.8011,0.700695,0.668766,0.672652


[I 2025-03-15 18:17:44,329] Trial 148 finished with value: 0.7198241340232989 and parameters: {'learning_rate': 0.0035743722892889324, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}. Best is trial 126 with value: 0.7476729369943443.


Trial 149 with params: {'learning_rate': 0.002718471657061312, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.836,1.399695,0.463795,0.104342,0.118371,0.099805
2,1.1765,1.035076,0.621448,0.221994,0.235325,0.214955
3,0.7975,0.810713,0.707608,0.348186,0.341596,0.326221
4,0.5243,0.703849,0.747938,0.460183,0.429056,0.423754
5,0.3535,0.647583,0.76077,0.480148,0.458097,0.455168
6,0.2466,0.642005,0.769019,0.553778,0.488316,0.496891
7,0.1837,0.617373,0.782768,0.62185,0.552141,0.571648
8,0.1473,0.606639,0.784601,0.623171,0.575613,0.58103
9,0.1141,0.594457,0.793767,0.672992,0.628306,0.634259
10,0.0976,0.592116,0.797434,0.671314,0.643691,0.645155


[I 2025-03-15 18:20:23,302] Trial 149 finished with value: 0.7096346115125874 and parameters: {'learning_rate': 0.002718471657061312, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 3.5}. Best is trial 126 with value: 0.7476729369943443.


In [56]:
print(best_trial2)

BestRun(run_id='126', objective=0.7476729369943443, hyperparameters={'learning_rate': 0.004849961480952609, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 5.0}, run_summary=None)


In [57]:
base.reset_seed()

In [58]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base-embedd_fine_aug_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-base-embedd_fine_aug_hp-search", epochs=num_epochs, batch_size=batch_size)

In [59]:
#Nápočet epoch na steps
data_length = len(all_train_data)
min_r = math.ceil(data_length/batch_size)*5
max_r = math.ceil(data_length/batch_size)*num_epochs
warm_up = math.ceil(data_length/batch_size/10)

In [60]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "adam_beta1" : trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [61]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [62]:
trainer = Trainer(
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [63]:
best_trial3 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Base-aug-embedd",
    n_trials=150
)

[I 2025-03-15 18:20:23,677] A new study created in memory with name: Base-aug-embedd


Trial 0 with params: {'learning_rate': 0.0001025350969016849, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4705,1.834866,0.534372,0.143789,0.150608,0.12896
2,1.3326,1.349108,0.659028,0.299349,0.27561,0.264669
3,0.8299,1.151894,0.707608,0.423168,0.382134,0.382695
4,0.5353,1.106892,0.736939,0.485646,0.463364,0.460583
5,0.3407,1.123481,0.749771,0.519467,0.512656,0.503252


[I 2025-03-15 18:21:18,040] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 2.6368755339723032e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 46}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0561,2.6205,0.35472,0.035136,0.072306,0.045223
2,2.309,2.234505,0.439047,0.093317,0.101002,0.075565
3,1.9889,2.011767,0.487626,0.110406,0.126658,0.103825
4,1.7693,1.843497,0.526123,0.13926,0.153273,0.133488
5,1.5892,1.725125,0.56462,0.159951,0.182015,0.160833


[I 2025-03-15 18:22:18,309] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 0.00041917115166952007, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 52}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4235,1.046461,0.741522,0.485256,0.463422,0.454392
2,0.2202,1.172949,0.778185,0.610387,0.6072,0.599502
3,0.06,1.399263,0.779102,0.650563,0.642739,0.639078
4,0.0283,1.430048,0.790101,0.692404,0.660708,0.664637
5,0.0151,1.532795,0.773602,0.669242,0.625882,0.624862
6,0.0093,1.606596,0.776352,0.676862,0.638249,0.64019
7,0.0086,1.737449,0.778185,0.656477,0.640053,0.635939
8,0.0061,1.740638,0.782768,0.672647,0.65647,0.65097
9,0.0053,1.7435,0.788268,0.67757,0.653991,0.652815
10,0.0047,1.781091,0.791017,0.679602,0.69014,0.667922


[I 2025-03-15 18:24:10,680] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6893,1.033948,0.8011,0.745674,0.692117,0.70545
2,0.0333,1.339331,0.804766,0.76431,0.703485,0.711003
3,0.0122,1.348029,0.813932,0.781684,0.722302,0.737184
4,0.0113,1.362657,0.819432,0.80553,0.733985,0.744736
5,0.0073,1.335005,0.825848,0.765582,0.73874,0.739626
6,0.0059,1.639386,0.816682,0.782489,0.727821,0.734704
7,0.0036,1.459582,0.825848,0.773917,0.744563,0.74428
8,0.004,1.572042,0.824931,0.778769,0.728628,0.736033
9,0.0043,1.565974,0.809349,0.749094,0.724937,0.723252
10,0.0044,1.561246,0.824015,0.795604,0.761531,0.762955


[I 2025-03-15 18:26:03,365] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 6.62431060594998e-05, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5939,2.050934,0.489459,0.099255,0.124604,0.099365
2,1.6622,1.628327,0.588451,0.213257,0.204689,0.184453
3,1.2476,1.40662,0.64253,0.289761,0.259141,0.25388
4,0.9504,1.259542,0.672777,0.326549,0.310508,0.306143
5,0.7207,1.170059,0.691109,0.410645,0.372795,0.381269


[I 2025-03-15 18:27:01,126] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.0004480975918214954, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.358,1.007347,0.752521,0.520878,0.474179,0.479537
2,0.204,1.224674,0.776352,0.646947,0.604223,0.608737
3,0.0566,1.466169,0.774519,0.653227,0.633006,0.629192
4,0.0255,1.537283,0.789184,0.697262,0.673876,0.664128
5,0.0141,1.611581,0.776352,0.654884,0.654973,0.642248
6,0.0085,1.615806,0.788268,0.714029,0.660652,0.668306
7,0.0065,1.76713,0.796517,0.707472,0.666952,0.67102
8,0.0067,1.779088,0.7956,0.689869,0.662851,0.656514
9,0.0046,1.759688,0.788268,0.731895,0.683944,0.696896
10,0.004,1.795985,0.8011,0.738495,0.707046,0.709571


[I 2025-03-15 18:30:58,555] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0185,1.437505,0.635197,0.274578,0.239548,0.226792
2,0.8312,1.106526,0.728689,0.420171,0.419724,0.412312
3,0.3924,1.108837,0.746104,0.589824,0.49939,0.52037
4,0.1881,1.188254,0.764436,0.62605,0.556006,0.574808
5,0.0977,1.319087,0.76352,0.647584,0.597491,0.603108
6,0.0552,1.394434,0.761687,0.62916,0.619316,0.604679
7,0.0324,1.566143,0.756187,0.597087,0.607375,0.588439
8,0.0252,1.545592,0.765353,0.652555,0.630184,0.630928
9,0.0162,1.603877,0.761687,0.630379,0.645337,0.628627
10,0.0138,1.686394,0.769936,0.653159,0.625664,0.624558


[I 2025-03-15 18:32:56,488] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 0.00039710847107924746, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5296,1.074571,0.736939,0.439136,0.433047,0.426509
2,0.2889,1.159679,0.769936,0.623309,0.590177,0.593926
3,0.0776,1.407478,0.769019,0.630078,0.621496,0.604972
4,0.0327,1.404194,0.783685,0.730975,0.663827,0.672591
5,0.0175,1.564266,0.781852,0.693379,0.665775,0.65905
6,0.0107,1.587531,0.796517,0.70828,0.65324,0.656768
7,0.0084,1.646441,0.791017,0.718409,0.658315,0.662827
8,0.0063,1.719336,0.805683,0.739431,0.689831,0.698403
9,0.0058,1.770808,0.8011,0.740048,0.662544,0.683573
10,0.0047,1.744681,0.808433,0.743591,0.704816,0.708139


[I 2025-03-15 18:36:54,515] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.498208643215546e-05, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4269,3.097955,0.177819,0.013545,0.020274,0.006555
2,2.818,2.650002,0.347388,0.037061,0.069508,0.043694
3,2.4095,2.376579,0.405133,0.056658,0.087607,0.05835
4,2.1751,2.202023,0.447296,0.12961,0.103647,0.079094
5,2.0151,2.083023,0.477544,0.099463,0.120268,0.095263
6,1.8996,1.981266,0.498625,0.105842,0.13002,0.10489
7,1.7882,1.888625,0.515124,0.13959,0.141765,0.119625
8,1.6942,1.822496,0.533456,0.15036,0.153957,0.133708
9,1.6149,1.754745,0.56462,0.155368,0.176496,0.157338
10,1.5378,1.69922,0.573786,0.173372,0.18646,0.169791


[I 2025-03-15 18:39:16,435] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 6.639623079859457e-05, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6477,2.078235,0.477544,0.1074,0.119548,0.097156
2,1.6688,1.628458,0.592117,0.216406,0.202352,0.186312
3,1.2409,1.391858,0.63978,0.269728,0.255217,0.241922
4,0.9416,1.259579,0.675527,0.330423,0.311762,0.307016
5,0.7179,1.162974,0.698442,0.402228,0.368169,0.374815
6,0.5468,1.126224,0.72594,0.444912,0.428501,0.427812
7,0.4133,1.137655,0.734189,0.500892,0.470515,0.473523
8,0.3172,1.145804,0.729606,0.509322,0.469883,0.473164
9,0.246,1.176107,0.744271,0.579733,0.507109,0.521406
10,0.1924,1.19617,0.738772,0.554677,0.541443,0.533508


[I 2025-03-15 18:41:45,119] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.00012001988398838816, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2132,1.661394,0.585701,0.174834,0.188961,0.166876
2,1.1549,1.267741,0.673694,0.351144,0.317207,0.311767
3,0.6845,1.111261,0.726856,0.444778,0.410093,0.416422
4,0.4054,1.093664,0.748854,0.508578,0.472689,0.479723
5,0.2411,1.143849,0.751604,0.610877,0.535019,0.55306


[I 2025-03-15 18:42:58,885] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.0012958854235248182, 'weight_decay': 0.0, 'adam_beta1': 0.97, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.989,1.111727,0.789184,0.65197,0.5959,0.603904
2,0.0595,1.255394,0.805683,0.746138,0.723439,0.7227
3,0.0158,1.387127,0.811182,0.755707,0.712224,0.718904
4,0.0089,1.528032,0.810266,0.73731,0.714658,0.708526
5,0.0064,1.689027,0.802933,0.668505,0.698268,0.668713
6,0.0076,1.641328,0.809349,0.750335,0.706689,0.713057
7,0.0053,1.665608,0.799267,0.766384,0.715502,0.717888
8,0.0031,1.772285,0.797434,0.718662,0.707415,0.694675
9,0.0037,1.945929,0.789184,0.737694,0.699166,0.697074
10,0.0021,1.76207,0.808433,0.758669,0.722415,0.722759


[I 2025-03-15 18:48:08,334] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.00045284548274119295, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3864,1.024905,0.757104,0.480541,0.4667,0.461553
2,0.2017,1.244959,0.780935,0.638819,0.632059,0.618144
3,0.0571,1.467143,0.779102,0.649968,0.667595,0.640754
4,0.0241,1.409083,0.796517,0.686481,0.680704,0.671054
5,0.0144,1.576372,0.781852,0.706305,0.644834,0.657309
6,0.0098,1.549563,0.791017,0.71382,0.673296,0.673476
7,0.0096,1.676261,0.794684,0.729257,0.656843,0.67142
8,0.0055,1.687583,0.800183,0.692636,0.671175,0.667835
9,0.0058,1.853396,0.783685,0.692836,0.678739,0.670099
10,0.0039,1.767041,0.802016,0.686644,0.685256,0.671571


[I 2025-03-15 18:53:17,786] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.00018292719871077867, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9726,1.396103,0.63703,0.252813,0.242305,0.227767
2,0.7976,1.057289,0.736939,0.457461,0.451348,0.444428
3,0.3542,1.071391,0.756187,0.617583,0.515485,0.535517
4,0.1634,1.157144,0.764436,0.663192,0.6033,0.613955
5,0.0825,1.256515,0.768103,0.675663,0.613569,0.621812
6,0.0448,1.349767,0.773602,0.655919,0.635017,0.629809
7,0.0284,1.466708,0.774519,0.679421,0.639862,0.642621
8,0.0196,1.559458,0.773602,0.656098,0.624011,0.623486
9,0.0136,1.562805,0.779102,0.675368,0.65119,0.651483
10,0.0107,1.695633,0.764436,0.647759,0.641699,0.627459


[I 2025-03-15 18:58:17,271] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.00012199424891552456, 'weight_decay': 0.0, 'adam_beta1': 0.99, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5259,1.907009,0.525206,0.146845,0.146131,0.122294
2,1.3809,1.340495,0.667278,0.276346,0.279768,0.267424
3,0.8316,1.124304,0.728689,0.415845,0.393025,0.393545
4,0.4973,1.07469,0.749771,0.455626,0.461054,0.451339
5,0.2934,1.119361,0.762603,0.530681,0.508084,0.508356
6,0.1752,1.180189,0.76077,0.620777,0.580383,0.583301
7,0.1111,1.26883,0.768103,0.619711,0.607814,0.600684
8,0.0701,1.367272,0.762603,0.612762,0.599225,0.593403
9,0.0498,1.414019,0.764436,0.629043,0.615367,0.611006
10,0.033,1.46858,0.770852,0.62867,0.632901,0.619562


[I 2025-03-15 19:03:17,534] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.0003143261471908058, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 44}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6915,1.130669,0.719523,0.416741,0.402741,0.398456
2,0.3913,1.118824,0.753437,0.576337,0.560283,0.552488
3,0.1128,1.294169,0.764436,0.625,0.609489,0.595949
4,0.0455,1.360423,0.781852,0.679388,0.622492,0.636864
5,0.0237,1.557093,0.779102,0.67844,0.610996,0.620131
6,0.0157,1.50338,0.791934,0.769599,0.680156,0.70391
7,0.0115,1.542253,0.793767,0.736036,0.67669,0.686424
8,0.0084,1.674644,0.788268,0.66815,0.664506,0.653637
9,0.0064,1.654512,0.789184,0.69141,0.674342,0.669167
10,0.0049,1.751076,0.787351,0.667952,0.690526,0.665568


[I 2025-03-15 19:08:13,615] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.001282283674471692, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8015,1.089522,0.799267,0.747187,0.684034,0.696879
2,0.0434,1.192985,0.808433,0.735212,0.687251,0.694784
3,0.0146,1.342321,0.814849,0.768904,0.711094,0.722992
4,0.0101,1.407717,0.816682,0.749329,0.714436,0.720932
5,0.0061,1.689402,0.817599,0.805129,0.752446,0.761411
6,0.0057,1.648276,0.815765,0.814674,0.737913,0.757008
7,0.0045,1.672123,0.811182,0.768253,0.740474,0.737504
8,0.0053,1.651543,0.808433,0.756761,0.747271,0.735878
9,0.0037,1.691806,0.807516,0.771295,0.749242,0.745909
10,0.003,1.710169,0.806599,0.763733,0.729485,0.730274


[I 2025-03-15 19:13:25,605] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.003255327422016847, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5757,1.219158,0.804766,0.774099,0.704124,0.720522
2,0.025,1.427089,0.802016,0.736959,0.710472,0.707771
3,0.0132,1.548241,0.79835,0.758786,0.727358,0.725327
4,0.0114,1.663822,0.80385,0.796196,0.726697,0.742721
5,0.0097,1.736368,0.802933,0.785706,0.745372,0.745022
6,0.0081,1.846752,0.800183,0.7563,0.705115,0.714218
7,0.0076,1.855716,0.813932,0.773694,0.732649,0.739054
8,0.0066,2.214339,0.792851,0.765369,0.727459,0.718826
9,0.0051,2.259634,0.79835,0.756215,0.747503,0.734342
10,0.0046,2.436625,0.8011,0.732866,0.714152,0.709


[I 2025-03-15 19:20:56,136] Trial 17 finished with value: 0.7773922394396791 and parameters: {'learning_rate': 0.003255327422016847, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 27}. Best is trial 17 with value: 0.7773922394396791.


Trial 18 with params: {'learning_rate': 0.00436516467815393, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5356,1.193284,0.788268,0.761094,0.71251,0.722054
2,0.0263,1.366959,0.811182,0.769639,0.739798,0.740897
3,0.0137,1.710797,0.799267,0.782896,0.732136,0.734846
4,0.0165,1.797994,0.792851,0.745531,0.74732,0.730949
5,0.0158,1.818102,0.799267,0.755707,0.728484,0.725138
6,0.0067,2.135292,0.802933,0.755424,0.734337,0.730556
7,0.0085,2.127833,0.8011,0.775744,0.74573,0.740727
8,0.0101,2.199087,0.776352,0.744259,0.72254,0.711987
9,0.0096,2.676824,0.791934,0.767424,0.748372,0.740846
10,0.0052,2.717016,0.787351,0.751555,0.722647,0.720496


[I 2025-03-15 19:29:45,545] Trial 18 finished with value: 0.7605967643759093 and parameters: {'learning_rate': 0.00436516467815393, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 41}. Best is trial 17 with value: 0.7773922394396791.


Trial 19 with params: {'learning_rate': 0.002081950455785548, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 45}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7079,1.048983,0.813016,0.749871,0.690126,0.707091
2,0.029,1.312909,0.811182,0.743482,0.702553,0.704799
3,0.0106,1.32475,0.824015,0.779486,0.729817,0.739977
4,0.0086,1.321072,0.820348,0.780872,0.753825,0.752759
5,0.0083,1.550947,0.812099,0.754846,0.743198,0.733397
6,0.0052,1.591241,0.819432,0.787625,0.73407,0.740356
7,0.006,1.607218,0.817599,0.762344,0.729887,0.729183
8,0.0044,1.641002,0.812099,0.754328,0.717817,0.720739
9,0.0035,1.773629,0.819432,0.779114,0.733845,0.740773
10,0.0012,1.741307,0.809349,0.745349,0.743501,0.723947


[I 2025-03-15 19:38:45,034] Trial 19 finished with value: 0.7806870868181661 and parameters: {'learning_rate': 0.002081950455785548, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 45}. Best is trial 19 with value: 0.7806870868181661.


Trial 20 with params: {'learning_rate': 0.0029594436387712733, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6719,1.13915,0.808433,0.71158,0.701493,0.692453
2,0.0273,1.278882,0.821265,0.760647,0.749345,0.740748
3,0.0107,1.347138,0.814849,0.775496,0.737197,0.740671
4,0.0106,1.470357,0.807516,0.743472,0.72879,0.723322
5,0.0106,1.517588,0.815765,0.730611,0.737301,0.726627
6,0.0085,1.692137,0.806599,0.791411,0.756171,0.75658
7,0.0058,1.809786,0.807516,0.766477,0.755419,0.749207
8,0.0031,1.880245,0.810266,0.75435,0.731681,0.726022
9,0.0043,1.920003,0.802016,0.726287,0.734476,0.715795
10,0.0058,2.050403,0.814849,0.773228,0.747411,0.738656


[I 2025-03-15 19:44:55,389] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.0008897882051300087, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 50}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0257,1.124195,0.780935,0.651224,0.60704,0.61306
2,0.0728,1.292699,0.7956,0.735518,0.661666,0.681363
3,0.02,1.473629,0.799267,0.734058,0.690578,0.691305
4,0.0127,1.425935,0.799267,0.762189,0.698572,0.717153
5,0.0064,1.594737,0.79835,0.761827,0.716795,0.721027
6,0.0069,1.725629,0.79835,0.761572,0.702943,0.708944
7,0.0042,1.735078,0.806599,0.774692,0.720458,0.727994
8,0.0032,1.835898,0.790101,0.726622,0.703456,0.700145
9,0.0046,1.773519,0.810266,0.778322,0.736549,0.742482
10,0.003,1.79302,0.817599,0.796854,0.721496,0.739786


[I 2025-03-15 19:51:19,153] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.0015456814484478751, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 44}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7776,1.074159,0.810266,0.718981,0.65609,0.669759
2,0.0359,1.254187,0.8011,0.742474,0.722648,0.714048
3,0.0147,1.51315,0.796517,0.734004,0.718293,0.705989
4,0.009,1.426894,0.813016,0.791038,0.726938,0.739931
5,0.006,1.496451,0.808433,0.746889,0.739829,0.723813
6,0.0058,1.620734,0.804766,0.77276,0.731721,0.733543
7,0.005,1.576577,0.812099,0.763495,0.740802,0.724882
8,0.0061,1.618453,0.812099,0.766623,0.756555,0.746996
9,0.0033,1.88535,0.805683,0.724928,0.720954,0.705551
10,0.0011,1.767872,0.813932,0.730939,0.728146,0.716249


[I 2025-03-15 20:00:23,980] Trial 22 finished with value: 0.7691348369400604 and parameters: {'learning_rate': 0.0015456814484478751, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 44}. Best is trial 19 with value: 0.7806870868181661.


Trial 23 with params: {'learning_rate': 0.004884377508703136, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.522,1.178442,0.813932,0.781885,0.742996,0.747024
2,0.0258,1.509028,0.805683,0.777562,0.743378,0.74468
3,0.0149,1.638635,0.79835,0.764719,0.698576,0.717921
4,0.0186,1.948753,0.80385,0.747901,0.71216,0.712537
5,0.0192,2.253259,0.802016,0.762154,0.713652,0.721524
6,0.0165,2.196709,0.797434,0.762031,0.729299,0.723648
7,0.0132,2.635369,0.772686,0.732637,0.721101,0.707492
8,0.0124,2.577635,0.791934,0.722313,0.705669,0.690794
9,0.0083,2.750335,0.792851,0.737627,0.709051,0.70291
10,0.0059,2.77124,0.790101,0.734402,0.722271,0.713599


[I 2025-03-15 20:09:41,917] Trial 23 finished with value: 0.7059046581669017 and parameters: {'learning_rate': 0.004884377508703136, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 42}. Best is trial 19 with value: 0.7806870868181661.


Trial 24 with params: {'learning_rate': 0.0005796538699647382, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 48}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2161,1.060911,0.75802,0.585662,0.535166,0.54143
2,0.1274,1.261587,0.781852,0.656422,0.644752,0.636773
3,0.0361,1.482584,0.784601,0.672726,0.671136,0.659913
4,0.0176,1.412292,0.802933,0.725967,0.721529,0.709057
5,0.0087,1.624435,0.797434,0.719765,0.692422,0.688458
6,0.0077,1.503574,0.800183,0.688282,0.706911,0.677923
7,0.0064,1.714202,0.805683,0.752686,0.700027,0.709233
8,0.0055,1.68672,0.794684,0.754322,0.706929,0.710286
9,0.0056,1.68058,0.811182,0.726601,0.726039,0.708482
10,0.0035,1.737649,0.80385,0.746553,0.711632,0.713753


[I 2025-03-15 20:16:04,295] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.0018381955253960309, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7343,1.112568,0.8011,0.699572,0.677116,0.673445
2,0.032,1.264895,0.814849,0.734546,0.704568,0.704667
3,0.0122,1.398188,0.813932,0.781138,0.711668,0.727848
4,0.0089,1.541478,0.805683,0.73695,0.715076,0.709097
5,0.0058,1.453275,0.810266,0.755502,0.728927,0.728408
6,0.008,1.56537,0.813016,0.799376,0.738088,0.74167
7,0.0059,1.623392,0.818515,0.797441,0.764479,0.763729
8,0.0032,1.771316,0.811182,0.742447,0.743496,0.724403
9,0.0037,1.733769,0.804766,0.759219,0.746828,0.735242
10,0.0008,1.85125,0.811182,0.792064,0.754195,0.758584


[I 2025-03-15 20:22:07,683] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 5.713116097887079e-05, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7118,2.155578,0.466544,0.102689,0.11275,0.087733
2,1.7816,1.738652,0.570119,0.164327,0.183925,0.162888
3,1.3826,1.503877,0.611366,0.269024,0.229217,0.218714
4,1.0915,1.33982,0.658112,0.325762,0.282959,0.279374
5,0.8645,1.239046,0.67736,0.376093,0.32728,0.332117


[I 2025-03-15 20:23:35,181] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.002908847268178964, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6792,1.114753,0.813016,0.744788,0.706694,0.71023
2,0.0264,1.202732,0.815765,0.761896,0.765605,0.744163
3,0.0122,1.399535,0.808433,0.738048,0.731366,0.718578
4,0.0091,1.574557,0.790101,0.712597,0.719212,0.693315
5,0.0106,1.641513,0.812099,0.784976,0.749049,0.749367
6,0.0043,1.836372,0.811182,0.756655,0.722284,0.725822
7,0.0029,1.952498,0.794684,0.759713,0.760559,0.742094
8,0.0073,1.865013,0.804766,0.738874,0.758622,0.730109
9,0.0106,1.849159,0.807516,0.731838,0.733426,0.72109
10,0.004,2.171336,0.804766,0.783809,0.744538,0.744407


[I 2025-03-15 20:33:07,891] Trial 27 finished with value: 0.7440901247277988 and parameters: {'learning_rate': 0.002908847268178964, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 53}. Best is trial 19 with value: 0.7806870868181661.


Trial 28 with params: {'learning_rate': 0.0038981520240048643, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6131,1.248829,0.796517,0.755653,0.71195,0.715545
2,0.0291,1.397313,0.808433,0.741245,0.734085,0.723174
3,0.0129,1.635895,0.808433,0.776611,0.742399,0.748431
4,0.0122,1.788907,0.793767,0.749721,0.736264,0.723861
5,0.0097,2.001718,0.792851,0.761785,0.723783,0.724435
6,0.012,2.218891,0.789184,0.739698,0.716076,0.713172
7,0.0085,2.07505,0.804766,0.750364,0.711615,0.717412
8,0.0045,2.324054,0.800183,0.782767,0.699379,0.723675
9,0.0041,2.482972,0.786434,0.736339,0.722336,0.711917
10,0.0063,2.577104,0.789184,0.75832,0.727713,0.725238


[I 2025-03-15 20:42:10,576] Trial 28 finished with value: 0.7505826365143348 and parameters: {'learning_rate': 0.0038981520240048643, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 10}. Best is trial 19 with value: 0.7806870868181661.


Trial 29 with params: {'learning_rate': 0.004432243617831026, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5773,1.26922,0.800183,0.750697,0.688195,0.701869
2,0.0279,1.416453,0.799267,0.773658,0.70553,0.723105
3,0.0155,1.623958,0.805683,0.753902,0.728322,0.727899
4,0.0126,1.900046,0.797434,0.742712,0.723725,0.710546
5,0.0158,2.140245,0.791017,0.763688,0.701954,0.711153
6,0.0126,1.937122,0.787351,0.769709,0.729872,0.730493
7,0.0105,2.157618,0.777269,0.704511,0.713282,0.687729
8,0.0075,2.402877,0.783685,0.725661,0.731237,0.713226
9,0.0049,2.610523,0.777269,0.722024,0.702518,0.697447
10,0.0056,2.799969,0.787351,0.738156,0.706271,0.705909


[I 2025-03-15 20:51:14,957] Trial 29 finished with value: 0.7194312361441193 and parameters: {'learning_rate': 0.004432243617831026, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 23}. Best is trial 19 with value: 0.7806870868181661.


Trial 30 with params: {'learning_rate': 0.0049080291133746625, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.541,1.18752,0.7956,0.740638,0.714733,0.715843
2,0.0303,1.543855,0.794684,0.769046,0.748402,0.742565
3,0.0153,1.765843,0.787351,0.732482,0.733533,0.719418
4,0.0194,1.858431,0.793767,0.762326,0.718084,0.72302
5,0.0141,2.024355,0.800183,0.763423,0.732552,0.732103
6,0.0142,2.146872,0.797434,0.787542,0.727851,0.743869
7,0.0115,2.430424,0.784601,0.73198,0.727873,0.711047
8,0.0117,2.53754,0.783685,0.731113,0.688974,0.694028
9,0.0088,2.554667,0.780935,0.767344,0.740069,0.740245
10,0.006,2.708068,0.792851,0.726379,0.700616,0.702563


[I 2025-03-15 20:54:27,216] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.004363859144422787, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 46}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5467,1.218952,0.813016,0.784781,0.755115,0.751216
2,0.0258,1.381292,0.808433,0.764736,0.726135,0.729829
3,0.014,1.49633,0.802016,0.762775,0.71328,0.722631
4,0.0155,1.637554,0.799267,0.754208,0.728672,0.728899
5,0.0105,2.013226,0.790101,0.777774,0.741668,0.737814
6,0.0115,2.096129,0.808433,0.795791,0.73618,0.749005
7,0.0123,2.194268,0.802933,0.75484,0.713458,0.720374
8,0.0103,2.752373,0.786434,0.763735,0.721577,0.719905
9,0.0073,2.78531,0.797434,0.768202,0.727177,0.731565
10,0.005,2.98802,0.791017,0.781084,0.732586,0.740534


[I 2025-03-15 21:00:49,470] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.0035957332243489342, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5505,1.196292,0.802016,0.807956,0.73634,0.757681
2,0.0252,1.41837,0.793767,0.756657,0.703578,0.714434
3,0.0141,1.696506,0.7956,0.761528,0.750609,0.743411
4,0.0133,1.658144,0.813016,0.81362,0.746598,0.762802
5,0.0083,2.049834,0.794684,0.77518,0.719226,0.732069
6,0.0083,1.941349,0.810266,0.790911,0.729452,0.743823
7,0.0095,2.130532,0.789184,0.776278,0.738244,0.738599
8,0.0063,2.174126,0.802016,0.793105,0.761377,0.765537
9,0.0075,2.370327,0.788268,0.740285,0.722852,0.710563
10,0.0047,2.614226,0.802933,0.77294,0.739093,0.741724


[I 2025-03-15 21:09:50,288] Trial 32 finished with value: 0.7551439586062821 and parameters: {'learning_rate': 0.0035957332243489342, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 39}. Best is trial 19 with value: 0.7806870868181661.


Trial 33 with params: {'learning_rate': 1.2161047690501456e-05, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 49}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4937,3.150948,0.192484,0.010931,0.024658,0.011908
2,2.9319,2.77432,0.328139,0.042439,0.063122,0.03912
3,2.5537,2.513983,0.381302,0.038554,0.08014,0.05118
4,2.3229,2.336629,0.411549,0.068938,0.089726,0.061184
5,2.1589,2.210761,0.448213,0.108782,0.103874,0.0776


[I 2025-03-15 21:11:19,677] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.00036875829250628456, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9197,1.331091,0.68011,0.388245,0.332231,0.336155
2,0.5455,1.162414,0.761687,0.546439,0.519181,0.521486
3,0.1415,1.403035,0.775435,0.667596,0.612386,0.617561
4,0.0492,1.515646,0.780018,0.643967,0.607211,0.60213
5,0.0239,1.671943,0.794684,0.69857,0.669363,0.661606
6,0.0124,1.742832,0.787351,0.682982,0.649534,0.647844
7,0.0091,1.761717,0.789184,0.660243,0.67429,0.646535
8,0.0076,1.795175,0.790101,0.67481,0.650886,0.651877
9,0.0072,1.929482,0.788268,0.650945,0.664013,0.640595
10,0.0038,1.908446,0.784601,0.670084,0.63052,0.629594


[I 2025-03-15 21:14:13,408] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.000713455730511562, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0914,1.07813,0.772686,0.621403,0.566849,0.581845
2,0.0955,1.295861,0.797434,0.712816,0.674105,0.678453
3,0.0276,1.514449,0.780018,0.692561,0.684519,0.674762
4,0.0147,1.493766,0.804766,0.748195,0.719907,0.721468
5,0.0088,1.559086,0.79835,0.741623,0.684991,0.694349
6,0.0067,1.694276,0.790101,0.706595,0.672834,0.668888
7,0.0055,1.701751,0.799267,0.722501,0.694455,0.692917
8,0.0018,1.8494,0.812099,0.792125,0.692615,0.719415
9,0.0066,1.817931,0.786434,0.727243,0.693067,0.694597
10,0.0023,1.807978,0.808433,0.784334,0.738016,0.740528


[I 2025-03-15 21:23:14,650] Trial 35 finished with value: 0.7371004109150994 and parameters: {'learning_rate': 0.000713455730511562, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 37}. Best is trial 19 with value: 0.7806870868181661.


Trial 36 with params: {'learning_rate': 0.001284781033806759, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9076,1.089557,0.790101,0.690103,0.639957,0.649089
2,0.0455,1.285572,0.8011,0.682389,0.700659,0.680299
3,0.0149,1.485699,0.804766,0.757614,0.706768,0.715659
4,0.0087,1.506093,0.802016,0.753567,0.706124,0.711696
5,0.0096,1.430621,0.7956,0.719596,0.697632,0.68981
6,0.0056,1.671801,0.7956,0.743498,0.700687,0.701987
7,0.0041,1.618062,0.811182,0.748652,0.702475,0.712804
8,0.0035,1.700235,0.808433,0.769392,0.715706,0.714564
9,0.0038,1.747801,0.813932,0.786988,0.734186,0.74309
10,0.0029,1.719266,0.807516,0.748718,0.706247,0.70955


[I 2025-03-15 21:26:13,984] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.004973622215556151, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.529,1.115176,0.809349,0.768944,0.740104,0.74117
2,0.0261,1.400679,0.804766,0.769633,0.744993,0.748267
3,0.0181,1.683911,0.80385,0.744183,0.721088,0.723738
4,0.0178,1.961615,0.782768,0.721967,0.718429,0.709458
5,0.0162,1.82863,0.790101,0.742316,0.739219,0.722882
6,0.013,2.121156,0.784601,0.734263,0.719088,0.709538
7,0.0093,2.553316,0.791934,0.79521,0.745549,0.75652
8,0.0076,2.690702,0.7956,0.78544,0.730279,0.740878
9,0.0074,2.472627,0.79835,0.788092,0.734338,0.746557
10,0.0108,2.847055,0.794684,0.775483,0.712635,0.724704


[I 2025-03-15 21:32:21,040] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 1.162626851313962e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3449,3.019231,0.299725,0.020106,0.054228,0.028797
2,2.7807,2.702019,0.334555,0.037418,0.065615,0.040679
3,2.5209,2.506987,0.381302,0.038156,0.079917,0.051003
4,2.3364,2.358671,0.4033,0.066852,0.086698,0.057428
5,2.1876,2.242058,0.432631,0.093575,0.097632,0.072359
6,2.0742,2.151746,0.461962,0.113885,0.110753,0.086069
7,1.98,2.073339,0.476627,0.1046,0.119884,0.096858
8,1.8999,2.015533,0.488543,0.102571,0.125619,0.100839
9,1.8318,1.964038,0.491292,0.122728,0.128001,0.10526
10,1.7666,1.903966,0.516957,0.136472,0.14571,0.126327


[I 2025-03-15 21:38:31,063] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.0014061226914511792, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3168,1.220173,0.76077,0.560421,0.548006,0.544382
2,0.0949,1.467067,0.794684,0.710265,0.671084,0.678084
3,0.0166,1.675771,0.791017,0.726044,0.663759,0.67298
4,0.0091,1.709408,0.8011,0.720517,0.6811,0.682767
5,0.0068,1.847585,0.800183,0.712141,0.700765,0.690135
6,0.0061,1.805264,0.79835,0.71649,0.671495,0.672557
7,0.0039,1.935354,0.796517,0.740926,0.707069,0.708022
8,0.0021,2.045288,0.800183,0.692496,0.693257,0.676735
9,0.0024,2.03163,0.791934,0.738205,0.730898,0.71426
10,0.0037,1.89255,0.789184,0.722707,0.686918,0.687122


[I 2025-03-15 21:44:30,549] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.00015460941865464952, 'weight_decay': 0.006, 'adam_beta1': 0.99, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3723,1.771804,0.553621,0.171859,0.165608,0.144095
2,1.2065,1.229862,0.703025,0.387323,0.343165,0.339812
3,0.6332,1.087214,0.747021,0.491698,0.453425,0.460318
4,0.3239,1.120085,0.76077,0.532108,0.517813,0.516986
5,0.1697,1.238357,0.75802,0.597308,0.556005,0.562066
6,0.093,1.315787,0.771769,0.619497,0.596433,0.596205
7,0.0553,1.440203,0.774519,0.677541,0.614816,0.627003
8,0.0348,1.575546,0.772686,0.644401,0.614697,0.613869
9,0.0246,1.614025,0.771769,0.646163,0.609198,0.612639
10,0.0162,1.741534,0.770852,0.634081,0.634577,0.618121


[I 2025-03-15 21:47:26,851] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.004839808217482779, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5267,1.196252,0.815765,0.770582,0.724583,0.735262
2,0.0303,1.395836,0.788268,0.758415,0.741247,0.731914
3,0.0157,1.644828,0.80385,0.807106,0.708115,0.737657
4,0.0173,1.808203,0.799267,0.729828,0.701658,0.699933
5,0.0152,2.085272,0.796517,0.761956,0.722687,0.723586
6,0.0126,2.251613,0.807516,0.768941,0.726906,0.729558
7,0.016,2.290892,0.776352,0.745019,0.689631,0.687135
8,0.0104,2.375856,0.800183,0.746849,0.71861,0.718931
9,0.0047,2.636715,0.791934,0.747901,0.726189,0.723061
10,0.0051,2.944983,0.780018,0.746667,0.733213,0.727737


[I 2025-03-15 21:56:40,840] Trial 41 finished with value: 0.7218493853974779 and parameters: {'learning_rate': 0.004839808217482779, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 41}. Best is trial 19 with value: 0.7806870868181661.


Trial 42 with params: {'learning_rate': 0.0011189905431405816, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8942,1.058402,0.804766,0.757725,0.679029,0.701143
2,0.0503,1.293885,0.808433,0.750818,0.699681,0.707448
3,0.015,1.321656,0.810266,0.714441,0.713489,0.703136
4,0.0108,1.520492,0.810266,0.787906,0.722529,0.736065
5,0.0051,1.593613,0.80385,0.764102,0.723374,0.729126
6,0.0051,1.5847,0.814849,0.786716,0.736536,0.74185
7,0.0044,1.640849,0.810266,0.740166,0.718512,0.715473
8,0.0049,1.697365,0.808433,0.743609,0.703739,0.707271
9,0.0036,1.63814,0.80385,0.758209,0.714002,0.720183
10,0.0016,1.783291,0.817599,0.769349,0.737364,0.740505


[I 2025-03-15 22:02:50,169] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.0010965118056325376, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 45}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.943,1.121046,0.785518,0.693116,0.642235,0.649992
2,0.0537,1.262353,0.802016,0.743986,0.699482,0.70558
3,0.018,1.422694,0.7956,0.715899,0.707377,0.697878
4,0.0087,1.531608,0.802933,0.774706,0.729621,0.731758
5,0.0075,1.629874,0.813016,0.802293,0.737905,0.754383
6,0.0068,1.80977,0.791017,0.768194,0.692132,0.708047
7,0.0045,1.629866,0.817599,0.755311,0.739654,0.735473
8,0.0024,1.854236,0.796517,0.762764,0.718907,0.717724
9,0.0035,1.578223,0.807516,0.764961,0.740511,0.74247
10,0.0028,1.795221,0.811182,0.791673,0.735349,0.745528


[I 2025-03-15 22:09:12,699] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.0043173978558411295, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5261,1.233242,0.802933,0.767007,0.720845,0.730126
2,0.0277,1.338448,0.797434,0.753016,0.710098,0.713186
3,0.0124,1.743015,0.8011,0.74767,0.715779,0.716496
4,0.013,1.825654,0.796517,0.773644,0.707966,0.723978
5,0.0216,2.008277,0.791934,0.756744,0.712766,0.715655
6,0.0115,2.248636,0.793767,0.774028,0.719515,0.734632
7,0.0093,2.274764,0.786434,0.79806,0.720925,0.742536
8,0.0059,2.332817,0.797434,0.811929,0.743976,0.761925
9,0.0047,2.48102,0.7956,0.797646,0.733794,0.748721
10,0.0065,2.780135,0.793767,0.792812,0.726556,0.742327


[I 2025-03-15 22:18:55,097] Trial 44 finished with value: 0.7259334915619076 and parameters: {'learning_rate': 0.0043173978558411295, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 23}. Best is trial 19 with value: 0.7806870868181661.


Trial 45 with params: {'learning_rate': 1.5109064595787985e-05, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3178,2.93973,0.304308,0.037632,0.056125,0.027541
2,2.6556,2.559529,0.373052,0.037455,0.077275,0.049208
3,2.3552,2.345682,0.406966,0.064177,0.088058,0.058995
4,2.1557,2.190674,0.446379,0.088852,0.103186,0.077316
5,1.9984,2.068536,0.477544,0.120242,0.118672,0.095003
6,1.8811,1.97429,0.505958,0.127737,0.136241,0.112269
7,1.7792,1.893883,0.512374,0.137788,0.142061,0.121093
8,1.6896,1.833936,0.530706,0.145528,0.157863,0.13825
9,1.6114,1.771657,0.546288,0.146695,0.164257,0.146791
10,1.5356,1.716865,0.56462,0.159302,0.180249,0.161671


[I 2025-03-15 22:22:00,209] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 0.0032657778777058616, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5896,1.020781,0.815765,0.795104,0.702515,0.730133
2,0.026,1.160432,0.820348,0.797844,0.756351,0.762144
3,0.0126,1.524855,0.796517,0.798576,0.737766,0.747018
4,0.0106,1.650283,0.810266,0.803501,0.754582,0.759073
5,0.0103,1.65094,0.819432,0.762781,0.734775,0.735776
6,0.0076,1.927336,0.809349,0.787128,0.738587,0.743998
7,0.0073,1.727998,0.806599,0.781096,0.717659,0.725957
8,0.0067,1.823162,0.808433,0.737461,0.728808,0.718111
9,0.0045,1.887214,0.79835,0.757646,0.752428,0.741428
10,0.002,1.999397,0.811182,0.775294,0.751346,0.74917


[I 2025-03-15 22:31:36,067] Trial 46 finished with value: 0.7572771278121868 and parameters: {'learning_rate': 0.0032657778777058616, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 40}. Best is trial 19 with value: 0.7806870868181661.


Trial 47 with params: {'learning_rate': 0.0025043993022139744, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 46}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6556,1.13212,0.817599,0.821515,0.734422,0.758817
2,0.0284,1.203225,0.810266,0.760346,0.731755,0.728336
3,0.0119,1.407204,0.817599,0.752886,0.749204,0.729663
4,0.0091,1.426811,0.819432,0.786441,0.75562,0.757773
5,0.0076,1.554104,0.810266,0.792702,0.749574,0.755836
6,0.0068,1.733708,0.821265,0.807082,0.755477,0.762535
7,0.0067,1.734116,0.7956,0.761487,0.724831,0.729046
8,0.004,1.92255,0.813932,0.741988,0.721484,0.717003
9,0.0023,2.073083,0.809349,0.747216,0.741308,0.729758
10,0.0031,2.188723,0.804766,0.74627,0.757166,0.735816


[I 2025-03-15 22:37:27,270] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.0032157946080445237, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5729,1.096209,0.814849,0.801707,0.723138,0.74597
2,0.0253,1.339103,0.810266,0.774839,0.729566,0.735954
3,0.0129,1.511765,0.809349,0.808704,0.740253,0.755386
4,0.0106,1.519016,0.808433,0.768355,0.746206,0.742789
5,0.0109,1.730672,0.815765,0.785065,0.760625,0.755823
6,0.0099,1.81348,0.786434,0.772557,0.735857,0.738426
7,0.0091,1.933161,0.813016,0.744828,0.735037,0.727198
8,0.0047,1.94977,0.817599,0.794726,0.757146,0.761586
9,0.0035,2.42602,0.804766,0.789841,0.758856,0.759046
10,0.0049,2.463793,0.7956,0.775123,0.742701,0.741482


[I 2025-03-15 22:46:09,523] Trial 48 finished with value: 0.7631644816454031 and parameters: {'learning_rate': 0.0032157946080445237, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 35}. Best is trial 19 with value: 0.7806870868181661.


Trial 49 with params: {'learning_rate': 0.004812290828879115, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7201,1.49451,0.779102,0.697117,0.644209,0.654791
2,0.0429,1.569944,0.784601,0.710243,0.707277,0.691249
3,0.0157,2.001243,0.790101,0.716976,0.688158,0.692284
4,0.0101,1.928314,0.797434,0.758435,0.710296,0.721644
5,0.0119,2.026078,0.7956,0.724547,0.72005,0.706281
6,0.011,2.317107,0.782768,0.697912,0.694477,0.677949
7,0.012,2.44065,0.784601,0.729569,0.734385,0.717182
8,0.0144,2.624539,0.790101,0.724935,0.704009,0.69679
9,0.0131,2.795415,0.776352,0.733484,0.713576,0.707064
10,0.0064,2.912161,0.779102,0.723846,0.711698,0.705308


[I 2025-03-15 22:49:09,526] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 0.0005808020967394468, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1886,1.059174,0.762603,0.557796,0.533883,0.532942
2,0.1308,1.351463,0.774519,0.681974,0.630503,0.638182
3,0.0371,1.436621,0.791934,0.719135,0.695923,0.695101
4,0.018,1.438989,0.7956,0.742056,0.685801,0.695911
5,0.0089,1.6262,0.7956,0.758849,0.721548,0.728739
6,0.0071,1.652405,0.796517,0.773422,0.707735,0.72202
7,0.0065,1.691241,0.806599,0.76567,0.693671,0.710149
8,0.0036,1.793813,0.806599,0.723403,0.708301,0.698165
9,0.0039,1.855975,0.787351,0.701769,0.696232,0.687232
10,0.005,1.756602,0.813932,0.802974,0.737688,0.749824


[I 2025-03-15 22:55:33,652] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.002217002917021402, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6711,1.074553,0.8011,0.734113,0.704972,0.701561
2,0.0287,1.23516,0.810266,0.794859,0.746513,0.752954
3,0.0116,1.376005,0.813016,0.766038,0.736924,0.734026
4,0.0071,1.453316,0.796517,0.760204,0.723843,0.724266
5,0.0088,1.612772,0.813016,0.733269,0.739384,0.711214
6,0.0068,1.649566,0.813932,0.736581,0.723556,0.714267
7,0.0064,1.814562,0.805683,0.780957,0.738603,0.737762
8,0.0028,1.907975,0.810266,0.76163,0.726963,0.730344
9,0.0041,1.797233,0.802016,0.749813,0.735103,0.725148
10,0.0038,1.850817,0.809349,0.788143,0.761673,0.759976


[I 2025-03-15 23:01:34,439] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 0.0015578723468632086, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.787,1.088221,0.793767,0.688067,0.667351,0.663139
2,0.0353,1.357946,0.796517,0.685916,0.672598,0.664395
3,0.012,1.479746,0.804766,0.770227,0.717623,0.725437
4,0.0091,1.410039,0.813932,0.780074,0.713222,0.731194
5,0.0073,1.482171,0.810266,0.78199,0.718087,0.729896
6,0.0049,1.663782,0.814849,0.763517,0.726393,0.728583
7,0.0065,1.660965,0.809349,0.768106,0.716466,0.723155
8,0.0046,1.678399,0.818515,0.757177,0.728961,0.723339
9,0.0034,1.709167,0.817599,0.779699,0.754307,0.746479
10,0.0016,1.802061,0.821265,0.810502,0.744724,0.75851


[I 2025-03-15 23:11:08,709] Trial 52 finished with value: 0.7341405339808367 and parameters: {'learning_rate': 0.0015578723468632086, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 43}. Best is trial 19 with value: 0.7806870868181661.


Trial 53 with params: {'learning_rate': 0.001411189747590599, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8109,1.157456,0.796517,0.707895,0.6522,0.66147
2,0.0403,1.2853,0.79835,0.749054,0.70646,0.70829
3,0.0132,1.470527,0.8011,0.728814,0.700929,0.695197
4,0.0079,1.558034,0.792851,0.745998,0.708482,0.711429
5,0.0066,1.577987,0.818515,0.747172,0.736783,0.726492
6,0.0054,1.582061,0.815765,0.76417,0.731475,0.737019
7,0.0061,1.679738,0.815765,0.785034,0.721258,0.738083
8,0.0035,1.804386,0.812099,0.76524,0.704954,0.7172
9,0.0031,1.748275,0.814849,0.806481,0.733825,0.749641
10,0.0036,1.713681,0.813016,0.71675,0.716705,0.704244


[I 2025-03-15 23:14:18,721] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 0.0031552331066603506, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5723,1.090962,0.807516,0.741731,0.699383,0.705045
2,0.0245,1.290519,0.814849,0.799585,0.744157,0.756378
3,0.0126,1.647597,0.818515,0.774746,0.736488,0.739484
4,0.0129,1.602949,0.811182,0.74846,0.735569,0.726711
5,0.0064,1.863471,0.811182,0.761899,0.752032,0.742341
6,0.0088,1.99133,0.797434,0.765442,0.734567,0.732894
7,0.0069,1.988629,0.802016,0.782351,0.740362,0.745149
8,0.0072,2.10853,0.800183,0.753907,0.741967,0.729024
9,0.005,2.060305,0.811182,0.744325,0.763109,0.743447
10,0.0021,2.099174,0.802933,0.738478,0.747632,0.731386


[I 2025-03-15 23:23:50,218] Trial 54 finished with value: 0.7687807434619329 and parameters: {'learning_rate': 0.0031552331066603506, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 34}. Best is trial 19 with value: 0.7806870868181661.


Trial 55 with params: {'learning_rate': 0.004789291216712974, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5317,1.134504,0.80385,0.780214,0.719781,0.737224
2,0.027,1.385307,0.80385,0.795524,0.751185,0.755962
3,0.0181,1.74357,0.799267,0.807035,0.719613,0.744535
4,0.0163,1.765528,0.805683,0.791878,0.743591,0.752148
5,0.0151,1.853372,0.800183,0.771461,0.741236,0.740858
6,0.0141,2.0041,0.780018,0.764844,0.724546,0.725028
7,0.012,2.25908,0.786434,0.735506,0.724168,0.706758
8,0.0088,2.185534,0.788268,0.801755,0.722601,0.740589
9,0.0087,2.463428,0.790101,0.785032,0.734118,0.732145
10,0.006,2.560427,0.796517,0.773858,0.740169,0.737067


[I 2025-03-15 23:29:48,960] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 0.0037353161988420685, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5609,1.115926,0.813016,0.765626,0.73121,0.737484
2,0.0265,1.320946,0.802933,0.76081,0.723084,0.72203
3,0.0136,1.544372,0.815765,0.76888,0.730555,0.73907
4,0.0122,1.540307,0.810266,0.756051,0.762017,0.741787
5,0.0102,1.892714,0.80385,0.802106,0.735452,0.751577
6,0.0062,1.996825,0.799267,0.766657,0.717864,0.728573
7,0.0133,2.048179,0.802933,0.771434,0.736361,0.732587
8,0.0073,2.36055,0.79835,0.781466,0.722014,0.734336
9,0.0035,2.415836,0.796517,0.787657,0.731247,0.745807
10,0.0034,2.36312,0.813016,0.796622,0.782186,0.775724


[I 2025-03-15 23:38:59,639] Trial 56 finished with value: 0.7637716899741291 and parameters: {'learning_rate': 0.0037353161988420685, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 21}. Best is trial 19 with value: 0.7806870868181661.


Trial 57 with params: {'learning_rate': 0.0015640645154086068, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7315,1.099603,0.80385,0.671892,0.660229,0.651417
2,0.0383,1.308985,0.802016,0.686455,0.676524,0.667303
3,0.0133,1.44074,0.8011,0.698183,0.68766,0.678087
4,0.01,1.502412,0.809349,0.737679,0.731076,0.720461
5,0.0055,1.586643,0.813016,0.765848,0.748143,0.745049
6,0.0066,1.64722,0.819432,0.715702,0.71842,0.706932
7,0.0046,1.680704,0.800183,0.751315,0.734119,0.731116
8,0.0047,1.735441,0.802016,0.751944,0.733462,0.721811
9,0.0042,1.747177,0.812099,0.76317,0.750869,0.741285
10,0.0029,1.787386,0.811182,0.759205,0.746736,0.736043


[I 2025-03-15 23:48:20,178] Trial 57 finished with value: 0.7457835631876044 and parameters: {'learning_rate': 0.0015640645154086068, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 12}. Best is trial 19 with value: 0.7806870868181661.


Trial 58 with params: {'learning_rate': 0.002846415951530443, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 13}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5714,1.209115,0.806599,0.800399,0.737224,0.755956
2,0.0273,1.294716,0.805683,0.761155,0.711282,0.714237
3,0.0103,1.335978,0.825848,0.802725,0.764796,0.769268
4,0.0117,1.523414,0.813016,0.796829,0.743445,0.750644
5,0.0091,1.507765,0.805683,0.752938,0.760497,0.740476
6,0.0078,1.71937,0.813932,0.744659,0.740482,0.722863
7,0.0064,1.667504,0.818515,0.765665,0.757739,0.736432
8,0.0046,1.814388,0.802016,0.762209,0.726739,0.725325
9,0.0041,1.905398,0.804766,0.722337,0.740706,0.714919
10,0.0025,1.963548,0.811182,0.764525,0.751003,0.739884


[I 2025-03-15 23:54:11,489] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.0013173549670040963, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8111,1.140518,0.7956,0.716117,0.649301,0.665693
2,0.0434,1.239679,0.806599,0.721813,0.696542,0.69639
3,0.0153,1.281978,0.824015,0.74743,0.720119,0.721259
4,0.0113,1.455167,0.80385,0.693913,0.693789,0.677267
5,0.0073,1.381966,0.819432,0.766558,0.731523,0.732055
6,0.0041,1.654705,0.816682,0.752869,0.732641,0.728024
7,0.0041,1.660718,0.811182,0.769866,0.723557,0.732798
8,0.0024,1.686347,0.813932,0.786608,0.726684,0.738061
9,0.0044,1.751586,0.802933,0.769291,0.734794,0.732925
10,0.0043,1.727998,0.821265,0.762146,0.726678,0.727027


[I 2025-03-16 00:03:15,201] Trial 59 finished with value: 0.7602292691862429 and parameters: {'learning_rate': 0.0013173549670040963, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 19}. Best is trial 19 with value: 0.7806870868181661.


Trial 60 with params: {'learning_rate': 0.0045620052854146165, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4872,1.221372,0.800183,0.772105,0.718311,0.726209
2,0.0235,1.394493,0.80385,0.765052,0.731898,0.734803
3,0.0183,1.691552,0.80385,0.790441,0.741042,0.742865
4,0.014,1.928778,0.7956,0.755985,0.719484,0.723824
5,0.0154,2.17118,0.793767,0.771598,0.736827,0.73168
6,0.0155,2.541677,0.787351,0.736485,0.721158,0.712129
7,0.0074,2.623646,0.791934,0.767169,0.759474,0.746551
8,0.0064,2.503084,0.8011,0.760114,0.760854,0.746771
9,0.0071,2.766578,0.800183,0.768661,0.759883,0.748972
10,0.0067,2.74273,0.804766,0.749538,0.769432,0.749489


[I 2025-03-16 00:09:04,504] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.0038899132825031563, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5879,1.220207,0.805683,0.743512,0.709787,0.71137
2,0.0278,1.380998,0.800183,0.76591,0.752393,0.740921
3,0.0125,1.585195,0.806599,0.780923,0.753148,0.749832
4,0.0111,1.820604,0.8011,0.763932,0.746841,0.736962
5,0.014,1.729567,0.80385,0.767864,0.755434,0.748092
6,0.0089,2.101213,0.799267,0.778204,0.748877,0.746263
7,0.0099,2.224273,0.807516,0.777907,0.743498,0.748661
8,0.0063,2.386351,0.783685,0.747165,0.738428,0.728142
9,0.0069,2.681557,0.785518,0.772362,0.730543,0.734227
10,0.0053,2.581175,0.79835,0.777028,0.712105,0.726233


[I 2025-03-16 00:12:17,018] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.004636856013627146, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.532,1.216097,0.808433,0.765974,0.73336,0.737941
2,0.0288,1.437812,0.814849,0.796466,0.758501,0.762019
3,0.0149,1.699006,0.794684,0.714021,0.690793,0.685426
4,0.0144,1.902047,0.80385,0.759307,0.72091,0.720736
5,0.0133,2.04622,0.800183,0.751264,0.704965,0.710344


[I 2025-03-16 00:13:47,170] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 0.0011182096270197718, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2397,1.138862,0.785518,0.614156,0.588026,0.587145
2,0.0789,1.302416,0.7956,0.675268,0.676651,0.660361
3,0.018,1.499752,0.802933,0.712388,0.679069,0.681937
4,0.0108,1.588408,0.8011,0.719486,0.713099,0.698122
5,0.0058,1.693162,0.814849,0.758101,0.732429,0.728819
6,0.0057,1.685766,0.809349,0.775257,0.752598,0.747315
7,0.0041,1.701731,0.816682,0.765059,0.747271,0.741826
8,0.0022,1.943495,0.7956,0.748862,0.708435,0.707183
9,0.0047,1.892575,0.800183,0.734411,0.709634,0.704696
10,0.0023,2.066549,0.80385,0.783472,0.733436,0.738609


[I 2025-03-16 00:19:46,020] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 1.6488779238415115e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2368,2.854861,0.306141,0.043125,0.057383,0.029822
2,2.6042,2.528733,0.373052,0.03727,0.077684,0.049346
3,2.3167,2.312783,0.414299,0.082696,0.091511,0.064299
4,2.1149,2.153831,0.463795,0.117573,0.111357,0.087882
5,1.9579,2.032401,0.486709,0.113449,0.123903,0.101337
6,1.8359,1.936095,0.502291,0.130052,0.134404,0.111187
7,1.7331,1.860435,0.526123,0.139689,0.153102,0.134191
8,1.6408,1.798206,0.541705,0.14515,0.163881,0.144548
9,1.5588,1.744028,0.551787,0.150558,0.171781,0.151372
10,1.478,1.68166,0.575619,0.195961,0.192009,0.175507


[I 2025-03-16 00:25:31,820] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.0015964572633617566, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7408,1.107321,0.807516,0.719323,0.636286,0.65431
2,0.0348,1.219469,0.814849,0.774446,0.685922,0.708297
3,0.0144,1.281486,0.813016,0.74616,0.708018,0.709474
4,0.0088,1.527191,0.807516,0.722739,0.696663,0.688664
5,0.0063,1.554418,0.809349,0.683063,0.707466,0.685507


[I 2025-03-16 00:27:04,997] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 0.001432433799324852, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7897,1.122424,0.810266,0.765718,0.701194,0.719327
2,0.0383,1.244233,0.812099,0.758853,0.724633,0.723833
3,0.0141,1.340277,0.810266,0.74256,0.689873,0.697277
4,0.009,1.415923,0.817599,0.789795,0.739105,0.747415
5,0.0042,1.614342,0.805683,0.762938,0.72442,0.721537
6,0.0059,1.546388,0.812099,0.778347,0.72276,0.736116
7,0.0047,1.574348,0.817599,0.79342,0.718618,0.739108
8,0.0048,1.714623,0.821265,0.786929,0.740718,0.747278
9,0.0033,1.71493,0.813932,0.77579,0.756735,0.752369
10,0.0025,1.637098,0.826764,0.827486,0.740057,0.762879


[I 2025-03-16 00:36:28,845] Trial 66 finished with value: 0.739053463048445 and parameters: {'learning_rate': 0.001432433799324852, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 33}. Best is trial 19 with value: 0.7806870868181661.


Trial 67 with params: {'learning_rate': 4.411913382725009e-05, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8021,2.306408,0.423465,0.107488,0.094699,0.068008
2,1.9627,1.908555,0.514207,0.149739,0.142833,0.118719
3,1.6087,1.680488,0.570119,0.1849,0.183804,0.168567
4,1.3413,1.503566,0.615949,0.263468,0.227855,0.214527
5,1.1267,1.39151,0.629698,0.276263,0.252298,0.243492


[I 2025-03-16 00:38:03,933] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.002537842239212602, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6045,1.16004,0.8011,0.802639,0.731336,0.747841
2,0.0259,1.183373,0.829514,0.774164,0.728087,0.737961
3,0.0146,1.361242,0.809349,0.788845,0.743458,0.749949
4,0.0094,1.495455,0.819432,0.76257,0.77132,0.754535
5,0.0084,1.545039,0.824015,0.805218,0.756079,0.765155
6,0.0049,1.610047,0.811182,0.7712,0.742447,0.745428
7,0.0066,1.815684,0.816682,0.777811,0.752067,0.750853
8,0.0033,1.801099,0.820348,0.796599,0.755829,0.762011
9,0.0045,1.852021,0.810266,0.782036,0.7496,0.75014
10,0.0034,1.86702,0.813016,0.753874,0.731925,0.730215


[I 2025-03-16 00:41:11,522] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.0045091958185453, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 16}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5058,1.235577,0.800183,0.770579,0.728244,0.731746
2,0.0261,1.506512,0.796517,0.749014,0.7369,0.726783
3,0.0149,1.845322,0.786434,0.717011,0.687105,0.679688
4,0.0175,1.907393,0.782768,0.713666,0.702201,0.696619
5,0.0117,2.398528,0.796517,0.786265,0.72963,0.735886
6,0.0143,2.186141,0.797434,0.758542,0.715743,0.720065
7,0.0086,2.433425,0.787351,0.768288,0.718082,0.725728
8,0.0061,2.458536,0.788268,0.764802,0.733398,0.729815
9,0.0067,2.711313,0.776352,0.780975,0.712505,0.731322
10,0.0063,2.649363,0.781852,0.748103,0.737152,0.726109


[I 2025-03-16 00:44:10,828] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.0001132869810080476, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2937,1.709868,0.566453,0.189681,0.171195,0.151088
2,1.2098,1.279189,0.675527,0.317108,0.313865,0.308547
3,0.7309,1.129673,0.727773,0.433937,0.416684,0.418419
4,0.4532,1.11754,0.736022,0.475478,0.459533,0.459737
5,0.2775,1.146393,0.745188,0.582083,0.509932,0.526104
6,0.1721,1.173448,0.756187,0.646688,0.591044,0.594176
7,0.1092,1.255997,0.761687,0.599373,0.601887,0.583881
8,0.0737,1.336251,0.752521,0.627894,0.604516,0.599501
9,0.0518,1.38483,0.753437,0.639939,0.607853,0.609077
10,0.0376,1.489606,0.751604,0.601107,0.603044,0.584271


[I 2025-03-16 00:50:16,405] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.0003468528694226627, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5205,1.069827,0.740605,0.436352,0.427001,0.421283
2,0.3168,1.128288,0.766269,0.581289,0.558376,0.559307
3,0.0903,1.307103,0.776352,0.665951,0.627941,0.634356
4,0.0382,1.363671,0.785518,0.696185,0.669152,0.670722
5,0.0193,1.535936,0.784601,0.655198,0.643357,0.629609
6,0.0122,1.585826,0.789184,0.722067,0.687531,0.689894
7,0.009,1.654986,0.783685,0.677009,0.637161,0.644306
8,0.0072,1.719799,0.782768,0.709026,0.664306,0.670529
9,0.0068,1.716043,0.786434,0.698405,0.691554,0.68092
10,0.0044,1.77595,0.794684,0.679669,0.670414,0.664362


[I 2025-03-16 00:56:02,855] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.000416585286554437, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4317,1.03081,0.738772,0.452226,0.439636,0.435242
2,0.2393,1.168752,0.778185,0.666201,0.617333,0.620328
3,0.0641,1.452429,0.76352,0.64273,0.625455,0.617643
4,0.0283,1.419842,0.788268,0.685145,0.675473,0.670764
5,0.0148,1.581732,0.784601,0.703879,0.63561,0.65292
6,0.0103,1.610155,0.785518,0.692795,0.657603,0.664295
7,0.0075,1.801662,0.789184,0.703149,0.671454,0.677036
8,0.0072,1.713741,0.790101,0.722375,0.691323,0.690055
9,0.0049,1.733894,0.790101,0.712505,0.67261,0.67897
10,0.0037,1.85496,0.807516,0.761099,0.686384,0.702112


[I 2025-03-16 01:01:55,654] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 0.0007843539245532565, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0258,1.075718,0.778185,0.628417,0.586183,0.591613
2,0.0842,1.225498,0.797434,0.717253,0.692642,0.686964
3,0.0244,1.428191,0.8011,0.692593,0.688608,0.676043
4,0.0128,1.460989,0.805683,0.75997,0.713313,0.718726
5,0.0083,1.58637,0.802016,0.74444,0.69213,0.70143


[I 2025-03-16 01:03:25,431] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.002706835374981287, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6717,1.244352,0.791934,0.726672,0.691721,0.695794
2,0.0279,1.263814,0.808433,0.759809,0.737401,0.731314
3,0.0124,1.457573,0.820348,0.790364,0.742782,0.752805
4,0.0119,1.54359,0.79835,0.725709,0.751829,0.717809
5,0.0076,1.585322,0.817599,0.805988,0.751286,0.763239
6,0.0043,1.79965,0.813932,0.746572,0.75181,0.73614
7,0.0037,1.814705,0.825848,0.772252,0.774417,0.760046
8,0.0105,1.794139,0.814849,0.779066,0.777239,0.755881
9,0.0041,1.865183,0.814849,0.761278,0.757692,0.746768
10,0.0026,2.047565,0.814849,0.770326,0.748115,0.742531


[I 2025-03-16 01:12:28,501] Trial 74 finished with value: 0.7596187307479334 and parameters: {'learning_rate': 0.002706835374981287, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 47}. Best is trial 19 with value: 0.7806870868181661.


Trial 75 with params: {'learning_rate': 0.0009993731259741677, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 6}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8827,1.0305,0.799267,0.676805,0.626716,0.638718
2,0.0626,1.168835,0.809349,0.738897,0.669773,0.683766
3,0.0184,1.387853,0.793767,0.733071,0.681934,0.690441
4,0.0098,1.542623,0.802933,0.756257,0.697588,0.707705
5,0.0084,1.552573,0.797434,0.738792,0.688451,0.696738
6,0.0061,1.667503,0.808433,0.760193,0.718144,0.724345
7,0.0056,1.633023,0.813932,0.760501,0.717276,0.720389
8,0.0026,1.752394,0.799267,0.763361,0.711667,0.720194
9,0.0042,1.782321,0.80385,0.739605,0.712069,0.707325
10,0.0039,1.789797,0.808433,0.753749,0.694163,0.704284


[I 2025-03-16 01:22:02,480] Trial 75 finished with value: 0.7333865869172268 and parameters: {'learning_rate': 0.0009993731259741677, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 6}. Best is trial 19 with value: 0.7806870868181661.


Trial 76 with params: {'learning_rate': 0.0018446689183870065, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7042,1.06049,0.809349,0.707638,0.680853,0.682391
2,0.031,1.267502,0.8011,0.774565,0.699313,0.717575
3,0.0126,1.454736,0.814849,0.80274,0.753662,0.763166
4,0.0081,1.493047,0.806599,0.716618,0.71199,0.703306
5,0.0058,1.588269,0.807516,0.710132,0.705371,0.691342
6,0.0059,1.520201,0.800183,0.774078,0.713167,0.724544
7,0.0066,1.732456,0.810266,0.735838,0.708079,0.703268
8,0.0038,1.647571,0.813016,0.76573,0.761931,0.742835
9,0.004,1.638451,0.810266,0.746808,0.731826,0.721229
10,0.0032,1.77153,0.816682,0.754363,0.75251,0.741918


[I 2025-03-16 01:31:32,790] Trial 76 finished with value: 0.73462361983125 and parameters: {'learning_rate': 0.0018446689183870065, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 29}. Best is trial 19 with value: 0.7806870868181661.


Trial 77 with params: {'learning_rate': 0.0026477002524539586, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 50}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6426,1.117415,0.804766,0.78897,0.716773,0.735494
2,0.0285,1.274194,0.8011,0.759353,0.734205,0.727084
3,0.0127,1.391259,0.79835,0.751411,0.748988,0.73259
4,0.0072,1.510181,0.813932,0.753437,0.733285,0.728998
5,0.0053,1.607774,0.813016,0.790449,0.748715,0.758462
6,0.0087,1.725212,0.802933,0.760774,0.720341,0.722452
7,0.0071,1.889125,0.802016,0.733311,0.71358,0.704852
8,0.0043,1.946022,0.799267,0.755148,0.730999,0.728385
9,0.0045,1.925496,0.809349,0.748503,0.725441,0.720691
10,0.0028,2.113319,0.811182,0.774683,0.746186,0.74323


[I 2025-03-16 01:40:43,055] Trial 77 finished with value: 0.770784060198191 and parameters: {'learning_rate': 0.0026477002524539586, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 50}. Best is trial 19 with value: 0.7806870868181661.


Trial 78 with params: {'learning_rate': 0.0015319294519992148, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 48}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7925,1.062109,0.80385,0.693611,0.672639,0.665509
2,0.0346,1.234498,0.813932,0.750804,0.7096,0.710857
3,0.0128,1.40811,0.811182,0.780369,0.718316,0.728686
4,0.0116,1.412974,0.802933,0.726481,0.703354,0.697832
5,0.0055,1.383381,0.824931,0.736687,0.734216,0.719352
6,0.0034,1.55649,0.812099,0.761489,0.722048,0.71704
7,0.006,1.557095,0.808433,0.77731,0.720864,0.725599
8,0.0046,1.70413,0.806599,0.779222,0.723453,0.731701
9,0.0047,1.560604,0.813016,0.755788,0.721467,0.726437
10,0.0032,1.66301,0.817599,0.765073,0.742108,0.736534


[I 2025-03-16 01:46:57,935] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.003127165535760602, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 49}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6132,1.149724,0.802016,0.7727,0.710763,0.726381
2,0.0253,1.279079,0.809349,0.804026,0.739704,0.755149
3,0.0141,1.436494,0.811182,0.778046,0.74342,0.749468
4,0.0094,1.612582,0.811182,0.786992,0.732971,0.743353
5,0.0083,1.813938,0.804766,0.771842,0.719407,0.730276
6,0.0084,1.914301,0.815765,0.810908,0.762799,0.769893
7,0.0086,1.971297,0.808433,0.770623,0.75918,0.75151
8,0.0055,2.286297,0.802933,0.7405,0.735584,0.722757
9,0.0059,2.277833,0.797434,0.739002,0.740897,0.724895
10,0.0045,2.294208,0.812099,0.739193,0.752128,0.730172


[I 2025-03-16 01:55:55,113] Trial 79 finished with value: 0.7571411818218152 and parameters: {'learning_rate': 0.003127165535760602, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 49}. Best is trial 19 with value: 0.7806870868181661.


Trial 80 with params: {'learning_rate': 0.00454107382657781, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5248,1.260192,0.804766,0.769683,0.723639,0.728362
2,0.0266,1.359199,0.810266,0.819443,0.729443,0.755166
3,0.0168,1.516072,0.796517,0.765989,0.736798,0.737082
4,0.0149,1.822288,0.793767,0.801707,0.717701,0.740466
5,0.0161,1.992714,0.7956,0.727004,0.734106,0.714638
6,0.0121,2.058625,0.802933,0.756343,0.729356,0.730921
7,0.0109,2.126899,0.791934,0.701325,0.713208,0.689461
8,0.0087,2.301969,0.786434,0.767917,0.760065,0.746396
9,0.008,2.637664,0.764436,0.721463,0.720743,0.708037
10,0.0059,2.701601,0.779102,0.758499,0.7332,0.73162


[I 2025-03-16 02:02:03,607] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.0022067795230689825, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 52}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6877,1.103256,0.811182,0.778815,0.706016,0.72339
2,0.0305,1.286455,0.8011,0.725407,0.697321,0.698228
3,0.0118,1.342068,0.821265,0.762742,0.733031,0.734899
4,0.0066,1.353006,0.823098,0.789253,0.748674,0.754099
5,0.0084,1.50118,0.824931,0.772628,0.723107,0.73483
6,0.0072,1.553381,0.812099,0.736226,0.711282,0.712155
7,0.0075,1.720475,0.806599,0.796843,0.72947,0.741376
8,0.0027,1.74117,0.827681,0.794308,0.759423,0.762296
9,0.0035,1.72003,0.811182,0.753223,0.740324,0.73029
10,0.0046,1.76623,0.814849,0.763011,0.761871,0.752634


[I 2025-03-16 02:11:22,133] Trial 81 finished with value: 0.7769366878193945 and parameters: {'learning_rate': 0.0022067795230689825, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 52}. Best is trial 19 with value: 0.7806870868181661.


Trial 82 with params: {'learning_rate': 0.0017627233994348367, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 50}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7798,1.186987,0.806599,0.750742,0.680182,0.695864
2,0.0347,1.320463,0.804766,0.781986,0.721041,0.731715
3,0.012,1.493667,0.804766,0.792351,0.708095,0.733399
4,0.009,1.422396,0.813932,0.766155,0.737367,0.737242
5,0.0063,1.588992,0.814849,0.737422,0.711538,0.709102
6,0.0049,1.619227,0.8011,0.768569,0.727263,0.732203
7,0.0047,1.59568,0.821265,0.818636,0.751161,0.76342
8,0.0043,1.809134,0.805683,0.803694,0.718196,0.735827
9,0.0057,1.711619,0.8011,0.757163,0.717181,0.716818
10,0.0014,1.86349,0.821265,0.784697,0.749627,0.753444


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-16 02:20:31,706] Trial 82 finished with value: 0.7471004032865922 and parameters: {'learning_rate': 0.0017627233994348367, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 50}. Best is trial 19 with value: 0.7806870868181661.


Trial 83 with params: {'learning_rate': 0.0025701735759136577, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 52}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6565,1.069156,0.817599,0.784808,0.716276,0.734715
2,0.0273,1.27453,0.809349,0.732487,0.690325,0.692598
3,0.0119,1.367179,0.816682,0.790008,0.714764,0.737532
4,0.0079,1.515084,0.818515,0.75485,0.750258,0.735316
5,0.0074,1.548078,0.827681,0.790305,0.776057,0.769474
6,0.0065,1.704843,0.809349,0.78042,0.738466,0.745117
7,0.0059,1.764575,0.80385,0.767699,0.713833,0.719719
8,0.0054,2.072232,0.807516,0.775494,0.733549,0.738413
9,0.0051,2.057418,0.806599,0.779067,0.742599,0.747341
10,0.005,2.166255,0.814849,0.783254,0.748392,0.750112


[I 2025-03-16 02:29:41,280] Trial 83 finished with value: 0.7462100349510705 and parameters: {'learning_rate': 0.0025701735759136577, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 52}. Best is trial 19 with value: 0.7806870868181661.


Trial 84 with params: {'learning_rate': 0.0007895222234840208, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 49}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0746,1.073251,0.780018,0.651073,0.593254,0.604034
2,0.0846,1.239172,0.797434,0.713807,0.682739,0.683064
3,0.0233,1.594499,0.777269,0.665147,0.673021,0.643603
4,0.0122,1.47058,0.805683,0.738799,0.701663,0.709393
5,0.009,1.514189,0.802016,0.718445,0.686956,0.691316
6,0.0063,1.696083,0.805683,0.777596,0.696648,0.715597
7,0.0062,1.725228,0.8011,0.797322,0.704038,0.727866
8,0.0037,1.793445,0.796517,0.740299,0.706519,0.701787
9,0.0051,1.745388,0.7956,0.715333,0.694027,0.686092
10,0.0033,1.846146,0.804766,0.762186,0.702221,0.715902


[I 2025-03-16 02:32:45,288] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.002362231153716838, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.657,1.103685,0.8011,0.729235,0.699926,0.702956
2,0.0286,1.207186,0.802016,0.751726,0.726821,0.719515
3,0.0115,1.275492,0.813932,0.73394,0.755073,0.731079
4,0.0075,1.434706,0.815765,0.767156,0.70635,0.718034
5,0.0067,1.64328,0.811182,0.746109,0.711675,0.711304


[I 2025-03-16 02:34:14,314] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 1.6562808358868146e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2834,2.87782,0.310724,0.045515,0.058183,0.030573
2,2.5958,2.506856,0.381302,0.038517,0.079954,0.051125
3,2.2956,2.290199,0.424381,0.086991,0.094609,0.06826
4,2.0936,2.134464,0.461045,0.114375,0.109605,0.085206
5,1.9362,2.012882,0.489459,0.111558,0.124702,0.101166


[I 2025-03-16 02:35:43,334] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.002648434421098528, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 50}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6406,1.097847,0.809349,0.777376,0.698646,0.717698
2,0.026,1.240956,0.811182,0.76281,0.724453,0.730039
3,0.0113,1.407766,0.802933,0.746217,0.729833,0.722685
4,0.0117,1.468175,0.806599,0.753745,0.717852,0.719876
5,0.0076,1.484698,0.830431,0.77972,0.749186,0.746992
6,0.0073,1.594558,0.817599,0.766177,0.722944,0.728355
7,0.0074,1.711655,0.813016,0.777969,0.721239,0.73389
8,0.0041,1.778031,0.820348,0.754041,0.737454,0.734386
9,0.003,1.931639,0.819432,0.757642,0.733725,0.738007
10,0.0013,2.024425,0.813932,0.741526,0.729287,0.727694


[I 2025-03-16 02:38:53,266] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 0.0006246160551086011, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1719,1.064042,0.769019,0.581025,0.53721,0.536851
2,0.1152,1.278997,0.791017,0.671004,0.641766,0.641005
3,0.0344,1.470261,0.786434,0.662208,0.659081,0.647713
4,0.0171,1.464259,0.797434,0.736882,0.694958,0.700163
5,0.0096,1.564349,0.810266,0.76503,0.700282,0.720399
6,0.006,1.594503,0.811182,0.776923,0.704637,0.724698
7,0.0062,1.808807,0.796517,0.731373,0.677319,0.686714
8,0.0055,1.84315,0.809349,0.743089,0.70974,0.713047
9,0.0055,1.797591,0.802016,0.731408,0.713866,0.710395
10,0.0023,1.784427,0.810266,0.739442,0.705989,0.708302


[I 2025-03-16 02:42:01,989] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 2.588158862083385e-05, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2494,2.86592,0.311641,0.038117,0.057898,0.029388
2,2.4624,2.302539,0.420715,0.065727,0.092279,0.063451
3,2.0417,2.039773,0.485793,0.095677,0.122483,0.097947
4,1.8122,1.862871,0.525206,0.14609,0.147367,0.124778
5,1.6256,1.72711,0.56462,0.161897,0.173597,0.15627


[I 2025-03-16 02:43:40,868] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 0.000320356882928083, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0613,1.402436,0.654445,0.267072,0.279179,0.26788
2,0.666,1.110556,0.746104,0.490015,0.479271,0.477934
3,0.1958,1.280119,0.776352,0.627411,0.599461,0.597005
4,0.0667,1.428604,0.784601,0.648185,0.623161,0.620734
5,0.0303,1.602972,0.786434,0.656968,0.668128,0.650166


[I 2025-03-16 02:45:11,923] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.0012512350404039272, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8442,1.076604,0.788268,0.693881,0.624682,0.642734
2,0.048,1.286005,0.800183,0.724899,0.699738,0.690413
3,0.0142,1.537559,0.804766,0.792195,0.715017,0.731116
4,0.0102,1.744732,0.787351,0.745059,0.681148,0.690072
5,0.0083,1.551949,0.805683,0.748441,0.733548,0.722789
6,0.004,1.618447,0.819432,0.802078,0.724307,0.742957
7,0.0047,1.724653,0.79835,0.75979,0.717061,0.714269
8,0.0042,1.810352,0.800183,0.742393,0.7,0.704059
9,0.0042,1.76382,0.805683,0.74598,0.727553,0.719292
10,0.003,1.686446,0.806599,0.751145,0.712621,0.715442


[I 2025-03-16 02:48:14,734] Trial 91 pruned. 


Trial 92 with params: {'learning_rate': 0.004130139437278195, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5518,1.257363,0.802016,0.771559,0.708289,0.722368
2,0.0287,1.307824,0.813932,0.756418,0.738146,0.73472
3,0.0126,1.582931,0.805683,0.75506,0.733498,0.725002
4,0.0096,1.647376,0.813932,0.767512,0.747954,0.746745
5,0.014,1.970692,0.8011,0.797502,0.758498,0.7637
6,0.0138,2.266355,0.7956,0.789521,0.74515,0.747668
7,0.0065,2.287778,0.806599,0.805942,0.723067,0.746906
8,0.0092,2.409674,0.797434,0.759224,0.692562,0.709598
9,0.0059,2.604941,0.80385,0.800319,0.759595,0.764063
10,0.0044,2.672607,0.802016,0.776627,0.745819,0.748961


[I 2025-03-16 02:56:57,499] Trial 92 finished with value: 0.7670412847258831 and parameters: {'learning_rate': 0.004130139437278195, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 38}. Best is trial 19 with value: 0.7806870868181661.


Trial 93 with params: {'learning_rate': 0.001554418044166106, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7779,1.112493,0.785518,0.706642,0.663704,0.668115
2,0.0365,1.305034,0.7956,0.75032,0.686508,0.69649
3,0.0137,1.333267,0.821265,0.764132,0.721261,0.72931
4,0.0078,1.490144,0.802933,0.736203,0.68958,0.694461
5,0.0055,1.58324,0.808433,0.713813,0.687663,0.682244


[I 2025-03-16 02:58:37,327] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.0023512300979480136, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6878,1.157462,0.817599,0.809405,0.745251,0.758156
2,0.0294,1.301661,0.807516,0.761912,0.744458,0.734887
3,0.0127,1.354267,0.814849,0.744717,0.7341,0.725014
4,0.009,1.518539,0.812099,0.743114,0.717929,0.7193
5,0.0086,1.557983,0.815765,0.792121,0.733435,0.740995
6,0.0063,1.489443,0.817599,0.741778,0.742171,0.726493
7,0.004,1.735802,0.805683,0.744852,0.73696,0.725475
8,0.0036,1.710078,0.816682,0.753029,0.752425,0.740087
9,0.0046,1.801093,0.809349,0.743607,0.748146,0.730792
10,0.0032,1.987553,0.816682,0.768342,0.730249,0.731303


[I 2025-03-16 03:07:35,408] Trial 94 finished with value: 0.7588083860060039 and parameters: {'learning_rate': 0.0023512300979480136, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 29}. Best is trial 19 with value: 0.7806870868181661.


Trial 95 with params: {'learning_rate': 0.002087290970067627, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7041,1.097282,0.800183,0.777741,0.706854,0.725452
2,0.0299,1.254153,0.815765,0.781982,0.733471,0.738164
3,0.0122,1.409262,0.813932,0.774938,0.731645,0.734252
4,0.0092,1.428788,0.813016,0.75991,0.727334,0.73214
5,0.0075,1.565944,0.817599,0.769805,0.753902,0.743434
6,0.007,1.592922,0.814849,0.759391,0.753737,0.745917
7,0.004,1.676153,0.820348,0.761801,0.760621,0.748636
8,0.0036,1.914156,0.799267,0.723845,0.711308,0.706785
9,0.0058,1.666498,0.817599,0.801264,0.742501,0.756605
10,0.0022,1.816112,0.807516,0.761452,0.734738,0.729952


[I 2025-03-16 03:10:27,078] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.0035263013975703594, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5952,1.090437,0.808433,0.777991,0.72521,0.738157
2,0.0263,1.330972,0.811182,0.806495,0.756559,0.764395
3,0.012,1.461594,0.812099,0.78085,0.723996,0.729474
4,0.0095,1.492178,0.804766,0.740737,0.745652,0.729362
5,0.0106,1.603238,0.815765,0.760676,0.720087,0.728002
6,0.0101,1.813917,0.808433,0.798265,0.7255,0.748131
7,0.0075,1.925949,0.802933,0.773645,0.747873,0.747914
8,0.0046,2.059999,0.802933,0.753124,0.749079,0.735727
9,0.0034,2.185959,0.802016,0.774755,0.769155,0.759491
10,0.005,2.283658,0.8011,0.749581,0.744088,0.730649


[I 2025-03-16 03:16:30,407] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 2.221033692171883e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1164,2.707786,0.337305,0.037025,0.06666,0.0413
2,2.4201,2.340553,0.412466,0.067666,0.090651,0.062741
3,2.1098,2.122479,0.466544,0.099847,0.11428,0.090948
4,1.9023,1.964963,0.503208,0.13125,0.137065,0.114843
5,1.7382,1.845935,0.532539,0.139967,0.156058,0.137076
6,1.6017,1.747488,0.55637,0.15819,0.174771,0.156177
7,1.4804,1.668298,0.577452,0.201152,0.195021,0.175164
8,1.3709,1.600971,0.590284,0.197635,0.203761,0.186392
9,1.2748,1.543427,0.606783,0.258021,0.22026,0.206565
10,1.1836,1.490316,0.619615,0.268767,0.235327,0.227929


[I 2025-03-16 03:19:39,365] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.0020516110465848038, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7298,1.153554,0.796517,0.731947,0.682031,0.693319
2,0.0312,1.306679,0.8011,0.75308,0.707738,0.710277
3,0.0133,1.462083,0.813016,0.760347,0.698533,0.713152
4,0.0069,1.36089,0.824931,0.791933,0.743934,0.75426
5,0.0047,1.60432,0.804766,0.762455,0.722386,0.719641
6,0.0057,1.728876,0.817599,0.821275,0.740023,0.764048
7,0.0046,1.701531,0.813016,0.783886,0.742148,0.748875
8,0.0077,1.791306,0.806599,0.739348,0.738537,0.724811
9,0.0057,1.819419,0.809349,0.748675,0.748809,0.732678
10,0.0036,1.821469,0.813932,0.751039,0.779448,0.749454


[I 2025-03-16 03:28:56,316] Trial 98 finished with value: 0.7585268027788473 and parameters: {'learning_rate': 0.0020516110465848038, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 43}. Best is trial 19 with value: 0.7806870868181661.


Trial 99 with params: {'learning_rate': 0.003993478869918261, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5574,1.152442,0.809349,0.767082,0.740115,0.738465
2,0.0257,1.310307,0.802933,0.800298,0.747194,0.761247
3,0.0149,1.699242,0.808433,0.758748,0.716893,0.722339
4,0.0101,1.861472,0.807516,0.795108,0.743501,0.751555
5,0.0158,1.968184,0.8011,0.788589,0.734631,0.747592
6,0.0108,2.11962,0.806599,0.798072,0.741172,0.759238
7,0.0055,2.153159,0.805683,0.795836,0.749079,0.756043
8,0.0085,2.428753,0.813016,0.815376,0.760489,0.774368
9,0.0075,2.225393,0.812099,0.783977,0.763305,0.761496
10,0.0067,2.541261,0.806599,0.772056,0.738856,0.738921


[I 2025-03-16 03:38:35,121] Trial 99 finished with value: 0.753093495641659 and parameters: {'learning_rate': 0.003993478869918261, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 39}. Best is trial 19 with value: 0.7806870868181661.


Trial 100 with params: {'learning_rate': 0.0022126899121816027, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6714,1.111583,0.809349,0.75739,0.706444,0.716784
2,0.0294,1.266589,0.815765,0.754456,0.699139,0.705222
3,0.0108,1.432797,0.804766,0.74248,0.744539,0.732343
4,0.0091,1.391253,0.820348,0.800807,0.73161,0.746433
5,0.0079,1.528661,0.813932,0.797827,0.764341,0.765682
6,0.0061,1.599044,0.813016,0.784043,0.772369,0.7684
7,0.0065,1.577897,0.817599,0.783205,0.771602,0.766489
8,0.0033,1.803116,0.811182,0.790581,0.767638,0.767067
9,0.0036,2.024221,0.80385,0.797986,0.76038,0.763357
10,0.005,1.901213,0.821265,0.801201,0.77934,0.776288


[I 2025-03-16 03:47:42,607] Trial 100 finished with value: 0.7579728607481221 and parameters: {'learning_rate': 0.0022126899121816027, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 38}. Best is trial 19 with value: 0.7806870868181661.


Trial 101 with params: {'learning_rate': 0.002625777042957208, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6426,1.091987,0.810266,0.752456,0.70032,0.711002
2,0.0255,1.328634,0.808433,0.781377,0.708393,0.725559
3,0.013,1.422375,0.816682,0.748741,0.727872,0.722355
4,0.0084,1.538664,0.800183,0.747434,0.74797,0.727466
5,0.0071,1.702967,0.79835,0.751758,0.741674,0.731623
6,0.0079,1.717294,0.805683,0.771818,0.749739,0.747836
7,0.0058,1.840675,0.802016,0.729566,0.736,0.712006
8,0.0047,1.989252,0.79835,0.757391,0.729546,0.73063
9,0.0056,2.064953,0.791934,0.732841,0.755129,0.72912
10,0.004,2.0813,0.809349,0.765892,0.762414,0.752055


[I 2025-03-16 03:57:04,426] Trial 101 finished with value: 0.7449149657399522 and parameters: {'learning_rate': 0.002625777042957208, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 51}. Best is trial 19 with value: 0.7806870868181661.


Trial 102 with params: {'learning_rate': 0.003648493451241164, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5538,1.211682,0.807516,0.754915,0.703115,0.713153
2,0.025,1.372914,0.807516,0.779026,0.718947,0.730048
3,0.0146,1.421391,0.808433,0.773332,0.742958,0.741719
4,0.0128,1.643047,0.797434,0.726819,0.70376,0.695693
5,0.013,1.91795,0.794684,0.778546,0.715464,0.727385
6,0.0082,1.921934,0.79835,0.780568,0.721429,0.73394
7,0.0061,2.021787,0.809349,0.769427,0.732114,0.7362
8,0.0058,2.199737,0.806599,0.750896,0.710684,0.711839
9,0.0057,2.090292,0.797434,0.764348,0.749067,0.73706
10,0.0055,2.398102,0.80385,0.799913,0.72758,0.747633


[I 2025-03-16 04:03:17,920] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 1.546855136785054e-05, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.346,3.005273,0.28506,0.021753,0.050391,0.029426
2,2.6838,2.565163,0.369386,0.037351,0.076267,0.048658
3,2.3456,2.330056,0.416132,0.087815,0.090999,0.063376
4,2.1329,2.169769,0.452796,0.104051,0.10621,0.081674
5,1.9756,2.046372,0.490376,0.104345,0.125638,0.10071


[I 2025-03-16 04:04:47,728] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.0028953685037744997, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6137,1.093704,0.821265,0.780089,0.730559,0.741267
2,0.0262,1.290268,0.814849,0.780726,0.740356,0.750814
3,0.0115,1.348916,0.813016,0.805259,0.73231,0.753891
4,0.0096,1.634192,0.806599,0.801307,0.743903,0.75444
5,0.0108,1.691843,0.812099,0.802831,0.760485,0.767441
6,0.0109,1.720664,0.80385,0.796212,0.748464,0.754992
7,0.0063,1.788468,0.816682,0.799075,0.784129,0.778746
8,0.0043,1.892754,0.809349,0.793341,0.750296,0.759795
9,0.0041,1.995092,0.804766,0.768389,0.728435,0.734387
10,0.0047,1.959946,0.808433,0.763637,0.741215,0.74258


[I 2025-03-16 04:14:12,273] Trial 104 finished with value: 0.7540488954758139 and parameters: {'learning_rate': 0.0028953685037744997, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 38}. Best is trial 19 with value: 0.7806870868181661.


Trial 105 with params: {'learning_rate': 0.0005102835177659813, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.343,1.026142,0.75527,0.509142,0.487566,0.483912
2,0.167,1.250231,0.782768,0.660746,0.632719,0.634481
3,0.0451,1.42399,0.783685,0.688725,0.658298,0.657637
4,0.0199,1.369895,0.796517,0.720198,0.665453,0.673085
5,0.0126,1.518762,0.802933,0.75937,0.700006,0.714437
6,0.0072,1.560712,0.797434,0.715241,0.679331,0.680756
7,0.0058,1.773433,0.79835,0.722247,0.68965,0.689032
8,0.0061,1.67754,0.813932,0.747104,0.724058,0.723749
9,0.0045,1.808159,0.793767,0.727634,0.712101,0.698301
10,0.0048,1.775059,0.808433,0.757889,0.701068,0.711861


[I 2025-03-16 04:17:12,298] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 0.002986083670397209, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6022,1.240374,0.799267,0.76324,0.69658,0.711566
2,0.0277,1.365325,0.815765,0.746072,0.732261,0.723272
3,0.012,1.479966,0.804766,0.76067,0.734868,0.732206
4,0.01,1.566911,0.804766,0.756601,0.740608,0.734417
5,0.0124,1.680939,0.805683,0.732527,0.737354,0.720333
6,0.0062,1.875038,0.789184,0.739696,0.693244,0.696536
7,0.0059,1.886423,0.805683,0.762917,0.738597,0.734191
8,0.0061,2.047712,0.806599,0.766432,0.725716,0.729526
9,0.005,2.166548,0.812099,0.75513,0.736675,0.732169
10,0.0035,2.178366,0.806599,0.751949,0.724399,0.721744


[I 2025-03-16 04:20:07,892] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 0.0008564833515805929, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.968,1.108152,0.783685,0.653389,0.597359,0.60707
2,0.0719,1.236794,0.808433,0.734293,0.693286,0.697574
3,0.0207,1.470955,0.790101,0.663809,0.683194,0.660897
4,0.0128,1.442403,0.810266,0.771244,0.721485,0.7294
5,0.0081,1.555255,0.80385,0.739907,0.72232,0.716136


[I 2025-03-16 04:21:46,718] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 0.0033625076529504266, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5783,1.165199,0.806599,0.762532,0.725035,0.731669
2,0.0271,1.309149,0.807516,0.779661,0.723695,0.737058
3,0.0104,1.493657,0.79835,0.761923,0.732159,0.731334
4,0.0123,1.513694,0.800183,0.752614,0.735471,0.728593
5,0.0118,1.823436,0.811182,0.784338,0.740513,0.746189
6,0.0088,1.749133,0.805683,0.712848,0.706233,0.700523
7,0.005,2.087534,0.806599,0.7841,0.74094,0.744199
8,0.0044,2.086207,0.80385,0.779369,0.726737,0.735501
9,0.0041,2.264886,0.796517,0.749071,0.720315,0.715148
10,0.005,2.378127,0.788268,0.760372,0.725579,0.725537


[I 2025-03-16 04:24:56,438] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.0036884355504540876, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5489,1.157648,0.813932,0.790822,0.734065,0.743687
2,0.0254,1.385235,0.796517,0.787712,0.738227,0.746841
3,0.0143,1.508757,0.797434,0.781184,0.770076,0.757736
4,0.011,1.672403,0.79835,0.792265,0.727634,0.739037
5,0.0107,1.70787,0.813932,0.803009,0.760029,0.769857
6,0.0089,1.845112,0.818515,0.802434,0.76497,0.770263
7,0.0109,2.143649,0.789184,0.771433,0.713471,0.724189
8,0.0044,2.19749,0.819432,0.792613,0.746503,0.751273
9,0.0053,2.182256,0.802016,0.781805,0.778792,0.767431
10,0.0062,2.415268,0.802933,0.779086,0.737246,0.74377


[I 2025-03-16 04:31:01,401] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.0037799196115824783, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5934,1.248291,0.804766,0.765201,0.714107,0.717717
2,0.0273,1.274386,0.813932,0.799549,0.76657,0.769563
3,0.0127,1.437414,0.808433,0.799766,0.750768,0.765239
4,0.0127,1.728881,0.794684,0.733376,0.70683,0.710102
5,0.012,1.88594,0.80385,0.779668,0.720078,0.729011
6,0.0103,2.182077,0.797434,0.78692,0.743835,0.75025
7,0.0093,2.169924,0.790101,0.76653,0.731271,0.725625
8,0.0075,2.168517,0.785518,0.771551,0.748448,0.745167
9,0.007,2.263099,0.8011,0.7746,0.769182,0.753631
10,0.0052,2.348547,0.799267,0.769728,0.741813,0.732243


[I 2025-03-16 04:40:26,148] Trial 110 finished with value: 0.7418734436049834 and parameters: {'learning_rate': 0.0037799196115824783, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 33}. Best is trial 19 with value: 0.7806870868181661.


Trial 111 with params: {'learning_rate': 0.0023851635344360575, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7394,1.215617,0.808433,0.731311,0.695269,0.69634
2,0.0303,1.223879,0.823098,0.766687,0.737393,0.734871
3,0.0112,1.341917,0.809349,0.734063,0.710907,0.712661
4,0.0077,1.494444,0.814849,0.721747,0.715416,0.703288
5,0.0057,1.555916,0.821265,0.788907,0.753846,0.752723
6,0.0053,1.796909,0.810266,0.740867,0.746463,0.726545
7,0.0082,1.936233,0.80385,0.7346,0.709713,0.704159
8,0.0023,1.999563,0.810266,0.751186,0.758941,0.736962
9,0.0035,2.039221,0.802933,0.789455,0.736845,0.742489
10,0.0041,2.086345,0.814849,0.780868,0.747322,0.746792


[I 2025-03-16 04:49:43,140] Trial 111 finished with value: 0.7515402243179732 and parameters: {'learning_rate': 0.0023851635344360575, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 47}. Best is trial 19 with value: 0.7806870868181661.


Trial 112 with params: {'learning_rate': 0.001387721221636639, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 8}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7865,1.084017,0.790101,0.679707,0.650671,0.650747
2,0.0421,1.302979,0.8011,0.779152,0.690885,0.707775
3,0.0154,1.445117,0.807516,0.723825,0.734691,0.714008
4,0.0106,1.379039,0.813932,0.766866,0.714049,0.722495
5,0.0054,1.613102,0.807516,0.78219,0.702341,0.718468


[I 2025-03-16 04:51:10,346] Trial 112 pruned. 


Trial 113 with params: {'learning_rate': 0.0006793685322802032, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 13}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0734,1.042871,0.779102,0.590231,0.551201,0.549744
2,0.1008,1.230018,0.79835,0.720847,0.6873,0.692939
3,0.0291,1.50061,0.783685,0.682798,0.673259,0.662723
4,0.0142,1.442902,0.805683,0.799776,0.728206,0.741341
5,0.0089,1.528327,0.799267,0.759583,0.708008,0.71104
6,0.0066,1.535076,0.807516,0.74594,0.704148,0.71045
7,0.0067,1.731436,0.791017,0.741089,0.680355,0.686779
8,0.005,1.768113,0.7956,0.729182,0.691442,0.695361
9,0.0045,1.686587,0.800183,0.7698,0.710164,0.720724
10,0.0029,1.681479,0.814849,0.761987,0.739277,0.738103


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-16 04:59:53,995] Trial 113 finished with value: 0.744166371417093 and parameters: {'learning_rate': 0.0006793685322802032, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 13}. Best is trial 19 with value: 0.7806870868181661.


Trial 114 with params: {'learning_rate': 0.0008402196133716061, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 46}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0741,1.05523,0.791017,0.62459,0.592565,0.593537
2,0.0798,1.272383,0.79835,0.758677,0.704808,0.718664
3,0.023,1.450417,0.7956,0.743609,0.715324,0.712169
4,0.0117,1.4754,0.802933,0.770989,0.713925,0.723748
5,0.008,1.544398,0.8011,0.779672,0.72314,0.734701
6,0.005,1.66179,0.80385,0.750103,0.723551,0.716292
7,0.0053,1.66045,0.802933,0.766172,0.691063,0.711734
8,0.0032,1.698101,0.815765,0.829268,0.725356,0.757387
9,0.0037,1.908072,0.796517,0.772095,0.708995,0.722484
10,0.0037,1.737675,0.80385,0.754786,0.719289,0.718201


[I 2025-03-16 05:03:02,495] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.0033169529958899015, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 45}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6359,1.132063,0.804766,0.797306,0.739554,0.752533
2,0.0271,1.331634,0.815765,0.777735,0.756147,0.753688
3,0.0122,1.548438,0.817599,0.772038,0.764275,0.757089
4,0.0099,1.526313,0.825848,0.803899,0.781154,0.779498
5,0.0078,1.782064,0.80385,0.777103,0.750607,0.753861
6,0.0107,1.888865,0.804766,0.76371,0.749614,0.743976
7,0.0081,2.06268,0.808433,0.746453,0.760952,0.744137
8,0.0063,2.078328,0.805683,0.787823,0.763041,0.761676
9,0.0077,2.066631,0.812099,0.794219,0.78103,0.774874
10,0.0037,2.294067,0.802016,0.804454,0.769049,0.77522


[I 2025-03-16 05:12:18,796] Trial 115 finished with value: 0.7771187132538426 and parameters: {'learning_rate': 0.0033169529958899015, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 45}. Best is trial 19 with value: 0.7806870868181661.


Trial 116 with params: {'learning_rate': 1.2747319607917617e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3394,2.994658,0.299725,0.018828,0.054568,0.027772
2,2.7371,2.649426,0.342805,0.035324,0.068077,0.042434
3,2.4593,2.443501,0.391384,0.059881,0.083219,0.054317
4,2.2662,2.294865,0.417965,0.080051,0.092179,0.065391
5,2.1149,2.175611,0.456462,0.098298,0.107617,0.08201


[I 2025-03-16 05:13:58,091] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 0.004627051389314766, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5806,1.247037,0.797434,0.800846,0.714321,0.73784
2,0.0286,1.368652,0.79835,0.737174,0.728416,0.7186
3,0.0133,1.657419,0.805683,0.793661,0.744936,0.755667
4,0.017,1.805018,0.792851,0.741154,0.713211,0.711163
5,0.0173,1.97868,0.80385,0.794286,0.740956,0.753783
6,0.0092,2.225521,0.794684,0.762226,0.708556,0.718509
7,0.011,2.486995,0.794684,0.778387,0.752107,0.746481
8,0.0106,2.415575,0.790101,0.756045,0.74202,0.735476
9,0.0119,2.308925,0.775435,0.73342,0.731013,0.718622
10,0.0095,2.379515,0.784601,0.755676,0.743785,0.732871


[I 2025-03-16 05:23:16,546] Trial 117 finished with value: 0.7471896432492326 and parameters: {'learning_rate': 0.004627051389314766, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 53}. Best is trial 19 with value: 0.7806870868181661.


Trial 118 with params: {'learning_rate': 0.0008247949158814486, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0522,1.091698,0.781852,0.657268,0.597186,0.607642
2,0.078,1.278944,0.794684,0.710173,0.678866,0.678025
3,0.023,1.604577,0.775435,0.68694,0.687577,0.672544
4,0.0126,1.385111,0.809349,0.761996,0.710884,0.72026
5,0.0077,1.483874,0.813016,0.731643,0.692721,0.694624
6,0.0052,1.803171,0.794684,0.730715,0.679026,0.681067
7,0.0053,1.749407,0.804766,0.753207,0.701779,0.713087
8,0.0037,1.760035,0.809349,0.73382,0.696185,0.693799
9,0.0046,1.708084,0.792851,0.758106,0.712751,0.715814
10,0.0024,1.674952,0.812099,0.757311,0.720774,0.718889


[I 2025-03-16 05:26:16,634] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.003705731029281801, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 44}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.593,1.167388,0.796517,0.801873,0.72036,0.744832
2,0.028,1.256231,0.815765,0.753389,0.740759,0.730617
3,0.014,1.315314,0.821265,0.775441,0.776219,0.763622
4,0.01,1.599221,0.809349,0.773759,0.754127,0.749239
5,0.011,1.866156,0.805683,0.785878,0.745894,0.748768
6,0.0115,1.784019,0.808433,0.788693,0.74997,0.75478
7,0.0098,2.030862,0.810266,0.810521,0.764619,0.767938
8,0.0076,2.20265,0.806599,0.760324,0.727197,0.722587
9,0.0062,2.13826,0.810266,0.785807,0.750159,0.750918
10,0.0042,2.293909,0.818515,0.778514,0.75441,0.750657


[I 2025-03-16 05:35:04,303] Trial 119 finished with value: 0.743653206476031 and parameters: {'learning_rate': 0.003705731029281801, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 44}. Best is trial 19 with value: 0.7806870868181661.


Trial 120 with params: {'learning_rate': 0.0025450047366271157, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6197,1.059031,0.802933,0.733416,0.682183,0.695746
2,0.0271,1.21024,0.802933,0.789916,0.736341,0.75004
3,0.0111,1.440823,0.805683,0.776413,0.734891,0.741412
4,0.0087,1.579811,0.806599,0.780154,0.717986,0.731323
5,0.0076,1.60734,0.808433,0.744323,0.715363,0.71397


[I 2025-03-16 05:36:31,529] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 0.004220661238506302, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.582,1.073529,0.818515,0.806175,0.73103,0.752955
2,0.026,1.327156,0.806599,0.771437,0.72259,0.728369
3,0.0154,1.616129,0.8011,0.757889,0.7255,0.730209
4,0.0113,1.708858,0.791934,0.76147,0.738696,0.733915
5,0.0174,1.732202,0.797434,0.787691,0.739753,0.748982
6,0.0153,1.860283,0.79835,0.774609,0.734254,0.734165
7,0.0088,2.204323,0.791934,0.772318,0.728498,0.731184
8,0.0063,2.219417,0.805683,0.800926,0.754917,0.767434
9,0.0051,2.517092,0.8011,0.795048,0.748704,0.754267
10,0.0072,2.629824,0.797434,0.763639,0.754536,0.745691


[I 2025-03-16 05:45:40,695] Trial 121 finished with value: 0.7876336386869527 and parameters: {'learning_rate': 0.004220661238506302, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 51}. Best is trial 121 with value: 0.7876336386869527.


Trial 122 with params: {'learning_rate': 0.002335593724581267, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6931,1.164542,0.804766,0.750144,0.711365,0.713222
2,0.0283,1.230953,0.811182,0.806684,0.727538,0.746012
3,0.0106,1.445545,0.810266,0.786299,0.724212,0.736813
4,0.0079,1.440493,0.816682,0.760327,0.713816,0.725242
5,0.0086,1.680683,0.805683,0.74517,0.738279,0.720458
6,0.0071,1.90296,0.792851,0.734483,0.702618,0.701058
7,0.0065,1.858256,0.807516,0.747488,0.724282,0.718965
8,0.0063,1.858222,0.809349,0.754631,0.720712,0.718017
9,0.0029,2.019035,0.805683,0.730279,0.726461,0.706624
10,0.0028,1.945802,0.806599,0.738813,0.730551,0.717276


[I 2025-03-16 05:48:30,351] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.004989152508024512, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5251,1.149554,0.810266,0.781034,0.738554,0.746183
2,0.0273,1.457712,0.794684,0.778628,0.734272,0.736733
3,0.0156,1.781025,0.7956,0.775241,0.752481,0.748254
4,0.019,1.891945,0.788268,0.767299,0.715346,0.717548
5,0.0212,2.102953,0.791934,0.749981,0.731693,0.724515
6,0.0137,2.282921,0.794684,0.792896,0.736193,0.747782
7,0.0096,2.445074,0.790101,0.754485,0.735766,0.733428
8,0.0108,2.553363,0.796517,0.772554,0.744445,0.739868
9,0.0094,2.848495,0.783685,0.793059,0.732144,0.741493
10,0.0073,2.895748,0.796517,0.779705,0.745398,0.746255


[I 2025-03-16 05:54:40,518] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.0009468908403514332, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0597,1.049887,0.786434,0.630059,0.598152,0.602788
2,0.0692,1.227908,0.802933,0.752455,0.702912,0.711329
3,0.0194,1.449637,0.79835,0.718807,0.707672,0.697181
4,0.0124,1.421271,0.813016,0.777337,0.72636,0.732841
5,0.0071,1.51442,0.812099,0.733043,0.702986,0.699977
6,0.0055,1.605861,0.815765,0.772781,0.736235,0.742787
7,0.0038,1.680212,0.813016,0.746918,0.72146,0.718651
8,0.0044,1.71658,0.802933,0.784183,0.702144,0.719529
9,0.0036,1.621006,0.817599,0.748754,0.752365,0.741313
10,0.0016,1.746966,0.809349,0.715387,0.711259,0.699624


[I 2025-03-16 06:00:59,396] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 0.002665221640543764, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 49}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6602,1.218119,0.810266,0.768938,0.707916,0.722213
2,0.0275,1.294481,0.800183,0.762211,0.747074,0.741919
3,0.0117,1.415153,0.823098,0.771113,0.750307,0.747176
4,0.0093,1.536784,0.810266,0.787206,0.733807,0.744901
5,0.0093,1.614126,0.8011,0.793849,0.706748,0.730228
6,0.0058,1.75812,0.815765,0.748363,0.734533,0.729595
7,0.0062,2.142939,0.796517,0.759642,0.730693,0.728771
8,0.0075,1.987472,0.808433,0.755061,0.748711,0.735341
9,0.0079,2.090396,0.792851,0.695103,0.741212,0.698249
10,0.0032,2.278643,0.799267,0.73353,0.743197,0.724494


[I 2025-03-16 06:04:05,969] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.004496520300815876, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 48}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5511,1.167415,0.810266,0.808958,0.739783,0.760149
2,0.026,1.442445,0.810266,0.765971,0.732932,0.729795
3,0.0154,1.521014,0.808433,0.76723,0.738954,0.741682
4,0.0143,1.731466,0.809349,0.757831,0.734697,0.728176
5,0.019,1.84909,0.809349,0.724937,0.761125,0.722867
6,0.0105,1.879932,0.808433,0.748926,0.723919,0.718314
7,0.0098,2.030764,0.810266,0.767563,0.762864,0.742567
8,0.0091,2.325412,0.805683,0.765271,0.727175,0.73054
9,0.0085,2.387487,0.796517,0.762378,0.724818,0.730042
10,0.0075,2.519383,0.802933,0.765001,0.738201,0.735676


[I 2025-03-16 06:13:09,689] Trial 126 finished with value: 0.7727121390987568 and parameters: {'learning_rate': 0.004496520300815876, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 48}. Best is trial 121 with value: 0.7876336386869527.


Trial 127 with params: {'learning_rate': 0.0044589524766823685, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 44}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.58,1.197655,0.813932,0.795131,0.743191,0.754576
2,0.0288,1.282394,0.80385,0.768499,0.748291,0.74553
3,0.0141,1.416302,0.813016,0.779245,0.758379,0.755989
4,0.0157,1.72572,0.807516,0.763067,0.737831,0.731107
5,0.0156,1.833207,0.805683,0.743055,0.743156,0.727137
6,0.0122,2.026461,0.791934,0.748698,0.743256,0.728954
7,0.0099,2.251047,0.792851,0.763083,0.722539,0.719553
8,0.0077,2.373842,0.79835,0.754097,0.731094,0.726948
9,0.0071,2.488371,0.799267,0.768241,0.73136,0.730021
10,0.0061,2.469054,0.788268,0.73005,0.722747,0.704203


[I 2025-03-16 06:16:04,927] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 0.002682014624647669, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 49}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6473,1.109971,0.808433,0.794145,0.746815,0.754177
2,0.0261,1.246105,0.817599,0.800128,0.763053,0.7628
3,0.0113,1.326495,0.814849,0.793217,0.771144,0.767207
4,0.0087,1.393942,0.814849,0.782049,0.739467,0.746031
5,0.0079,1.469892,0.812099,0.802606,0.773988,0.775347
6,0.0079,1.61678,0.813932,0.815889,0.770778,0.781642
7,0.0061,1.86959,0.802016,0.777651,0.768628,0.757999
8,0.0055,1.895859,0.813016,0.800627,0.747125,0.759157
9,0.0041,1.838492,0.819432,0.797616,0.743379,0.755388
10,0.0039,1.868148,0.812099,0.763578,0.736735,0.734645


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-16 06:22:08,096] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.00476707047498301, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5457,1.174066,0.817599,0.783878,0.755825,0.755299
2,0.0268,1.425602,0.796517,0.777077,0.738996,0.741216
3,0.0186,1.79937,0.789184,0.758791,0.707655,0.716894
4,0.0175,1.760644,0.794684,0.749606,0.730554,0.730515
5,0.0173,1.81086,0.8011,0.752643,0.734002,0.733008
6,0.0119,2.261033,0.796517,0.767401,0.712303,0.716455
7,0.0123,2.308676,0.804766,0.819265,0.735783,0.7561
8,0.006,2.477633,0.797434,0.774785,0.724652,0.73181
9,0.0057,2.682333,0.791017,0.787436,0.724204,0.739604
10,0.0111,3.080648,0.791017,0.702572,0.676327,0.672076


[I 2025-03-16 06:24:58,348] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.004601809708159433, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5459,1.139343,0.807516,0.746162,0.711204,0.71768
2,0.0265,1.45693,0.792851,0.759857,0.717256,0.721953
3,0.0179,1.623126,0.785518,0.778271,0.728306,0.736127
4,0.0154,1.856727,0.790101,0.759417,0.706571,0.7145
5,0.0152,2.075061,0.778185,0.74316,0.715889,0.711756
6,0.0109,2.26381,0.780018,0.784465,0.716262,0.726071
7,0.0097,2.190032,0.791017,0.717853,0.728931,0.709924
8,0.0087,2.217989,0.794684,0.722448,0.712421,0.701034
9,0.0071,2.255877,0.789184,0.728089,0.723374,0.708519
10,0.0062,2.486188,0.800183,0.73804,0.71997,0.712738


[I 2025-03-16 06:31:05,456] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.004893565186094253, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 44}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5333,1.253825,0.807516,0.754422,0.722384,0.725383
2,0.0304,1.48404,0.7956,0.7835,0.766944,0.757681
3,0.015,1.588595,0.802933,0.780626,0.7259,0.74151
4,0.0169,1.761914,0.791934,0.78375,0.725705,0.736049
5,0.0133,1.910941,0.808433,0.789954,0.74002,0.74651
6,0.0146,1.999731,0.80385,0.755466,0.706687,0.713333
7,0.0128,2.361475,0.786434,0.747663,0.705003,0.706643
8,0.012,2.329497,0.791017,0.795455,0.714753,0.735623
9,0.0094,2.791998,0.796517,0.770264,0.749069,0.745367
10,0.0077,2.999153,0.794684,0.81401,0.734723,0.755555


[I 2025-03-16 06:37:09,735] Trial 131 pruned. 


Trial 132 with params: {'learning_rate': 0.0013299708965319012, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8154,1.10861,0.790101,0.672959,0.648228,0.64483
2,0.0434,1.318952,0.8011,0.764551,0.706493,0.712604
3,0.0144,1.416474,0.802016,0.753269,0.724892,0.723978
4,0.0107,1.389085,0.817599,0.792249,0.720234,0.735122
5,0.0061,1.601069,0.807516,0.755386,0.711098,0.719018
6,0.0044,1.559247,0.815765,0.767664,0.725039,0.734499
7,0.0056,1.473872,0.823098,0.798356,0.750989,0.756834
8,0.004,1.599536,0.822181,0.78656,0.746124,0.753796
9,0.005,1.696292,0.811182,0.772858,0.743869,0.743329
10,0.0016,1.796368,0.807516,0.753865,0.734284,0.724153


[I 2025-03-16 06:45:59,791] Trial 132 finished with value: 0.748704672838971 and parameters: {'learning_rate': 0.0013299708965319012, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 21}. Best is trial 121 with value: 0.7876336386869527.


Trial 133 with params: {'learning_rate': 0.0014056851167670637, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 11}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.845,1.074608,0.791017,0.63281,0.628548,0.618912
2,0.0449,1.268657,0.819432,0.739043,0.70671,0.711185
3,0.0151,1.398036,0.807516,0.661345,0.690787,0.660806
4,0.0089,1.476848,0.822181,0.743926,0.712482,0.716732
5,0.0051,1.612704,0.804766,0.687821,0.71993,0.69206
6,0.0052,1.714244,0.813932,0.748298,0.713478,0.712814
7,0.005,1.700762,0.814849,0.745501,0.717537,0.716316
8,0.004,1.915038,0.794684,0.691482,0.68972,0.675591
9,0.0043,1.608079,0.815765,0.754884,0.723603,0.728386
10,0.0027,1.780071,0.817599,0.776329,0.72521,0.735716


[I 2025-03-16 06:55:11,101] Trial 133 finished with value: 0.7599352197764755 and parameters: {'learning_rate': 0.0014056851167670637, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 11}. Best is trial 121 with value: 0.7876336386869527.


Trial 134 with params: {'learning_rate': 0.0011640256458428468, 'weight_decay': 0.008, 'adam_beta1': 0.97, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0526,1.085676,0.797434,0.648407,0.636144,0.628938
2,0.0627,1.331517,0.807516,0.760987,0.703076,0.716584
3,0.0169,1.398715,0.807516,0.717422,0.706377,0.695112
4,0.0101,1.440552,0.811182,0.738213,0.698844,0.703301
5,0.0079,1.632257,0.810266,0.767856,0.725852,0.729213
6,0.0067,1.618024,0.815765,0.748506,0.698949,0.709162
7,0.0036,1.637374,0.814849,0.753279,0.701522,0.709994
8,0.0037,1.744824,0.809349,0.75433,0.697957,0.713319
9,0.0028,1.869116,0.80385,0.742182,0.707441,0.707189
10,0.0027,1.860459,0.809349,0.747026,0.698124,0.705976


[I 2025-03-16 06:58:21,803] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.004594948931260098, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.529,1.269244,0.799267,0.78022,0.71017,0.72103
2,0.0275,1.267119,0.810266,0.79057,0.758059,0.758794
3,0.0148,1.518273,0.788268,0.756587,0.713748,0.720665
4,0.0154,1.70503,0.789184,0.788972,0.710037,0.728998
5,0.0136,1.950899,0.79835,0.755809,0.72875,0.7264
6,0.0121,2.355093,0.787351,0.788288,0.709163,0.723852
7,0.0111,2.414741,0.785518,0.743078,0.719283,0.717198
8,0.0123,2.543576,0.789184,0.756817,0.732026,0.729767
9,0.0084,2.826613,0.79835,0.772809,0.751443,0.746593
10,0.0081,2.766796,0.791017,0.754592,0.72985,0.72916


[I 2025-03-16 07:01:28,246] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.001458883563904425, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8056,1.108238,0.7956,0.754283,0.656249,0.683844
2,0.0374,1.353551,0.79835,0.720828,0.688972,0.688395
3,0.0136,1.36069,0.813016,0.766872,0.735177,0.731176
4,0.0114,1.430754,0.816682,0.768021,0.722541,0.730601
5,0.0066,1.456165,0.816682,0.792187,0.73362,0.747435
6,0.0037,1.62575,0.814849,0.770375,0.730163,0.738413
7,0.0046,1.616047,0.802016,0.765309,0.703176,0.718857
8,0.0058,1.783866,0.814849,0.735261,0.705786,0.704976
9,0.0028,1.828612,0.813932,0.788181,0.758249,0.757457
10,0.002,1.846679,0.815765,0.771615,0.743084,0.747375


[I 2025-03-16 07:11:07,388] Trial 136 finished with value: 0.747629131141275 and parameters: {'learning_rate': 0.001458883563904425, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 42}. Best is trial 121 with value: 0.7876336386869527.


Trial 137 with params: {'learning_rate': 0.002768786181433085, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5832,1.217516,0.791934,0.758923,0.719316,0.723411
2,0.0281,1.236114,0.810266,0.811659,0.736933,0.754105
3,0.0126,1.304935,0.815765,0.770903,0.759933,0.750366
4,0.0099,1.44627,0.815765,0.767183,0.735997,0.737926
5,0.0089,1.506428,0.80385,0.761708,0.75629,0.741432
6,0.0064,1.857607,0.806599,0.770457,0.749484,0.745574
7,0.0062,1.750096,0.793767,0.758057,0.724066,0.724758
8,0.0058,1.85341,0.796517,0.768425,0.723065,0.72976
9,0.004,2.125721,0.7956,0.767801,0.702063,0.71171
10,0.0028,1.948058,0.809349,0.743086,0.739397,0.728269


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-16 07:21:03,056] Trial 137 finished with value: 0.7404442376249246 and parameters: {'learning_rate': 0.002768786181433085, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 14}. Best is trial 121 with value: 0.7876336386869527.


Trial 138 with params: {'learning_rate': 0.0042333794526996215, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6017,1.142343,0.804766,0.76051,0.742205,0.738791
2,0.0264,1.270562,0.822181,0.799067,0.761555,0.769051
3,0.0155,1.772752,0.785518,0.762436,0.727219,0.72762
4,0.0135,1.758195,0.79835,0.762973,0.736626,0.731201
5,0.0111,1.985019,0.797434,0.74776,0.729363,0.726489
6,0.0118,1.980266,0.8011,0.726695,0.761504,0.73152
7,0.0143,2.025752,0.80385,0.756958,0.740209,0.734863
8,0.0055,2.31575,0.805683,0.765666,0.738836,0.739314
9,0.0062,2.389343,0.802016,0.768413,0.758022,0.748908
10,0.0053,2.7721,0.79835,0.779064,0.74216,0.743753


[I 2025-03-16 07:30:11,560] Trial 138 finished with value: 0.7661304142091605 and parameters: {'learning_rate': 0.0042333794526996215, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 53}. Best is trial 121 with value: 0.7876336386869527.


Trial 139 with params: {'learning_rate': 1.1619982946199605e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3984,3.053211,0.245646,0.022794,0.039185,0.025778
2,2.7965,2.701546,0.336389,0.037331,0.066305,0.041439
3,2.5188,2.499326,0.384051,0.038206,0.080999,0.051568
4,2.3267,2.34887,0.405133,0.05865,0.087516,0.058457
5,2.1747,2.232334,0.43538,0.089088,0.099632,0.07429
6,2.059,2.135046,0.461962,0.102125,0.110487,0.08486
7,1.9634,2.057921,0.479377,0.103494,0.12066,0.097612
8,1.8809,1.995221,0.494959,0.134461,0.131179,0.109411
9,1.8108,1.940626,0.499542,0.133411,0.133187,0.110597
10,1.7464,1.885244,0.516957,0.134038,0.150122,0.131751


[I 2025-03-16 07:36:36,386] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.002522236258600532, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7076,1.206527,0.80385,0.733597,0.671387,0.680363
2,0.0283,1.299713,0.8011,0.735,0.694809,0.692858
3,0.0099,1.391922,0.814849,0.775353,0.707272,0.724082
4,0.0089,1.501817,0.805683,0.775017,0.744604,0.740026
5,0.007,1.65722,0.816682,0.779358,0.742336,0.740406
6,0.0056,1.902121,0.804766,0.775589,0.744889,0.74395
7,0.0091,2.004345,0.808433,0.773866,0.737018,0.736193
8,0.0036,2.063001,0.808433,0.77624,0.754118,0.752292
9,0.0028,2.061379,0.799267,0.761708,0.728482,0.728769
10,0.0039,2.110804,0.813932,0.776837,0.75561,0.750193


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-16 07:42:25,637] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 0.003862644602167287, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 46}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6029,1.203359,0.8011,0.734407,0.686678,0.69347
2,0.0284,1.256363,0.813016,0.738206,0.736345,0.725892
3,0.0118,1.54232,0.818515,0.796763,0.738159,0.753079
4,0.015,1.60169,0.813016,0.779997,0.754841,0.749394
5,0.0107,1.662765,0.813016,0.784928,0.743389,0.752845
6,0.0089,1.771966,0.813932,0.792073,0.760572,0.759832
7,0.0109,2.088948,0.786434,0.748234,0.759392,0.73648
8,0.0051,2.088167,0.814849,0.801097,0.772697,0.765276
9,0.0042,2.189258,0.805683,0.798299,0.764344,0.761371
10,0.0063,2.18263,0.808433,0.787309,0.732497,0.741251


[I 2025-03-16 07:51:31,072] Trial 141 finished with value: 0.7521415360985937 and parameters: {'learning_rate': 0.003862644602167287, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 46}. Best is trial 121 with value: 0.7876336386869527.


Trial 142 with params: {'learning_rate': 4.847550956494395e-05, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7289,2.248647,0.439963,0.078133,0.101043,0.075277
2,1.8954,1.84489,0.532539,0.147933,0.15363,0.131676
3,1.5319,1.615745,0.5967,0.180059,0.205847,0.186638
4,1.258,1.45147,0.626948,0.285873,0.237576,0.228365
5,1.0413,1.34141,0.648946,0.310011,0.27988,0.274617
6,0.8578,1.261716,0.673694,0.383416,0.335963,0.341797
7,0.7099,1.214453,0.689276,0.385341,0.366812,0.369426
8,0.5904,1.19837,0.706691,0.426217,0.411241,0.412064
9,0.4944,1.174404,0.710357,0.4405,0.430071,0.429611
10,0.4175,1.171273,0.72044,0.487924,0.447214,0.460176


[I 2025-03-16 07:54:36,689] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 0.0024213900431778548, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 52}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6984,1.112668,0.80385,0.749193,0.684485,0.698896
2,0.0285,1.275022,0.815765,0.776352,0.743906,0.739649
3,0.0114,1.35629,0.813016,0.764577,0.729589,0.731529
4,0.0091,1.530753,0.810266,0.776949,0.742528,0.744626
5,0.0075,1.553346,0.815765,0.791348,0.756988,0.757384
6,0.0043,1.907532,0.802016,0.797994,0.720075,0.741471
7,0.0056,1.724211,0.809349,0.79441,0.759141,0.761216
8,0.0069,1.989197,0.789184,0.720154,0.728598,0.717405
9,0.0038,1.790391,0.816682,0.746154,0.747272,0.733054
10,0.0022,1.9141,0.819432,0.782857,0.738849,0.74172


[I 2025-03-16 08:00:41,754] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 0.00436388357508892, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 48}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5651,1.265509,0.800183,0.768905,0.747359,0.747272
2,0.0266,1.391776,0.804766,0.763282,0.747734,0.73564
3,0.016,1.677802,0.809349,0.803015,0.740381,0.756938
4,0.0126,1.72901,0.797434,0.760568,0.72326,0.724404
5,0.0142,1.860455,0.788268,0.743514,0.728377,0.717394
6,0.0125,2.162863,0.792851,0.760158,0.725753,0.726088
7,0.0121,2.628015,0.785518,0.741922,0.741912,0.726983
8,0.0088,2.54909,0.791934,0.762225,0.706307,0.715056
9,0.008,2.587889,0.781852,0.749807,0.724803,0.715804
10,0.0056,2.516358,0.791017,0.766603,0.733602,0.735456


[I 2025-03-16 08:06:26,162] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 0.0016667533629401603, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7534,1.090509,0.806599,0.724634,0.674883,0.681712
2,0.0334,1.246,0.810266,0.746372,0.698506,0.701894
3,0.013,1.381333,0.802933,0.725282,0.704181,0.696532
4,0.0093,1.521126,0.802933,0.75142,0.711954,0.704656
5,0.0066,1.538686,0.807516,0.78039,0.707054,0.725014
6,0.0054,1.671552,0.797434,0.771271,0.703823,0.718725
7,0.0062,1.565165,0.821265,0.797848,0.746745,0.757591
8,0.0036,1.656576,0.809349,0.789741,0.735657,0.74296
9,0.0023,1.657771,0.811182,0.76892,0.733675,0.729307
10,0.0024,1.750129,0.810266,0.758748,0.751257,0.738541


[I 2025-03-16 08:12:35,800] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.0025042207758972437, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7383,1.217662,0.800183,0.744344,0.697158,0.701863
2,0.0303,1.307056,0.80385,0.782735,0.7122,0.722679
3,0.0115,1.484217,0.806599,0.758878,0.762497,0.746491
4,0.0077,1.688281,0.8011,0.774939,0.737474,0.73713
5,0.0074,1.728261,0.814849,0.793602,0.750093,0.758926
6,0.0082,1.736584,0.811182,0.784669,0.74884,0.753812
7,0.0068,1.895618,0.818515,0.736232,0.725931,0.71842
8,0.0036,2.012645,0.819432,0.791922,0.752918,0.75228
9,0.0035,2.009809,0.813016,0.775087,0.752022,0.74566
10,0.0021,2.120783,0.813932,0.789095,0.749915,0.748962


[I 2025-03-16 08:18:29,713] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 7.433382475760949e-05, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 49}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.563,1.991441,0.501375,0.098113,0.130279,0.10459
2,1.5783,1.560217,0.595784,0.212325,0.207843,0.19239
3,1.1379,1.343448,0.651696,0.325514,0.28061,0.281084
4,0.8296,1.212027,0.696609,0.398213,0.355376,0.362695
5,0.608,1.144491,0.71769,0.426862,0.414826,0.412596


[I 2025-03-16 08:20:04,294] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.0026416561240707064, 'weight_decay': 0.003, 'adam_beta1': 0.97, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7358,1.311783,0.8011,0.763316,0.69851,0.711612
2,0.0318,1.386834,0.791934,0.707166,0.712729,0.693158
3,0.0124,1.54061,0.804766,0.684653,0.716482,0.688974
4,0.0077,1.768018,0.796517,0.707599,0.712018,0.690358
5,0.0089,1.791564,0.7956,0.69744,0.728842,0.700632
6,0.0066,1.864163,0.792851,0.744543,0.714526,0.716044
7,0.0054,2.026941,0.79835,0.75511,0.731943,0.728429
8,0.006,2.066484,0.802933,0.760251,0.733201,0.731364
9,0.0048,2.083822,0.791934,0.755811,0.729415,0.727348
10,0.0029,2.246291,0.804766,0.750409,0.751272,0.737517


[I 2025-03-16 08:29:14,725] Trial 148 finished with value: 0.7488210755640284 and parameters: {'learning_rate': 0.0026416561240707064, 'weight_decay': 0.003, 'adam_beta1': 0.97, 'warmup_steps': 19}. Best is trial 121 with value: 0.7876336386869527.


Trial 149 with params: {'learning_rate': 0.004436340314329022, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5456,1.1892,0.807516,0.792285,0.70957,0.733787
2,0.0263,1.429172,0.819432,0.793994,0.74409,0.753338
3,0.0153,1.427815,0.813016,0.787785,0.756419,0.758791
4,0.0153,1.845165,0.8011,0.788107,0.738296,0.741752
5,0.0205,1.870116,0.7956,0.722478,0.734648,0.71195
6,0.0095,2.11703,0.80385,0.719812,0.737215,0.709728
7,0.0118,2.219728,0.802016,0.764559,0.745803,0.734383
8,0.0104,2.157964,0.802933,0.780042,0.746955,0.746188
9,0.0067,2.29845,0.805683,0.753484,0.741116,0.733583
10,0.0053,2.438079,0.802016,0.786067,0.746044,0.752318


[I 2025-03-16 08:38:23,870] Trial 149 finished with value: 0.7660287672962912 and parameters: {'learning_rate': 0.004436340314329022, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 32}. Best is trial 121 with value: 0.7876336386869527.


In [64]:
print(best_trial3)

BestRun(run_id='121', objective=0.7876336386869527, hyperparameters={'learning_rate': 0.004220661238506302, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 51}, run_summary=None)


In [65]:
base.reset_seed()

In [66]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill-embedd_fine_aug_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-distill-embedd_fine_aug_hp-search", remove_unused_columns=False, epochs=num_epochs, batch_size=batch_size)

In [67]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "adam_beta1" : trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
        "lambda_param": trial.suggest_float("lambda_param",0,1,step=.1),
        "temperature": trial.suggest_float("temperature", 2,7, step=.5)
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [68]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [69]:
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [70]:
best_trial4 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Distill-aug-embedd",
    n_trials=150
)

[I 2025-03-16 08:38:24,130] A new study created in memory with name: Distill-aug-embedd


Trial 0 with params: {'learning_rate': 0.0001025350969016849, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 32, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6149,1.233279,0.532539,0.131392,0.149685,0.126736
2,0.9459,0.939816,0.651696,0.27443,0.248557,0.232239
3,0.6593,0.802266,0.706691,0.351731,0.321257,0.313599
4,0.491,0.740946,0.732356,0.403767,0.367625,0.367174
5,0.3754,0.692331,0.757104,0.506122,0.433462,0.448497
6,0.2965,0.656675,0.769936,0.538537,0.476443,0.488199
7,0.2419,0.649463,0.776352,0.531836,0.499089,0.505106
8,0.2027,0.63534,0.776352,0.544576,0.512211,0.515118
9,0.1754,0.624493,0.784601,0.63073,0.558692,0.578574
10,0.1534,0.615558,0.792851,0.646697,0.593026,0.609056


[I 2025-03-16 08:41:37,248] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 1.4347159517201392e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 38, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.165,1.928477,0.304308,0.037615,0.055955,0.027463
2,1.7366,1.660202,0.368469,0.037112,0.076052,0.04835
3,1.5428,1.534754,0.410632,0.065453,0.088739,0.058827
4,1.4284,1.447345,0.445463,0.072088,0.10147,0.074958
5,1.341,1.383497,0.463795,0.104474,0.110863,0.085686
6,1.277,1.332784,0.494042,0.134514,0.127017,0.1049
7,1.2235,1.29182,0.508708,0.133293,0.134973,0.113114
8,1.1758,1.256514,0.52429,0.157973,0.146942,0.126675
9,1.1346,1.223239,0.537122,0.150027,0.155259,0.137027
10,1.0945,1.195001,0.558203,0.153833,0.172861,0.152574


[I 2025-03-16 08:44:36,853] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 9, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5353,0.533739,0.810266,0.734864,0.662573,0.683761
2,0.0966,0.502403,0.825848,0.804862,0.736609,0.757948
3,0.0751,0.494817,0.829514,0.812077,0.745867,0.762308
4,0.0703,0.493894,0.829514,0.822562,0.752053,0.772734
5,0.0672,0.49236,0.83593,0.843369,0.755494,0.780412
6,0.0649,0.500437,0.839597,0.833746,0.757494,0.781003
7,0.0657,0.531217,0.819432,0.804904,0.72835,0.750646
8,0.0657,0.520091,0.826764,0.809494,0.74684,0.763039
9,0.0642,0.528209,0.823098,0.830182,0.765367,0.786059
10,0.0625,0.524633,0.828598,0.821151,0.766317,0.778206


[I 2025-03-16 08:50:52,163] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0001464895513280072, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 7, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4423,1.071823,0.609533,0.178211,0.200444,0.179808
2,0.7517,0.807663,0.702108,0.324705,0.316457,0.307294
3,0.4774,0.716356,0.746104,0.453101,0.392627,0.398576
4,0.3295,0.660544,0.76077,0.517336,0.447012,0.461583
5,0.2404,0.633195,0.769019,0.590725,0.509265,0.530747
6,0.186,0.607502,0.788268,0.631068,0.568913,0.583316
7,0.1509,0.594016,0.8011,0.651171,0.612505,0.618894
8,0.1282,0.585129,0.791017,0.671802,0.596605,0.616277
9,0.1136,0.588726,0.791017,0.702945,0.606327,0.637176
10,0.1032,0.585358,0.8011,0.722713,0.643744,0.664336


[I 2025-03-16 08:57:05,715] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 27, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3454,0.997914,0.628781,0.236629,0.219864,0.195124
2,0.6627,0.763758,0.71769,0.392901,0.342127,0.343654
3,0.4026,0.670902,0.759853,0.505582,0.429308,0.447624
4,0.2634,0.642379,0.776352,0.566159,0.497119,0.516244
5,0.1901,0.619241,0.787351,0.630818,0.577119,0.588188
6,0.1482,0.604896,0.8011,0.637303,0.606848,0.612337
7,0.1233,0.587438,0.789184,0.651399,0.591837,0.608682
8,0.1086,0.582474,0.79835,0.689598,0.60457,0.631027
9,0.0984,0.581391,0.79835,0.664424,0.620788,0.633335
10,0.0923,0.579871,0.8011,0.693444,0.635085,0.653384


[I 2025-03-16 09:03:06,215] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.00043625993625605574, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 51, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9743,0.680512,0.76352,0.457208,0.429487,0.426024
2,0.2483,0.566052,0.80385,0.619829,0.58541,0.591837
3,0.1288,0.559638,0.8011,0.720052,0.638134,0.663665
4,0.0972,0.533783,0.819432,0.791513,0.699602,0.726111
5,0.0846,0.537423,0.814849,0.797722,0.718026,0.7439
6,0.077,0.529565,0.826764,0.802414,0.748823,0.765495
7,0.0731,0.543942,0.822181,0.795877,0.733474,0.754229
8,0.0701,0.533415,0.821265,0.795357,0.732086,0.751394
9,0.0674,0.527523,0.820348,0.810716,0.740815,0.764235
10,0.0667,0.533511,0.814849,0.804633,0.723613,0.750062


[I 2025-03-16 09:09:13,325] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 6.639623079859457e-05, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 23, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7203,1.363818,0.483043,0.124058,0.118626,0.092748
2,1.1418,1.11814,0.601283,0.177796,0.198506,0.178674
3,0.9046,0.97645,0.641613,0.261299,0.231155,0.211771
4,0.7347,0.881589,0.68011,0.297808,0.282686,0.272645
5,0.6047,0.820811,0.700275,0.347399,0.311741,0.307343
6,0.5062,0.778843,0.719523,0.370231,0.349791,0.345529
7,0.4305,0.756938,0.735105,0.43864,0.392741,0.394976
8,0.3708,0.731917,0.743355,0.484973,0.417764,0.430362
9,0.3253,0.71508,0.758937,0.503506,0.458374,0.465824
10,0.2869,0.697037,0.764436,0.527298,0.474741,0.48444


[I 2025-03-16 09:15:24,129] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 1.2382649697023537e-05, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 35, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1689,1.948252,0.302475,0.038522,0.055358,0.027872
2,1.7775,1.710128,0.343721,0.035842,0.067995,0.042431
3,1.5969,1.586199,0.391384,0.059893,0.083302,0.054178
4,1.4847,1.500428,0.421632,0.071667,0.092498,0.064492
5,1.4014,1.43719,0.448213,0.072654,0.102285,0.075637
6,1.338,1.388162,0.466544,0.089275,0.113271,0.088119
7,1.2864,1.346861,0.486709,0.133425,0.122858,0.100271
8,1.2419,1.314249,0.495875,0.133784,0.127827,0.105377
9,1.2041,1.286271,0.506874,0.147578,0.133308,0.111466
10,1.1683,1.256482,0.527956,0.155448,0.151786,0.132019


[I 2025-03-16 09:21:36,123] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 0.00029891977384598987, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4317,1.015298,0.613199,0.196327,0.210483,0.182844
2,0.6141,0.697318,0.751604,0.411675,0.403566,0.395498
3,0.2875,0.615858,0.790101,0.543679,0.509357,0.510939
4,0.1687,0.579906,0.8011,0.671485,0.598756,0.617973
5,0.1201,0.567579,0.805683,0.734241,0.637119,0.668285
6,0.1,0.560989,0.809349,0.780419,0.674101,0.709964
7,0.0884,0.548883,0.821265,0.817924,0.704698,0.742987
8,0.0806,0.550494,0.821265,0.808779,0.716018,0.746487
9,0.0772,0.55123,0.811182,0.809801,0.700246,0.735561
10,0.0751,0.540638,0.824931,0.812676,0.734358,0.756055


[I 2025-03-16 09:27:55,114] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.00041087915453240814, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 10, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9663,0.709133,0.757104,0.433483,0.41505,0.414723
2,0.2684,0.573182,0.806599,0.638863,0.586871,0.599523
3,0.1377,0.565543,0.802933,0.738756,0.629152,0.664619
4,0.1021,0.54013,0.815765,0.797455,0.714647,0.741098
5,0.0871,0.549425,0.813932,0.807419,0.713396,0.74234
6,0.0789,0.535277,0.820348,0.817304,0.732533,0.758573
7,0.0743,0.539373,0.819432,0.816449,0.727187,0.75545
8,0.0708,0.530581,0.818515,0.829524,0.721021,0.756648
9,0.0687,0.5264,0.824015,0.820908,0.732684,0.76137
10,0.0683,0.529551,0.824015,0.81767,0.731889,0.758987


[I 2025-03-16 09:37:23,462] Trial 9 finished with value: 0.7812423600367872 and parameters: {'learning_rate': 0.00041087915453240814, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 10, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 9 with value: 0.7812423600367872.


Trial 10 with params: {'learning_rate': 0.0006182305620915354, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 10, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8311,0.615504,0.790101,0.537731,0.49402,0.502451
2,0.1809,0.542568,0.818515,0.747484,0.663481,0.691522
3,0.1049,0.540011,0.813932,0.780146,0.699043,0.726482
4,0.0855,0.508491,0.824015,0.820735,0.725986,0.754982
5,0.0763,0.520239,0.818515,0.825967,0.723801,0.755727
6,0.0716,0.506523,0.828598,0.830896,0.737688,0.767092
7,0.0692,0.521166,0.828598,0.825122,0.74732,0.770118
8,0.0682,0.527018,0.823098,0.835923,0.746152,0.774344
9,0.0664,0.516663,0.829514,0.814638,0.741105,0.762361
10,0.0644,0.506061,0.83593,0.830728,0.754066,0.777256


[I 2025-03-16 09:46:38,295] Trial 10 finished with value: 0.7886907397694771 and parameters: {'learning_rate': 0.0006182305620915354, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 10, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 10 with value: 0.7886907397694771.


Trial 11 with params: {'learning_rate': 0.002025027061073857, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 6, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5144,0.522513,0.822181,0.751061,0.667649,0.69202
2,0.0934,0.505647,0.829514,0.802836,0.744332,0.760581
3,0.0752,0.496659,0.835014,0.819594,0.755835,0.775142
4,0.0707,0.49672,0.836847,0.826414,0.75434,0.776721
5,0.0684,0.493998,0.836847,0.818549,0.755405,0.775931


[I 2025-03-16 09:48:17,002] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.00026587460103771795, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.152,0.831991,0.701192,0.33479,0.313285,0.306114
2,0.4421,0.647673,0.778185,0.511615,0.472447,0.47622
3,0.2294,0.599127,0.8011,0.667276,0.578351,0.598195
4,0.1494,0.576348,0.804766,0.686642,0.617337,0.637955
5,0.115,0.568891,0.80385,0.710746,0.64288,0.664065


[I 2025-03-16 09:49:48,670] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 7.100205390479974e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 12, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.644,1.334662,0.494959,0.110725,0.125547,0.099645
2,1.1099,1.099419,0.604033,0.174026,0.20331,0.180367
3,0.8677,0.955085,0.646196,0.275097,0.241872,0.224834
4,0.6932,0.865612,0.688359,0.300336,0.287768,0.280658
5,0.567,0.806206,0.71494,0.357467,0.330965,0.325711


[I 2025-03-16 09:51:26,277] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.002177130318028656, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 32, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.526,0.521639,0.824015,0.789869,0.711598,0.731785
2,0.0913,0.517702,0.826764,0.807766,0.739279,0.759891
3,0.0743,0.502189,0.833181,0.83038,0.756161,0.781904
4,0.0701,0.493454,0.836847,0.82977,0.757349,0.779803
5,0.0685,0.485305,0.839597,0.836465,0.755251,0.784023
6,0.0653,0.480513,0.837764,0.842133,0.771388,0.793719
7,0.0648,0.486835,0.840513,0.820567,0.757899,0.778005
8,0.0645,0.49814,0.837764,0.846121,0.756069,0.782922
9,0.0645,0.511513,0.829514,0.841909,0.758496,0.785599
10,0.0645,0.497737,0.84143,0.841493,0.772592,0.795069


[I 2025-03-16 09:54:42,830] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.0034070661542039116, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 10, 'lambda_param': 0.2, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5076,0.525449,0.827681,0.756657,0.692287,0.706916
2,0.092,0.533519,0.823098,0.812157,0.740087,0.760324
3,0.0762,0.505181,0.84143,0.831968,0.777525,0.79381
4,0.0717,0.526939,0.827681,0.826928,0.758675,0.779495
5,0.0693,0.527946,0.825848,0.821368,0.762102,0.779785
6,0.0682,0.531795,0.826764,0.814118,0.746168,0.769191
7,0.0666,0.543684,0.828598,0.83432,0.767414,0.787816
8,0.0664,0.534675,0.833181,0.819403,0.75505,0.772759
9,0.0652,0.529165,0.83868,0.813982,0.766916,0.779282
10,0.0643,0.546132,0.830431,0.817034,0.760467,0.777188


[I 2025-03-16 10:04:08,999] Trial 15 finished with value: 0.8006623744509614 and parameters: {'learning_rate': 0.0034070661542039116, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 10, 'lambda_param': 0.2, 'temperature': 3.0}. Best is trial 15 with value: 0.8006623744509614.


Trial 16 with params: {'learning_rate': 0.002355540293680302, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.99, 'warmup_steps': 20, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7202,0.623886,0.782768,0.578178,0.564364,0.557082
2,0.1213,0.556237,0.809349,0.724998,0.696378,0.698211
3,0.081,0.539541,0.814849,0.78328,0.726365,0.743431
4,0.0729,0.532873,0.824015,0.788146,0.729286,0.74545
5,0.0693,0.538254,0.823098,0.791476,0.736863,0.75257
6,0.0674,0.531333,0.823098,0.788393,0.737728,0.749253
7,0.0668,0.530616,0.826764,0.810844,0.756932,0.771107
8,0.066,0.539741,0.818515,0.777641,0.735389,0.746223
9,0.0654,0.541354,0.820348,0.810925,0.745289,0.764822
10,0.0639,0.547344,0.819432,0.830386,0.760228,0.783225


[I 2025-03-16 10:13:34,209] Trial 16 finished with value: 0.800642720217273 and parameters: {'learning_rate': 0.002355540293680302, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.99, 'warmup_steps': 20, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 15 with value: 0.8006623744509614.


Trial 17 with params: {'learning_rate': 0.0035971731070310255, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.601,0.609195,0.800183,0.664161,0.621762,0.628276
2,0.1139,0.551674,0.824931,0.770288,0.706194,0.719632
3,0.0829,0.534606,0.828598,0.804869,0.728566,0.749498
4,0.076,0.545203,0.821265,0.786225,0.731468,0.738994
5,0.0722,0.539577,0.827681,0.816484,0.745066,0.764014
6,0.0701,0.540286,0.829514,0.757738,0.708169,0.716665
7,0.0691,0.547811,0.816682,0.746447,0.707146,0.710149
8,0.0683,0.531663,0.826764,0.776253,0.727796,0.734223
9,0.0675,0.551927,0.823098,0.755836,0.71204,0.719192
10,0.067,0.552822,0.820348,0.764554,0.73939,0.738395


[I 2025-03-16 10:22:35,361] Trial 17 finished with value: 0.8015099909118112 and parameters: {'learning_rate': 0.0035971731070310255, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 17 with value: 0.8015099909118112.


Trial 18 with params: {'learning_rate': 0.002284476796877085, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5297,0.520354,0.827681,0.754537,0.674841,0.697579
2,0.0949,0.517092,0.827681,0.804776,0.748758,0.764392
3,0.0759,0.512778,0.831347,0.847513,0.77513,0.795953
4,0.0709,0.517221,0.831347,0.82506,0.763617,0.779432
5,0.0676,0.505076,0.825848,0.832145,0.754662,0.779009
6,0.0665,0.51264,0.829514,0.827819,0.75685,0.777639
7,0.0674,0.527885,0.821265,0.817217,0.751245,0.770602
8,0.0655,0.541654,0.818515,0.811539,0.749858,0.76862
9,0.0646,0.527903,0.830431,0.83166,0.767256,0.786715
10,0.0637,0.521247,0.824931,0.804972,0.74854,0.761321


[I 2025-03-16 10:29:00,045] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.000269585406302118, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3882,1.015028,0.617782,0.222508,0.213773,0.188298
2,0.6346,0.707843,0.757104,0.40829,0.403217,0.394374
3,0.3126,0.629146,0.780935,0.518544,0.495483,0.489563
4,0.1874,0.588097,0.792851,0.629018,0.569095,0.584796
5,0.1319,0.563331,0.809349,0.740844,0.646385,0.677182
6,0.1089,0.572247,0.80385,0.751509,0.644995,0.67912
7,0.0945,0.552394,0.816682,0.773624,0.689628,0.719229
8,0.0852,0.563751,0.802933,0.786118,0.689393,0.720967
9,0.0808,0.565192,0.809349,0.798218,0.688717,0.723895
10,0.0771,0.555099,0.809349,0.80096,0.70568,0.736643


[I 2025-03-16 10:38:05,780] Trial 19 finished with value: 0.7535185985714903 and parameters: {'learning_rate': 0.000269585406302118, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 17 with value: 0.8015099909118112.


Trial 20 with params: {'learning_rate': 0.004509680229173063, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4181,0.512491,0.829514,0.821273,0.759245,0.773733
2,0.0891,0.512909,0.831347,0.813042,0.754995,0.771739
3,0.0756,0.515997,0.826764,0.815579,0.75037,0.76996
4,0.072,0.542469,0.827681,0.827176,0.765474,0.782582
5,0.0701,0.527774,0.833181,0.839825,0.780443,0.79846
6,0.0687,0.541803,0.827681,0.833009,0.770145,0.788666
7,0.0686,0.526652,0.830431,0.823014,0.771153,0.785112
8,0.0682,0.536759,0.825848,0.837557,0.766215,0.788097
9,0.0659,0.535367,0.828598,0.836125,0.771923,0.791745
10,0.0646,0.557473,0.824931,0.818362,0.746739,0.769985


[I 2025-03-16 10:47:26,485] Trial 20 finished with value: 0.8039178968776831 and parameters: {'learning_rate': 0.004509680229173063, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 20 with value: 0.8039178968776831.


Trial 21 with params: {'learning_rate': 0.002536694252708335, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 15, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5402,0.510512,0.830431,0.745178,0.691479,0.705751
2,0.092,0.506013,0.836847,0.821595,0.751351,0.769224
3,0.0741,0.492097,0.837764,0.817037,0.7578,0.776125
4,0.0698,0.497558,0.842346,0.856003,0.785278,0.805558
5,0.0683,0.50635,0.830431,0.790457,0.753235,0.762933
6,0.0671,0.501504,0.835014,0.839742,0.762595,0.785078
7,0.0659,0.489073,0.834097,0.841397,0.772124,0.790422
8,0.0644,0.504981,0.830431,0.837008,0.779008,0.796996
9,0.0641,0.500006,0.83868,0.860414,0.779537,0.804065
10,0.0627,0.506853,0.834097,0.830658,0.759636,0.781049


[I 2025-03-16 10:56:41,237] Trial 21 finished with value: 0.802726798026551 and parameters: {'learning_rate': 0.002536694252708335, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 15, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 20 with value: 0.8039178968776831.


Trial 22 with params: {'learning_rate': 0.003541103753699351, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 12, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4762,0.510046,0.822181,0.798166,0.720726,0.740326
2,0.0896,0.517067,0.821265,0.800993,0.730412,0.750025
3,0.0749,0.514848,0.833181,0.825137,0.757258,0.778276
4,0.0702,0.524002,0.822181,0.836364,0.754761,0.779792
5,0.069,0.518324,0.830431,0.841233,0.768869,0.788936
6,0.0678,0.534627,0.823098,0.837388,0.748115,0.774625
7,0.0669,0.516209,0.824931,0.834267,0.776653,0.794248
8,0.0653,0.527436,0.829514,0.832741,0.766221,0.786292
9,0.0641,0.514547,0.836847,0.853365,0.769518,0.797334
10,0.0637,0.546639,0.823098,0.815853,0.7598,0.776561


[I 2025-03-16 11:06:07,942] Trial 22 finished with value: 0.7867148000131174 and parameters: {'learning_rate': 0.003541103753699351, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 12, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 20 with value: 0.8039178968776831.


Trial 23 with params: {'learning_rate': 0.0016194732072799362, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5878,0.514861,0.822181,0.716462,0.646195,0.665119
2,0.1031,0.506568,0.833181,0.769174,0.738478,0.74353
3,0.0766,0.501525,0.836847,0.785058,0.740617,0.752436
4,0.071,0.505535,0.834097,0.798592,0.754384,0.762273
5,0.07,0.501618,0.832264,0.789405,0.742175,0.754086
6,0.0666,0.509393,0.827681,0.79308,0.72659,0.743661
7,0.0664,0.514853,0.824015,0.807491,0.744197,0.761707
8,0.0643,0.495834,0.833181,0.803505,0.754193,0.767287
9,0.0635,0.49591,0.830431,0.82828,0.756944,0.779118
10,0.063,0.505129,0.836847,0.820978,0.757188,0.778566


[I 2025-03-16 11:15:31,544] Trial 23 finished with value: 0.8073428278491649 and parameters: {'learning_rate': 0.0016194732072799362, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 23 with value: 0.8073428278491649.


Trial 24 with params: {'learning_rate': 0.004868646214727486, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4103,0.516853,0.826764,0.796986,0.733276,0.752122
2,0.0903,0.523344,0.829514,0.815884,0.733615,0.759566
3,0.0777,0.510682,0.832264,0.818761,0.751692,0.77148
4,0.0746,0.525494,0.831347,0.81813,0.757962,0.77464
5,0.0718,0.542683,0.828598,0.841119,0.762987,0.787787
6,0.072,0.53307,0.819432,0.787467,0.731908,0.747076
7,0.0688,0.541774,0.823098,0.82076,0.757218,0.775875
8,0.0681,0.545003,0.824931,0.818207,0.737033,0.760628
9,0.0676,0.566077,0.820348,0.813716,0.755222,0.773345
10,0.0673,0.583443,0.809349,0.832366,0.758149,0.781925


[I 2025-03-16 11:25:01,834] Trial 24 finished with value: 0.7880668308319586 and parameters: {'learning_rate': 0.004868646214727486, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 2.5}. Best is trial 23 with value: 0.8073428278491649.


Trial 25 with params: {'learning_rate': 0.0012538556940378794, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 23, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6651,0.557896,0.807516,0.640383,0.607869,0.611753
2,0.1147,0.512904,0.821265,0.785788,0.737369,0.747541
3,0.0809,0.50544,0.825848,0.792769,0.739215,0.750534
4,0.0731,0.50463,0.828598,0.816012,0.745389,0.764837
5,0.0685,0.50511,0.834097,0.826383,0.751291,0.776338
6,0.0666,0.494786,0.837764,0.842822,0.773215,0.792677
7,0.0656,0.511477,0.830431,0.8223,0.760116,0.777052
8,0.0664,0.512036,0.829514,0.819661,0.752361,0.771807
9,0.0646,0.50929,0.836847,0.840593,0.77151,0.791717
10,0.0623,0.498796,0.836847,0.827045,0.759613,0.778385


[I 2025-03-16 11:34:28,482] Trial 25 finished with value: 0.7857505240441873 and parameters: {'learning_rate': 0.0012538556940378794, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 23, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 23 with value: 0.8073428278491649.


Trial 26 with params: {'learning_rate': 0.0018055828513499916, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5535,0.526228,0.821265,0.746413,0.659765,0.685351
2,0.0989,0.517168,0.835014,0.813715,0.7487,0.768042
3,0.0763,0.49163,0.84143,0.834484,0.761357,0.783129
4,0.0707,0.517954,0.834097,0.827884,0.751828,0.775377
5,0.0687,0.506031,0.83868,0.836636,0.753942,0.781271
6,0.0667,0.504693,0.83868,0.814521,0.758584,0.772816
7,0.0657,0.507577,0.834097,0.816669,0.749764,0.769118
8,0.0649,0.512343,0.828598,0.807832,0.739644,0.758528
9,0.0646,0.522265,0.832264,0.810149,0.762031,0.774526
10,0.0629,0.509581,0.839597,0.832361,0.76499,0.784286


[I 2025-03-16 11:40:38,539] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.0015093897556069298, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 27, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6734,0.538969,0.810266,0.645485,0.616937,0.620061
2,0.1106,0.51266,0.819432,0.773556,0.692837,0.71556
3,0.0792,0.50325,0.832264,0.794525,0.722892,0.745447
4,0.0716,0.501913,0.834097,0.79849,0.737097,0.756291
5,0.0686,0.511353,0.824931,0.820382,0.732601,0.760866


[I 2025-03-16 11:42:14,316] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.00481223024264649, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.411,0.545547,0.824931,0.829693,0.729266,0.757541
2,0.0897,0.528875,0.834097,0.813141,0.751004,0.769092
3,0.0774,0.519705,0.83868,0.852881,0.76457,0.793539
4,0.0738,0.53175,0.834097,0.837983,0.761702,0.785224
5,0.0723,0.532046,0.833181,0.830876,0.767127,0.787165
6,0.0705,0.53528,0.835014,0.850249,0.778891,0.803368
7,0.0708,0.554195,0.826764,0.823754,0.747381,0.774298
8,0.0702,0.541851,0.829514,0.845515,0.76208,0.789248
9,0.0676,0.531642,0.834097,0.855134,0.777044,0.80265
10,0.0665,0.552884,0.828598,0.844112,0.769022,0.792431


[I 2025-03-16 11:52:02,216] Trial 28 finished with value: 0.808074449849158 and parameters: {'learning_rate': 0.00481223024264649, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 28 with value: 0.808074449849158.


Trial 29 with params: {'learning_rate': 0.0019393378883667213, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5319,0.519375,0.817599,0.743065,0.667745,0.687984
2,0.0962,0.515863,0.831347,0.779882,0.725806,0.739131
3,0.0758,0.504755,0.826764,0.776169,0.726945,0.73869
4,0.0707,0.519685,0.822181,0.828078,0.737467,0.764728
5,0.0688,0.509949,0.834097,0.823445,0.752626,0.774642
6,0.067,0.524636,0.831347,0.842312,0.758705,0.786753
7,0.0662,0.510062,0.83593,0.827944,0.769998,0.789445
8,0.0643,0.50097,0.832264,0.826964,0.755706,0.777276
9,0.0634,0.522361,0.83593,0.853728,0.760739,0.791427
10,0.0639,0.518172,0.826764,0.82733,0.7462,0.772735


[I 2025-03-16 11:55:15,236] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.0004155315526374879, 'weight_decay': 0.007, 'adam_beta1': 0.99, 'warmup_steps': 0, 'lambda_param': 0.8, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2254,0.875004,0.670944,0.274177,0.272236,0.255983
2,0.4527,0.647807,0.773602,0.504223,0.48611,0.478501
3,0.196,0.575165,0.793767,0.679305,0.606707,0.626231
4,0.1201,0.545323,0.813932,0.729172,0.653089,0.677149
5,0.0943,0.545925,0.807516,0.78519,0.69506,0.723422
6,0.083,0.537605,0.816682,0.794168,0.720096,0.744029
7,0.0764,0.546472,0.816682,0.803063,0.712364,0.742849
8,0.072,0.537856,0.817599,0.801332,0.720303,0.746911
9,0.07,0.542224,0.815765,0.807053,0.715844,0.744775
10,0.0705,0.528231,0.815765,0.812112,0.724058,0.753117


[I 2025-03-16 12:01:36,050] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.0004884097879022973, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9814,0.683718,0.75802,0.434431,0.402394,0.40377
2,0.2491,0.569644,0.80385,0.64893,0.596244,0.611243
3,0.1285,0.578144,0.79835,0.694532,0.642767,0.652043
4,0.097,0.536391,0.813932,0.815393,0.705561,0.741769
5,0.0829,0.539148,0.813016,0.795622,0.707154,0.738505
6,0.0761,0.532665,0.817599,0.822754,0.713438,0.74905
7,0.073,0.533908,0.811182,0.810534,0.715731,0.747087
8,0.0706,0.538271,0.819432,0.815294,0.725433,0.75688
9,0.0685,0.527736,0.821265,0.815968,0.722532,0.753922
10,0.0664,0.526375,0.825848,0.834098,0.736606,0.768861


[I 2025-03-16 12:07:53,717] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.0037808031128860643, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 15, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4678,0.503756,0.835014,0.820219,0.721607,0.753779
2,0.089,0.520715,0.825848,0.81559,0.755376,0.771083
3,0.0752,0.515761,0.825848,0.821039,0.745111,0.767169
4,0.0709,0.520012,0.828598,0.82657,0.754161,0.777429
5,0.0683,0.51923,0.831347,0.798566,0.757727,0.76865
6,0.0674,0.55339,0.820348,0.815023,0.737286,0.759932
7,0.0677,0.555571,0.813932,0.786965,0.732894,0.747148
8,0.067,0.543653,0.824931,0.80514,0.744964,0.761592
9,0.0656,0.563221,0.819432,0.806193,0.743907,0.760458
10,0.064,0.556901,0.819432,0.822341,0.738623,0.764562


[I 2025-03-16 12:13:59,375] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 0.003550295158989822, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 5, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4643,0.502397,0.837764,0.808723,0.749014,0.764418
2,0.089,0.513983,0.823098,0.810692,0.723973,0.74764
3,0.0751,0.500713,0.833181,0.823851,0.759417,0.778085
4,0.0715,0.50795,0.833181,0.829049,0.762627,0.781107
5,0.0703,0.507769,0.826764,0.811874,0.753989,0.770669
6,0.0684,0.526711,0.831347,0.83884,0.766277,0.788302
7,0.0672,0.529107,0.830431,0.840937,0.778712,0.797215
8,0.0667,0.538126,0.833181,0.836384,0.766335,0.788943
9,0.0648,0.535172,0.827681,0.818457,0.760145,0.776677
10,0.064,0.564305,0.824931,0.82619,0.760733,0.7804


[I 2025-03-16 12:23:21,223] Trial 33 finished with value: 0.7996690606457222 and parameters: {'learning_rate': 0.003550295158989822, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 5, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 28 with value: 0.808074449849158.


Trial 34 with params: {'learning_rate': 0.004752509413917654, 'weight_decay': 0.005, 'adam_beta1': 0.97, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4941,0.527869,0.820348,0.740176,0.689949,0.698136
2,0.0982,0.520261,0.826764,0.807167,0.749459,0.765202
3,0.0817,0.523452,0.832264,0.837217,0.753764,0.778292
4,0.0763,0.537603,0.821265,0.824919,0.752557,0.772012
5,0.0742,0.535146,0.824931,0.80321,0.751255,0.762236


[I 2025-03-16 12:24:51,370] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.001016608363718263, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 27, 'lambda_param': 0.4, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7227,0.559049,0.807516,0.630002,0.589616,0.595352
2,0.1281,0.525304,0.828598,0.778333,0.709896,0.730518
3,0.0848,0.50121,0.83593,0.827842,0.738272,0.766007
4,0.0756,0.511423,0.827681,0.811501,0.723411,0.750598
5,0.0712,0.503744,0.831347,0.787672,0.714369,0.73809
6,0.0674,0.500731,0.826764,0.803734,0.724233,0.746244
7,0.0648,0.50206,0.834097,0.829385,0.739242,0.769666
8,0.0646,0.52069,0.83593,0.827174,0.76415,0.780786
9,0.0649,0.511932,0.827681,0.827234,0.733227,0.762958
10,0.0643,0.502986,0.833181,0.817066,0.740703,0.760087


[I 2025-03-16 12:28:04,904] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.00013525897784933723, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 39, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4411,1.089182,0.598533,0.165221,0.197507,0.169514
2,0.7846,0.839931,0.701192,0.311708,0.309009,0.301987
3,0.5155,0.733901,0.736939,0.459693,0.374395,0.386983
4,0.3604,0.693712,0.757104,0.520647,0.445304,0.461536
5,0.2639,0.647521,0.774519,0.537058,0.490248,0.501647


[I 2025-03-16 12:29:42,516] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 2.197945691935017e-05, 'weight_decay': 0.007, 'adam_beta1': 0.97, 'warmup_steps': 36, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0845,1.787548,0.32264,0.044081,0.062016,0.036221
2,1.5803,1.509756,0.418882,0.070579,0.091707,0.06389
3,1.3782,1.386766,0.468378,0.089526,0.112953,0.08812
4,1.2635,1.299179,0.500458,0.129085,0.129516,0.104754
5,1.1709,1.232168,0.528873,0.1558,0.147779,0.127527
6,1.0998,1.180135,0.574702,0.15907,0.18033,0.159064
7,1.0336,1.131852,0.593951,0.166821,0.195506,0.174203
8,0.9745,1.090844,0.608616,0.171687,0.206958,0.183369
9,0.9221,1.052665,0.619615,0.191931,0.214678,0.190402
10,0.8716,1.018857,0.633364,0.209311,0.22533,0.205232


[I 2025-03-16 12:32:43,582] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.003662945698360812, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 15, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.625,0.60368,0.79835,0.692139,0.628972,0.640853
2,0.1113,0.562801,0.813016,0.757001,0.711214,0.718608
3,0.0807,0.549453,0.820348,0.790158,0.739596,0.751135
4,0.0747,0.563083,0.815765,0.799779,0.747074,0.75711
5,0.071,0.554002,0.828598,0.790979,0.756169,0.76026


[I 2025-03-16 12:34:17,976] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.00016626989109537525, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 45, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6143,1.21703,0.536205,0.118365,0.148457,0.121624
2,0.8801,0.840575,0.692942,0.335123,0.296209,0.285854
3,0.5226,0.705711,0.749771,0.410339,0.393033,0.388588
4,0.3355,0.638128,0.781852,0.507447,0.479886,0.479858
5,0.2284,0.608746,0.783685,0.533525,0.501712,0.500487
6,0.173,0.592859,0.802016,0.674666,0.590577,0.611608
7,0.1399,0.594234,0.793767,0.663814,0.593497,0.61283
8,0.1189,0.578597,0.805683,0.695978,0.641923,0.657386
9,0.1062,0.567121,0.802933,0.700358,0.63891,0.656978
10,0.0972,0.567595,0.80385,0.732824,0.655246,0.679528


[I 2025-03-16 12:40:52,049] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.00440191993114764, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4265,0.515363,0.823098,0.825246,0.74134,0.767945
2,0.0921,0.526611,0.823098,0.811799,0.746759,0.767985
3,0.0779,0.533672,0.825848,0.794991,0.75795,0.766663
4,0.0743,0.542272,0.824931,0.807686,0.746163,0.764466
5,0.0726,0.539562,0.828598,0.815778,0.764344,0.777609
6,0.0698,0.541542,0.820348,0.812405,0.749446,0.768264
7,0.0702,0.554848,0.821265,0.801371,0.747756,0.760154
8,0.069,0.545127,0.820348,0.804056,0.752895,0.765812
9,0.0672,0.585224,0.816682,0.826972,0.752134,0.776402
10,0.0661,0.579277,0.818515,0.81542,0.745895,0.769673


[I 2025-03-16 12:50:25,589] Trial 40 finished with value: 0.7927968442366082 and parameters: {'learning_rate': 0.00440191993114764, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 28 with value: 0.808074449849158.


Trial 41 with params: {'learning_rate': 0.001008208739925685, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.99, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.906,0.662395,0.756187,0.458101,0.427338,0.426752
2,0.198,0.541366,0.810266,0.679466,0.632687,0.641383
3,0.0971,0.510694,0.818515,0.76588,0.68436,0.706922
4,0.0787,0.510436,0.827681,0.810126,0.724778,0.750115
5,0.0725,0.525406,0.821265,0.797316,0.717104,0.740323
6,0.069,0.52726,0.818515,0.805169,0.707125,0.739289
7,0.0667,0.517385,0.824931,0.825693,0.738507,0.765614
8,0.0655,0.517204,0.828598,0.820719,0.746616,0.766746
9,0.0661,0.536872,0.820348,0.803045,0.722529,0.74572
10,0.0646,0.514937,0.823098,0.792764,0.724813,0.745797


[I 2025-03-16 12:53:28,406] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 4.712098624605705e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 25, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7882,1.470195,0.439963,0.072413,0.100075,0.073308
2,1.2765,1.252037,0.52154,0.158697,0.145438,0.12329
3,1.0804,1.125677,0.592117,0.169478,0.196683,0.173455
4,0.9306,1.022794,0.629698,0.216708,0.221178,0.199407
5,0.8023,0.947822,0.654445,0.272127,0.249339,0.233411
6,0.6984,0.893138,0.681943,0.316831,0.285809,0.27692
7,0.6147,0.8558,0.695692,0.321425,0.301391,0.29277
8,0.5471,0.82714,0.712191,0.372614,0.338283,0.335087
9,0.4929,0.805575,0.72319,0.375758,0.355178,0.350767
10,0.4447,0.780592,0.728689,0.414566,0.372693,0.378955


[I 2025-03-16 12:56:41,492] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.00015448517085097122, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 51, 'lambda_param': 0.1, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6359,1.232133,0.529789,0.118633,0.14566,0.119852
2,0.9039,0.86017,0.686526,0.309728,0.287151,0.268735
3,0.5537,0.723318,0.739688,0.412207,0.373677,0.374108
4,0.3656,0.658534,0.770852,0.493204,0.460225,0.462289
5,0.2532,0.619881,0.780935,0.529796,0.493109,0.498385


[I 2025-03-16 12:58:18,681] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.004711849559462062, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 5, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4394,0.5302,0.823098,0.778497,0.721698,0.731925
2,0.0907,0.524847,0.822181,0.803724,0.74802,0.762632
3,0.077,0.526548,0.818515,0.810853,0.746672,0.767035
4,0.0731,0.541641,0.819432,0.82259,0.757447,0.776117
5,0.0708,0.544814,0.826764,0.82589,0.752028,0.77742
6,0.0692,0.556248,0.818515,0.840168,0.746196,0.776657
7,0.0697,0.552521,0.823098,0.829856,0.759843,0.783409
8,0.0688,0.54743,0.824015,0.820506,0.754198,0.775406
9,0.0679,0.542758,0.824931,0.830251,0.762945,0.78415
10,0.0661,0.548501,0.830431,0.842768,0.783409,0.801776


[I 2025-03-16 13:07:56,279] Trial 44 finished with value: 0.7948176358761317 and parameters: {'learning_rate': 0.004711849559462062, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 5, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 28 with value: 0.808074449849158.


Trial 45 with params: {'learning_rate': 0.003277559398278786, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 7, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4412,0.508262,0.828598,0.842429,0.768775,0.79071
2,0.0879,0.498944,0.828598,0.821545,0.744703,0.765759
3,0.0748,0.489029,0.840513,0.837113,0.776489,0.794577
4,0.0715,0.52184,0.828598,0.836493,0.747254,0.773283
5,0.0688,0.493899,0.843263,0.83183,0.767924,0.787117
6,0.0675,0.512226,0.828598,0.810736,0.758705,0.771329
7,0.0668,0.54318,0.819432,0.816581,0.782194,0.784608
8,0.0668,0.528199,0.830431,0.842066,0.758485,0.778947
9,0.0655,0.542764,0.825848,0.836716,0.763206,0.784743
10,0.0637,0.533079,0.827681,0.846388,0.782581,0.802332


[I 2025-03-16 13:14:18,901] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 0.004564239095929444, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 14, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4133,0.514734,0.829514,0.805117,0.738515,0.755392
2,0.0868,0.504375,0.834097,0.847579,0.764219,0.789173
3,0.0748,0.511305,0.826764,0.850846,0.779802,0.799203
4,0.0716,0.53772,0.824931,0.83308,0.777887,0.792088
5,0.0696,0.519052,0.831347,0.848683,0.781765,0.80135
6,0.0675,0.538689,0.829514,0.817582,0.762476,0.779407
7,0.068,0.541184,0.824931,0.842139,0.781073,0.796977
8,0.0673,0.546852,0.826764,0.820724,0.7653,0.78036
9,0.0651,0.540581,0.832264,0.843577,0.784494,0.798542
10,0.0649,0.549287,0.822181,0.825673,0.758955,0.780062


[I 2025-03-16 13:17:15,990] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.0017169847652564352, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7744,0.615148,0.784601,0.549161,0.522267,0.525005
2,0.1426,0.546495,0.820348,0.745441,0.702032,0.712712
3,0.0842,0.516456,0.819432,0.807332,0.741199,0.758529
4,0.0742,0.522635,0.828598,0.817896,0.755896,0.773788
5,0.0693,0.519668,0.824931,0.819157,0.744375,0.768355
6,0.0672,0.526328,0.820348,0.811397,0.729484,0.756519
7,0.0664,0.524486,0.825848,0.815479,0.750328,0.770281
8,0.0651,0.520818,0.829514,0.816433,0.756344,0.776113
9,0.0654,0.526589,0.820348,0.823892,0.743179,0.769526
10,0.0639,0.532176,0.828598,0.817532,0.738204,0.764067


[I 2025-03-16 13:23:06,617] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.003033763700895482, 'weight_decay': 0.005, 'adam_beta1': 0.97, 'warmup_steps': 10, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5365,0.535535,0.813932,0.701216,0.654444,0.663842
2,0.0937,0.507287,0.825848,0.810748,0.72297,0.749368
3,0.075,0.508599,0.823098,0.826282,0.744402,0.77201
4,0.0707,0.514221,0.829514,0.82704,0.762131,0.781755
5,0.0686,0.522881,0.824931,0.818689,0.737858,0.764253
6,0.0669,0.529638,0.824931,0.816991,0.761781,0.778492
7,0.0667,0.536464,0.818515,0.810838,0.739427,0.763047
8,0.0663,0.532235,0.826764,0.83147,0.749018,0.77686
9,0.065,0.530726,0.828598,0.827499,0.743797,0.772095
10,0.064,0.523127,0.827681,0.82433,0.752218,0.775145


[I 2025-03-16 13:32:04,988] Trial 48 finished with value: 0.7885075354875746 and parameters: {'learning_rate': 0.003033763700895482, 'weight_decay': 0.005, 'adam_beta1': 0.97, 'warmup_steps': 10, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 28 with value: 0.808074449849158.


Trial 49 with params: {'learning_rate': 2.4721217192981437e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 28, 'lambda_param': 0.5, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9845,1.695695,0.35472,0.036688,0.072046,0.045631
2,1.5172,1.473695,0.43538,0.071567,0.098449,0.071675
3,1.3394,1.357434,0.472961,0.089216,0.115896,0.091168
4,1.2245,1.271122,0.515124,0.151772,0.139057,0.117776
5,1.1321,1.203593,0.555454,0.165859,0.169038,0.150355
6,1.0533,1.146079,0.585701,0.174366,0.187795,0.170182
7,0.9843,1.10055,0.6022,0.166192,0.203671,0.178254
8,0.9201,1.056341,0.618698,0.214293,0.213085,0.192646
9,0.8642,1.021808,0.627864,0.207685,0.224703,0.201476
10,0.8112,0.98694,0.637947,0.237316,0.231597,0.212381


[I 2025-03-16 13:35:07,934] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 2.356716916016172e-05, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9935,1.712865,0.346471,0.035347,0.069107,0.043073
2,1.532,1.484367,0.430797,0.071106,0.096238,0.069186
3,1.3542,1.36967,0.474794,0.090096,0.117149,0.092013
4,1.2413,1.286305,0.51604,0.15658,0.139783,0.11657
5,1.1507,1.220118,0.538955,0.167113,0.154659,0.135202
6,1.0739,1.162061,0.575619,0.168216,0.179305,0.159539
7,1.0062,1.116788,0.593034,0.166427,0.197859,0.173733
8,0.9441,1.073306,0.609533,0.190282,0.207203,0.185563
9,0.8889,1.036822,0.625115,0.206473,0.223726,0.199086
10,0.8361,1.001242,0.640697,0.257802,0.234134,0.21638


[I 2025-03-16 13:41:16,047] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.0022959697451017842, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 26, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5769,0.516959,0.818515,0.747135,0.669457,0.697143
2,0.0957,0.501545,0.824015,0.805432,0.725348,0.749509
3,0.0756,0.493216,0.831347,0.804768,0.729724,0.753379
4,0.0711,0.499477,0.824015,0.800789,0.730369,0.753554
5,0.0678,0.490249,0.836847,0.842711,0.759935,0.788039
6,0.0672,0.495,0.837764,0.82638,0.753111,0.775721
7,0.0644,0.509418,0.828598,0.801515,0.741297,0.759883
8,0.0632,0.486149,0.834097,0.821065,0.760562,0.778469
9,0.0641,0.515357,0.83593,0.848447,0.774344,0.797206
10,0.0642,0.513474,0.830431,0.850426,0.767191,0.79209


[I 2025-03-16 13:50:27,819] Trial 51 finished with value: 0.8121965030181337 and parameters: {'learning_rate': 0.0022959697451017842, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 26, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}. Best is trial 51 with value: 0.8121965030181337.


Trial 52 with params: {'learning_rate': 0.0027647136518752047, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 33, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5564,0.516038,0.824931,0.792326,0.708695,0.736477
2,0.0925,0.505772,0.831347,0.824535,0.755575,0.775337
3,0.0745,0.50052,0.827681,0.793377,0.747266,0.759988
4,0.0699,0.507959,0.837764,0.826884,0.771437,0.788032
5,0.0683,0.505653,0.833181,0.823492,0.759068,0.779512
6,0.0657,0.497831,0.83593,0.827716,0.744542,0.771904
7,0.0652,0.509308,0.839597,0.858421,0.784359,0.808319
8,0.0657,0.520602,0.831347,0.860823,0.778145,0.803463
9,0.0647,0.519669,0.84143,0.85566,0.787413,0.806285
10,0.0636,0.541416,0.831347,0.857451,0.778402,0.804997


[I 2025-03-16 13:56:40,416] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.0035561735187205107, 'weight_decay': 0.0, 'adam_beta1': 0.99, 'warmup_steps': 31, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6459,0.596275,0.802016,0.644956,0.616022,0.620392
2,0.1128,0.549475,0.817599,0.781165,0.737479,0.748531
3,0.0813,0.531938,0.823098,0.807517,0.727239,0.752475
4,0.0749,0.546944,0.821265,0.814647,0.740697,0.76492
5,0.0716,0.534981,0.824931,0.825103,0.73891,0.768239
6,0.0695,0.538557,0.818515,0.808595,0.738567,0.760963
7,0.0686,0.543559,0.815765,0.785336,0.743486,0.750479
8,0.0674,0.541414,0.827681,0.819915,0.760851,0.778759
9,0.0665,0.545636,0.823098,0.808249,0.747488,0.765182
10,0.0654,0.544576,0.824015,0.81228,0.744638,0.765165


[I 2025-03-16 13:59:47,979] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 0.002448383863538723, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 26, 'lambda_param': 0.2, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.539,0.527102,0.814849,0.746772,0.66541,0.689508
2,0.0916,0.52439,0.816682,0.808438,0.740446,0.760648
3,0.0745,0.508328,0.837764,0.809044,0.746489,0.765917
4,0.0699,0.505581,0.829514,0.80571,0.757276,0.768473
5,0.0671,0.498927,0.837764,0.809606,0.76032,0.774895
6,0.066,0.509218,0.832264,0.825712,0.76114,0.781045
7,0.066,0.502872,0.831347,0.838055,0.760622,0.78271
8,0.0652,0.497717,0.843263,0.826046,0.76292,0.782186
9,0.0644,0.495247,0.84418,0.838957,0.784158,0.798785
10,0.0644,0.515363,0.83593,0.859662,0.77706,0.804329


[I 2025-03-16 14:08:56,459] Trial 54 finished with value: 0.8053590169617414 and parameters: {'learning_rate': 0.002448383863538723, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 26, 'lambda_param': 0.2, 'temperature': 4.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 55 with params: {'learning_rate': 0.0003628063397120217, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 26, 'lambda_param': 0.1, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1266,0.755243,0.730522,0.363655,0.354435,0.341542
2,0.3309,0.581444,0.793767,0.544255,0.514272,0.520654
3,0.1616,0.581655,0.796517,0.728589,0.6321,0.663091
4,0.113,0.549596,0.813932,0.748598,0.640858,0.674627
5,0.0929,0.539313,0.812099,0.768001,0.698578,0.721268
6,0.0834,0.529704,0.826764,0.825375,0.734833,0.766278
7,0.0782,0.540203,0.819432,0.808393,0.7344,0.755699
8,0.0735,0.540498,0.819432,0.829566,0.726688,0.760743
9,0.0712,0.522085,0.824931,0.817961,0.735247,0.762663
10,0.069,0.526333,0.822181,0.798095,0.731442,0.749498


[I 2025-03-16 14:12:09,232] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 0.0020243329591293183, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5941,0.519458,0.822181,0.735326,0.663855,0.683837
2,0.0988,0.502763,0.831347,0.814946,0.743737,0.763745
3,0.0763,0.478428,0.83593,0.795014,0.734199,0.751256
4,0.0709,0.495144,0.834097,0.812151,0.75092,0.768571
5,0.0676,0.485833,0.84143,0.843938,0.763798,0.788058
6,0.0663,0.497534,0.833181,0.813282,0.7432,0.765978
7,0.0654,0.479333,0.83593,0.812295,0.737308,0.759342
8,0.065,0.482314,0.843263,0.840417,0.765376,0.786553
9,0.0638,0.506539,0.827681,0.82103,0.75105,0.769788
10,0.0628,0.508392,0.832264,0.821614,0.761488,0.776449


[I 2025-03-16 14:21:25,327] Trial 56 finished with value: 0.7840059484423061 and parameters: {'learning_rate': 0.0020243329591293183, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 57 with params: {'learning_rate': 0.0014326886208203128, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 25, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7231,0.554846,0.804766,0.610451,0.580415,0.579557
2,0.1189,0.520661,0.822181,0.746721,0.718883,0.721519
3,0.0814,0.497605,0.83593,0.802211,0.740318,0.761156
4,0.0734,0.491116,0.829514,0.807699,0.748814,0.765902
5,0.0691,0.508144,0.835014,0.822474,0.760627,0.777316
6,0.0673,0.507358,0.830431,0.809168,0.748346,0.765494
7,0.0658,0.514107,0.829514,0.7939,0.715495,0.741377
8,0.0648,0.505906,0.834097,0.809183,0.749476,0.765377
9,0.0653,0.513917,0.824931,0.780948,0.732694,0.741326
10,0.0646,0.493994,0.840513,0.817178,0.747721,0.769119


[I 2025-03-16 14:27:33,761] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.004445377653209586, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 22, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4303,0.508322,0.825848,0.786627,0.726071,0.743626
2,0.087,0.512103,0.829514,0.789338,0.732632,0.745777
3,0.0742,0.504864,0.833181,0.826893,0.757204,0.778272
4,0.0709,0.500211,0.833181,0.827989,0.755172,0.773311
5,0.0693,0.51687,0.824015,0.857952,0.758035,0.788634
6,0.0694,0.511199,0.833181,0.852756,0.776723,0.798875
7,0.068,0.540102,0.823098,0.830397,0.755138,0.7774
8,0.0672,0.532038,0.830431,0.835938,0.7593,0.785297
9,0.0648,0.534113,0.829514,0.82957,0.753885,0.77834
10,0.0651,0.522566,0.834097,0.846239,0.780058,0.799816


[I 2025-03-16 14:33:38,181] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.0033139797388126854, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 32, 'lambda_param': 0.2, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5188,0.515348,0.824015,0.808941,0.726186,0.750493
2,0.0892,0.518888,0.822181,0.801491,0.729324,0.750084
3,0.0738,0.506341,0.837764,0.830913,0.777113,0.790401
4,0.0704,0.506355,0.83593,0.834904,0.790162,0.800889
5,0.0687,0.503904,0.83868,0.854249,0.791554,0.812648
6,0.0663,0.531968,0.822181,0.820473,0.757832,0.778759
7,0.066,0.51281,0.832264,0.834304,0.774902,0.79081
8,0.0647,0.512172,0.830431,0.807282,0.763697,0.776856
9,0.064,0.517351,0.830431,0.834043,0.76864,0.790991
10,0.0629,0.508707,0.83593,0.853441,0.783082,0.806587


[I 2025-03-16 14:42:59,881] Trial 59 finished with value: 0.8063334664385181 and parameters: {'learning_rate': 0.0033139797388126854, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 32, 'lambda_param': 0.2, 'temperature': 3.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 60 with params: {'learning_rate': 0.000747607511460211, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 37, 'lambda_param': 0.2, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8518,0.597626,0.788268,0.550442,0.509954,0.517468
2,0.1638,0.533901,0.821265,0.765403,0.669253,0.701685
3,0.0954,0.541555,0.817599,0.804966,0.720884,0.747623
4,0.0801,0.526592,0.821265,0.817073,0.699047,0.733862
5,0.0735,0.528313,0.819432,0.793033,0.725201,0.748054


[I 2025-03-16 14:44:29,731] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.002743243867685171, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 42, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5481,0.512506,0.823098,0.760923,0.693697,0.71485
2,0.0916,0.482349,0.835014,0.823226,0.756768,0.775797
3,0.0738,0.472099,0.84143,0.817828,0.748538,0.767976
4,0.07,0.485846,0.835014,0.827509,0.770925,0.787907
5,0.0686,0.487916,0.837764,0.801208,0.751122,0.764848
6,0.0658,0.497783,0.834097,0.834476,0.762099,0.783781
7,0.0656,0.500839,0.829514,0.843151,0.784821,0.802894
8,0.0647,0.508044,0.832264,0.827503,0.756874,0.777836
9,0.0643,0.514703,0.828598,0.832529,0.7631,0.781723
10,0.0637,0.515024,0.834097,0.84311,0.766537,0.789926


[I 2025-03-16 14:53:39,541] Trial 61 finished with value: 0.8062484442983453 and parameters: {'learning_rate': 0.002743243867685171, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 42, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 62 with params: {'learning_rate': 0.0013102005238832101, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 31, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6395,0.541002,0.810266,0.671792,0.618619,0.630746
2,0.1087,0.506797,0.83593,0.803523,0.748717,0.763302
3,0.079,0.499504,0.835014,0.791144,0.736643,0.751319
4,0.0715,0.490896,0.840513,0.817526,0.744255,0.768795
5,0.0679,0.510944,0.834097,0.79931,0.736646,0.754477
6,0.0659,0.50436,0.836847,0.809717,0.744964,0.762871
7,0.0642,0.504563,0.836847,0.803304,0.741328,0.758581
8,0.0654,0.502378,0.83593,0.804711,0.742662,0.758429
9,0.0633,0.511308,0.839597,0.801543,0.743499,0.760421
10,0.0623,0.510276,0.834097,0.794339,0.732413,0.750102


[I 2025-03-16 14:59:41,783] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 0.0011273931201871512, 'weight_decay': 0.007, 'adam_beta1': 0.96, 'warmup_steps': 43, 'lambda_param': 0.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7709,0.573673,0.8011,0.612319,0.571176,0.57914
2,0.1289,0.517105,0.818515,0.767755,0.717683,0.731449
3,0.085,0.522972,0.819432,0.797194,0.717052,0.743612
4,0.0741,0.510957,0.828598,0.807499,0.747408,0.764915
5,0.0701,0.512715,0.825848,0.799022,0.714282,0.741903
6,0.0672,0.520489,0.823098,0.805475,0.705495,0.740856
7,0.0664,0.533079,0.816682,0.77983,0.720598,0.737673
8,0.0659,0.526786,0.818515,0.80261,0.722267,0.747943
9,0.0644,0.517005,0.824931,0.799693,0.73462,0.754498
10,0.0634,0.506441,0.826764,0.794549,0.729769,0.750691


[I 2025-03-16 15:02:36,594] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 0.003657028468147327, 'weight_decay': 0.006, 'adam_beta1': 0.99, 'warmup_steps': 30, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6424,0.609667,0.800183,0.670568,0.652382,0.652595
2,0.1111,0.5579,0.817599,0.752855,0.699824,0.714228
3,0.0815,0.544755,0.817599,0.783409,0.724498,0.741698
4,0.0742,0.554739,0.817599,0.795701,0.728821,0.748997
5,0.0711,0.553513,0.821265,0.803594,0.741158,0.7603
6,0.0695,0.562361,0.816682,0.807622,0.736444,0.757466
7,0.0686,0.56392,0.813016,0.794786,0.735098,0.752698
8,0.067,0.549509,0.822181,0.816192,0.752837,0.769773
9,0.0666,0.556641,0.815765,0.795306,0.741253,0.756099
10,0.0664,0.554071,0.824931,0.807155,0.746484,0.764121


[I 2025-03-16 15:11:43,073] Trial 64 finished with value: 0.7728431832056861 and parameters: {'learning_rate': 0.003657028468147327, 'weight_decay': 0.006, 'adam_beta1': 0.99, 'warmup_steps': 30, 'lambda_param': 0.1, 'temperature': 4.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 65 with params: {'learning_rate': 0.0016160549340730266, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 40, 'lambda_param': 0.1, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6043,0.531728,0.817599,0.737104,0.660605,0.684752
2,0.0989,0.503474,0.835014,0.820552,0.738805,0.763247
3,0.0764,0.482006,0.840513,0.820971,0.753044,0.773696
4,0.0697,0.489319,0.842346,0.826295,0.756894,0.777771
5,0.0675,0.484929,0.842346,0.843553,0.766082,0.787443
6,0.0668,0.503424,0.83593,0.819576,0.731584,0.756357
7,0.0655,0.509998,0.828598,0.815894,0.730475,0.757399
8,0.0647,0.521878,0.831347,0.826478,0.739811,0.764322
9,0.0631,0.49538,0.837764,0.817041,0.747435,0.76567
10,0.062,0.498575,0.835014,0.840986,0.759604,0.78308


[I 2025-03-16 15:21:11,559] Trial 65 finished with value: 0.8076668976717051 and parameters: {'learning_rate': 0.0016160549340730266, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 40, 'lambda_param': 0.1, 'temperature': 5.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 66 with params: {'learning_rate': 0.003374086696871621, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 51, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5083,0.520183,0.823098,0.783426,0.715415,0.735203
2,0.0886,0.484186,0.839597,0.823648,0.756409,0.774687
3,0.0741,0.493428,0.827681,0.826459,0.729675,0.758113
4,0.0704,0.510139,0.832264,0.833851,0.765876,0.782732
5,0.0682,0.489955,0.84418,0.835493,0.771078,0.789834
6,0.0664,0.507695,0.83593,0.859403,0.780467,0.804299
7,0.0658,0.521901,0.829514,0.815684,0.788171,0.790226
8,0.0649,0.554659,0.825848,0.851534,0.768372,0.7927
9,0.0646,0.526553,0.837764,0.823172,0.780102,0.782471
10,0.0637,0.531406,0.829514,0.830189,0.759933,0.778478


[I 2025-03-16 15:24:16,514] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.0007349791826737676, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 40, 'lambda_param': 0.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7888,0.589242,0.80385,0.56155,0.537712,0.541476
2,0.1544,0.535192,0.816682,0.744172,0.69333,0.708692
3,0.0943,0.529502,0.824015,0.790483,0.728226,0.746069
4,0.079,0.511189,0.827681,0.842529,0.727151,0.768121
5,0.0719,0.502983,0.832264,0.810005,0.727192,0.754621
6,0.069,0.507201,0.834097,0.809492,0.746012,0.763316
7,0.0666,0.511062,0.824931,0.831348,0.741162,0.769227
8,0.0652,0.506503,0.830431,0.829773,0.745858,0.771021
9,0.0651,0.502057,0.835014,0.816804,0.756301,0.7721
10,0.0627,0.502846,0.832264,0.82643,0.747053,0.771653


[I 2025-03-16 15:33:25,527] Trial 67 finished with value: 0.7866573451147606 and parameters: {'learning_rate': 0.0007349791826737676, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 40, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 68 with params: {'learning_rate': 0.0027772563393527188, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 46, 'lambda_param': 0.2, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5194,0.519414,0.824015,0.803981,0.723077,0.7461
2,0.0887,0.510061,0.830431,0.817803,0.74044,0.762955
3,0.0746,0.500176,0.824931,0.826784,0.737233,0.765659
4,0.0698,0.502238,0.831347,0.824103,0.749296,0.771351
5,0.068,0.503285,0.83868,0.829083,0.765776,0.783238
6,0.0662,0.516339,0.832264,0.84674,0.758011,0.785307
7,0.0668,0.503125,0.833181,0.833091,0.762083,0.784083
8,0.0658,0.541615,0.824931,0.84404,0.763043,0.787135
9,0.064,0.512266,0.83593,0.823716,0.771396,0.782874
10,0.0638,0.528407,0.828598,0.823153,0.78584,0.790127


[I 2025-03-16 15:36:32,187] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.0023362248527442997, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 43, 'lambda_param': 0.2, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6323,0.540451,0.815765,0.719286,0.663908,0.676805
2,0.0988,0.519163,0.831347,0.782152,0.741553,0.74972
3,0.0759,0.500093,0.830431,0.784191,0.74368,0.75359
4,0.0707,0.505174,0.828598,0.818433,0.75295,0.771971
5,0.069,0.495021,0.84143,0.811466,0.755789,0.771421
6,0.0666,0.50898,0.831347,0.825078,0.747667,0.773138
7,0.0654,0.500298,0.83593,0.83295,0.775261,0.791527
8,0.064,0.501695,0.836847,0.853768,0.771957,0.793553
9,0.0644,0.51578,0.833181,0.809974,0.750005,0.765257
10,0.0637,0.514934,0.833181,0.823506,0.759493,0.78041


[I 2025-03-16 15:45:45,191] Trial 69 finished with value: 0.798929863656468 and parameters: {'learning_rate': 0.0023362248527442997, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 43, 'lambda_param': 0.2, 'temperature': 4.0}. Best is trial 51 with value: 0.8121965030181337.


Trial 70 with params: {'learning_rate': 3.3020219267760435e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 51, 'lambda_param': 0.9, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9358,1.602986,0.388634,0.039836,0.082417,0.052942
2,1.4122,1.37304,0.477544,0.0866,0.117856,0.093103
3,1.2271,1.254022,0.511457,0.131685,0.137035,0.113026
4,1.1016,1.160876,0.575619,0.163356,0.179776,0.160083
5,0.9934,1.087527,0.605866,0.168309,0.203419,0.179604
6,0.8999,1.02456,0.629698,0.216717,0.221781,0.199834
7,0.8183,0.977984,0.647113,0.292818,0.242842,0.227267
8,0.748,0.936101,0.666361,0.301889,0.262635,0.253275
9,0.6876,0.906742,0.692026,0.300901,0.297898,0.285164
10,0.6347,0.875083,0.693859,0.302092,0.300712,0.292255


[I 2025-03-16 15:51:51,599] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.001304615257644689, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 16, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6628,0.544069,0.815765,0.621671,0.612207,0.609285
2,0.1138,0.519439,0.819432,0.761966,0.727769,0.734613
3,0.0815,0.506642,0.824931,0.789337,0.71056,0.73647
4,0.0723,0.50093,0.831347,0.80673,0.74072,0.762151
5,0.0689,0.491071,0.837764,0.85375,0.737778,0.775421
6,0.0676,0.50786,0.832264,0.811535,0.738238,0.759095
7,0.0673,0.510852,0.826764,0.798955,0.732445,0.753073
8,0.0659,0.520242,0.824931,0.824104,0.733113,0.761463
9,0.0634,0.502818,0.83868,0.819483,0.751074,0.770732
10,0.0625,0.495941,0.835014,0.840279,0.763379,0.786174


[I 2025-03-16 15:55:02,979] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.0016677605293858977, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 37, 'lambda_param': 0.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7032,0.551982,0.806599,0.621978,0.594201,0.596654
2,0.1113,0.502578,0.831347,0.763243,0.714104,0.724599
3,0.0778,0.489862,0.83593,0.809596,0.752858,0.768928
4,0.0718,0.512948,0.824931,0.806331,0.721504,0.747235
5,0.0684,0.51679,0.832264,0.818126,0.741789,0.763773
6,0.0665,0.524431,0.827681,0.825669,0.730028,0.759909
7,0.0661,0.515845,0.830431,0.835795,0.746207,0.774874
8,0.0646,0.517111,0.827681,0.804297,0.750843,0.762068
9,0.064,0.513703,0.83593,0.85269,0.763335,0.791566
10,0.0634,0.497951,0.836847,0.827603,0.748513,0.772468


[I 2025-03-16 16:01:03,083] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 0.00028181224349129727, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1395,0.81758,0.707608,0.353898,0.323269,0.317912
2,0.4176,0.636379,0.785518,0.526976,0.481795,0.489063
3,0.2148,0.592276,0.793767,0.645162,0.572669,0.595092
4,0.1408,0.5756,0.79835,0.704641,0.609147,0.635315
5,0.1107,0.564851,0.808433,0.687233,0.648532,0.65701


[I 2025-03-16 16:02:33,028] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.002987031075978696, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 34, 'lambda_param': 0.2, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5172,0.51504,0.829514,0.762795,0.703166,0.719527
2,0.0895,0.507501,0.833181,0.828146,0.743977,0.77022
3,0.0747,0.516779,0.827681,0.827571,0.740141,0.767631
4,0.0696,0.516336,0.824015,0.806232,0.743979,0.765832
5,0.0678,0.511991,0.833181,0.826344,0.7592,0.780826
6,0.0663,0.505774,0.836847,0.825072,0.758802,0.778598
7,0.0657,0.507081,0.83868,0.834536,0.775105,0.793639
8,0.0643,0.511745,0.835014,0.82465,0.786594,0.794682
9,0.0645,0.526812,0.819432,0.806213,0.741761,0.757645
10,0.0653,0.531088,0.828598,0.816231,0.752932,0.771889


[I 2025-03-16 16:08:40,738] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 0.004287647111582854, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 38, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4653,0.523254,0.821265,0.797014,0.720254,0.743202
2,0.0883,0.519871,0.821265,0.787168,0.732166,0.746968
3,0.0745,0.500338,0.833181,0.802079,0.737528,0.753041
4,0.0702,0.515025,0.830431,0.789903,0.745156,0.757855
5,0.068,0.520955,0.825848,0.792125,0.742899,0.75705
6,0.068,0.536852,0.826764,0.800004,0.743595,0.759647
7,0.0681,0.537738,0.818515,0.813494,0.748337,0.76936
8,0.0679,0.542123,0.822181,0.811526,0.74432,0.763106
9,0.0667,0.526155,0.831347,0.818111,0.764245,0.780067
10,0.0648,0.540756,0.824015,0.806783,0.742223,0.761172


[I 2025-03-16 16:11:46,523] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 0.0016472751465206785, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 11, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5637,0.526536,0.817599,0.732614,0.665565,0.685124
2,0.1003,0.508215,0.828598,0.806702,0.739691,0.75654
3,0.0768,0.499986,0.829514,0.798934,0.743485,0.757241
4,0.0717,0.505589,0.835014,0.815855,0.759015,0.773118
5,0.0685,0.510153,0.830431,0.818926,0.733576,0.761305


[I 2025-03-16 16:13:11,439] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.001977067719125014, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 30, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5938,0.520074,0.819432,0.715473,0.663392,0.679448
2,0.0981,0.509851,0.830431,0.806543,0.738524,0.755761
3,0.0764,0.495636,0.839597,0.826941,0.74818,0.771014
4,0.0709,0.488015,0.83868,0.824872,0.763069,0.780613
5,0.0686,0.494939,0.83868,0.832398,0.757258,0.778678
6,0.0673,0.506807,0.837764,0.816081,0.754569,0.770892
7,0.0662,0.505741,0.830431,0.820866,0.748055,0.768869
8,0.0645,0.506804,0.834097,0.811331,0.735543,0.755027
9,0.0638,0.508315,0.832264,0.805513,0.744617,0.761741
10,0.0645,0.496067,0.83593,0.823239,0.740267,0.766822


[I 2025-03-16 16:16:08,426] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 1.3245726232440102e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1683,1.954884,0.303391,0.018185,0.055628,0.027324
2,1.7612,1.68275,0.359303,0.036234,0.072898,0.046059
3,1.5678,1.557834,0.405133,0.066661,0.0873,0.05745
4,1.4536,1.471265,0.434464,0.073583,0.097428,0.071304
5,1.3676,1.40951,0.455545,0.089459,0.105734,0.0788
6,1.3047,1.356974,0.48121,0.108807,0.11976,0.095596
7,1.2522,1.316755,0.495875,0.131982,0.127227,0.103973
8,1.2062,1.28277,0.511457,0.158012,0.138019,0.116769
9,1.1668,1.24968,0.519707,0.153065,0.141978,0.121935
10,1.1289,1.222624,0.548121,0.152882,0.163162,0.144128


[I 2025-03-16 16:22:29,990] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 1.2801409085483677e-05, 'weight_decay': 0.0, 'adam_beta1': 0.97, 'warmup_steps': 26, 'lambda_param': 0.4, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1939,1.998799,0.252979,0.022845,0.041376,0.026661
2,1.7952,1.703489,0.348304,0.0356,0.069367,0.04347
3,1.586,1.571185,0.402383,0.041253,0.086155,0.055491
4,1.4679,1.483195,0.430797,0.0722,0.095829,0.069078
5,1.3818,1.421509,0.454629,0.07018,0.10493,0.077041
6,1.3191,1.369769,0.472044,0.102341,0.11413,0.08924
7,1.2668,1.329036,0.490376,0.132305,0.12475,0.101405
8,1.2219,1.296616,0.510541,0.13486,0.136406,0.11261
9,1.1843,1.263237,0.520623,0.155552,0.141663,0.121328
10,1.1478,1.237076,0.531622,0.150288,0.152319,0.134641


[I 2025-03-16 16:28:21,348] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 0.0026363146681537583, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 18, 'lambda_param': 0.4, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4919,0.509678,0.819432,0.793614,0.711654,0.736866
2,0.0884,0.51421,0.824015,0.790814,0.736661,0.749602
3,0.0728,0.506896,0.834097,0.828409,0.764651,0.782157
4,0.0698,0.511606,0.827681,0.8181,0.761559,0.77671
5,0.0682,0.524523,0.827681,0.83303,0.768489,0.786514
6,0.0661,0.506224,0.827681,0.806243,0.747074,0.764951
7,0.0653,0.501691,0.833181,0.834423,0.763637,0.785372
8,0.0637,0.533433,0.828598,0.841939,0.76878,0.788569
9,0.0636,0.522589,0.827681,0.855637,0.768061,0.794507
10,0.0634,0.514234,0.832264,0.847115,0.76619,0.791531


[I 2025-03-16 16:34:22,648] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.004734653995499678, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4273,0.529413,0.817599,0.783665,0.711406,0.73233
2,0.0922,0.526828,0.823098,0.80402,0.740861,0.759566
3,0.079,0.528245,0.823098,0.841712,0.747072,0.779547
4,0.0753,0.54246,0.826764,0.857875,0.739841,0.778327
5,0.0732,0.531726,0.830431,0.820319,0.758733,0.778925
6,0.0705,0.555364,0.813932,0.806632,0.764595,0.773595
7,0.0702,0.549397,0.826764,0.832686,0.770419,0.78744
8,0.0699,0.57766,0.819432,0.846351,0.748936,0.778363
9,0.0696,0.558216,0.822181,0.858003,0.765591,0.793579
10,0.0675,0.567941,0.818515,0.844032,0.774266,0.796537


[I 2025-03-16 16:43:55,478] Trial 81 finished with value: 0.7963849374222804 and parameters: {'learning_rate': 0.004734653995499678, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 82 with params: {'learning_rate': 0.003927166299655035, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 10, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4286,0.484226,0.83593,0.820101,0.747112,0.771221
2,0.0881,0.503617,0.833181,0.827938,0.752065,0.773958
3,0.0747,0.487146,0.839597,0.874177,0.784324,0.814594
4,0.0706,0.494098,0.834097,0.866501,0.782839,0.8089
5,0.0685,0.499745,0.835014,0.853845,0.77154,0.798342
6,0.0675,0.504175,0.84143,0.866172,0.78207,0.806697
7,0.0672,0.503247,0.83868,0.845562,0.784506,0.802238
8,0.068,0.530968,0.836847,0.843327,0.785468,0.799187
9,0.0664,0.558049,0.826764,0.827293,0.75926,0.779723
10,0.0649,0.537338,0.828598,0.84135,0.770433,0.790958


[I 2025-03-16 16:53:28,774] Trial 82 finished with value: 0.803110297880145 and parameters: {'learning_rate': 0.003927166299655035, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 10, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 51 with value: 0.8121965030181337.


Trial 83 with params: {'learning_rate': 0.00266461494269436, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 6, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4891,0.51007,0.824931,0.828909,0.738216,0.767063
2,0.09,0.503618,0.836847,0.824234,0.764788,0.780285
3,0.0745,0.483144,0.837764,0.839129,0.759136,0.780836
4,0.07,0.491616,0.833181,0.831825,0.755125,0.77805
5,0.0679,0.489833,0.837764,0.833114,0.759425,0.783273
6,0.0659,0.48593,0.834097,0.837136,0.748078,0.777812
7,0.066,0.479273,0.842346,0.849888,0.78641,0.802073
8,0.0647,0.505761,0.833181,0.855981,0.769322,0.797097
9,0.0646,0.487191,0.843263,0.859791,0.798813,0.818519
10,0.0644,0.497256,0.83593,0.850424,0.78801,0.806521


[I 2025-03-16 17:02:50,681] Trial 83 finished with value: 0.8072998308459078 and parameters: {'learning_rate': 0.00266461494269436, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 6, 'lambda_param': 0.2, 'temperature': 2.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 84 with params: {'learning_rate': 0.001474334550874076, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 5, 'lambda_param': 0.4, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5884,0.531187,0.819432,0.735893,0.654986,0.677874
2,0.1038,0.505894,0.827681,0.796512,0.730226,0.749368
3,0.0772,0.510081,0.825848,0.810209,0.74049,0.759296
4,0.0715,0.494301,0.83593,0.844137,0.771514,0.794985
5,0.0682,0.496188,0.832264,0.839914,0.760103,0.78622
6,0.0671,0.494471,0.83593,0.829612,0.769375,0.786184
7,0.0661,0.49993,0.84143,0.823447,0.768337,0.783979
8,0.065,0.496932,0.836847,0.83796,0.769409,0.789359
9,0.0635,0.49089,0.831347,0.834403,0.761777,0.783077
10,0.0627,0.486749,0.837764,0.837916,0.773139,0.792752


[I 2025-03-16 17:12:11,979] Trial 84 finished with value: 0.7975285196490819 and parameters: {'learning_rate': 0.001474334550874076, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 5, 'lambda_param': 0.4, 'temperature': 3.0}. Best is trial 51 with value: 0.8121965030181337.


Trial 85 with params: {'learning_rate': 0.0011690934278709142, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 10, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6863,0.546075,0.813016,0.642848,0.612575,0.619057
2,0.1217,0.512737,0.821265,0.788177,0.720099,0.738342
3,0.0832,0.49512,0.834097,0.808159,0.745991,0.763241
4,0.0737,0.493786,0.839597,0.79613,0.750463,0.761589
5,0.0695,0.487877,0.845096,0.82555,0.76286,0.781848
6,0.0674,0.506615,0.831347,0.822809,0.748428,0.77251
7,0.0672,0.514335,0.826764,0.804939,0.744215,0.760291
8,0.0649,0.51295,0.829514,0.808556,0.742933,0.762148
9,0.0641,0.49108,0.836847,0.840707,0.758903,0.784724
10,0.0629,0.507115,0.835014,0.820306,0.744332,0.76812


[I 2025-03-16 17:15:12,256] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 0.00024696163656226093, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 46, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2111,0.850058,0.692942,0.311924,0.292172,0.283757
2,0.4723,0.659194,0.770852,0.535392,0.461791,0.470649
3,0.2486,0.607994,0.790101,0.604474,0.546119,0.564018
4,0.1594,0.591855,0.792851,0.65372,0.59217,0.611032
5,0.1208,0.575325,0.802016,0.681034,0.617307,0.635305
6,0.1022,0.578286,0.80385,0.707737,0.651577,0.668012
7,0.0911,0.565571,0.802016,0.733017,0.653848,0.676735
8,0.0845,0.55974,0.807516,0.725884,0.654627,0.67537
9,0.0801,0.556093,0.813932,0.756523,0.704492,0.717191
10,0.0762,0.558167,0.807516,0.795282,0.709838,0.733059


[I 2025-03-16 17:21:14,602] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.004475410706509862, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 24, 'lambda_param': 0.2, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5409,0.553418,0.814849,0.71856,0.670923,0.677831
2,0.0956,0.553326,0.817599,0.77606,0.707724,0.726504
3,0.077,0.545604,0.817599,0.815843,0.722616,0.751882
4,0.0721,0.565531,0.817599,0.819779,0.726783,0.753748
5,0.0688,0.572569,0.820348,0.824158,0.735348,0.763042
6,0.0681,0.567702,0.823098,0.852099,0.756734,0.786386
7,0.0686,0.567257,0.812099,0.809921,0.716798,0.746571
8,0.067,0.583683,0.814849,0.818843,0.726917,0.756364
9,0.0667,0.589027,0.818515,0.817326,0.740791,0.76647
10,0.066,0.584275,0.815765,0.813954,0.727331,0.755337


[I 2025-03-16 17:30:42,566] Trial 87 finished with value: 0.7745063332767106 and parameters: {'learning_rate': 0.004475410706509862, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 24, 'lambda_param': 0.2, 'temperature': 3.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 88 with params: {'learning_rate': 0.0017372714220887152, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 52, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6071,0.523177,0.829514,0.749159,0.671121,0.694471
2,0.0971,0.503023,0.830431,0.81034,0.733031,0.756185
3,0.0755,0.490575,0.83868,0.823927,0.745779,0.770339
4,0.0699,0.506283,0.834097,0.817474,0.748982,0.769677
5,0.0678,0.498731,0.839597,0.819385,0.750042,0.770566


[I 2025-03-16 17:32:14,018] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 0.0015006164224987905, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 26, 'lambda_param': 0.2, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5973,0.526057,0.816682,0.725395,0.641705,0.665208
2,0.1031,0.516348,0.822181,0.780637,0.711318,0.731172
3,0.0778,0.507056,0.830431,0.788248,0.745937,0.75523
4,0.0699,0.4933,0.837764,0.824713,0.744077,0.769478
5,0.0678,0.503363,0.834097,0.845937,0.766534,0.79218
6,0.0666,0.507501,0.83868,0.863964,0.770696,0.799903
7,0.0669,0.515961,0.829514,0.831566,0.76627,0.785203
8,0.0643,0.51068,0.834097,0.847305,0.759038,0.785335
9,0.063,0.505461,0.831347,0.841101,0.76366,0.789169
10,0.0626,0.501342,0.836847,0.84536,0.771817,0.795356


[I 2025-03-16 17:41:26,246] Trial 89 finished with value: 0.807890443497247 and parameters: {'learning_rate': 0.0015006164224987905, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 26, 'lambda_param': 0.2, 'temperature': 6.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 90 with params: {'learning_rate': 0.003771932591056331, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 24, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4491,0.50704,0.836847,0.794747,0.725876,0.745983
2,0.0868,0.519162,0.826764,0.814608,0.732951,0.757957
3,0.0729,0.502707,0.828598,0.800974,0.736583,0.754432
4,0.0702,0.518673,0.828598,0.814845,0.745393,0.765321
5,0.0687,0.516707,0.826764,0.861289,0.771375,0.802411
6,0.067,0.523095,0.824015,0.812606,0.75927,0.771645
7,0.0671,0.515336,0.83593,0.833309,0.780378,0.794768
8,0.0648,0.523747,0.829514,0.849838,0.76533,0.792829
9,0.064,0.507483,0.833181,0.837292,0.771245,0.791631
10,0.0632,0.528604,0.829514,0.834024,0.759588,0.779691


[I 2025-03-16 17:44:29,822] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.004015488866235626, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 31, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4411,0.503828,0.831347,0.80827,0.737395,0.760084
2,0.0875,0.49218,0.835014,0.852476,0.766109,0.794433
3,0.074,0.503891,0.837764,0.851912,0.774473,0.798909
4,0.0702,0.526211,0.825848,0.851047,0.767882,0.794538
5,0.069,0.508826,0.836847,0.834516,0.788714,0.796881
6,0.0687,0.546545,0.825848,0.836025,0.783776,0.795835
7,0.0676,0.525238,0.837764,0.845917,0.790456,0.806669
8,0.0654,0.546527,0.829514,0.852692,0.783239,0.803621
9,0.0642,0.514853,0.840513,0.858885,0.791876,0.812725
10,0.063,0.525867,0.834097,0.846829,0.793645,0.809139


[I 2025-03-16 17:50:39,208] Trial 91 pruned. 


Trial 92 with params: {'learning_rate': 0.0008551867006766407, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 33, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7472,0.566629,0.806599,0.633674,0.568185,0.580162
2,0.1382,0.526034,0.825848,0.756032,0.691085,0.710542
3,0.0892,0.509259,0.828598,0.779998,0.719152,0.737397
4,0.0775,0.508055,0.832264,0.844882,0.741099,0.772876
5,0.072,0.50301,0.832264,0.829141,0.740596,0.771215
6,0.068,0.509025,0.834097,0.848086,0.753442,0.784071
7,0.0658,0.508955,0.83868,0.851683,0.756469,0.788575
8,0.0644,0.515741,0.831347,0.829695,0.747109,0.771759
9,0.0643,0.514647,0.832264,0.844065,0.758987,0.784352
10,0.0636,0.508592,0.83868,0.81617,0.763134,0.777909


[I 2025-03-16 18:00:12,526] Trial 92 finished with value: 0.7961638898224431 and parameters: {'learning_rate': 0.0008551867006766407, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 33, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 93 with params: {'learning_rate': 0.00034858740125773117, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 52, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0678,0.734171,0.736939,0.417027,0.371608,0.369451
2,0.3198,0.588627,0.794684,0.546449,0.532048,0.532429
3,0.1617,0.576655,0.796517,0.686166,0.609022,0.632025
4,0.1135,0.561627,0.805683,0.725727,0.66019,0.680129
5,0.0937,0.548565,0.813016,0.780942,0.6968,0.723356
6,0.0835,0.547578,0.814849,0.790272,0.723116,0.74372
7,0.0775,0.545899,0.818515,0.784067,0.727339,0.743924
8,0.0741,0.543369,0.814849,0.800776,0.72477,0.748645
9,0.0713,0.53425,0.819432,0.792712,0.738646,0.754243
10,0.0694,0.540127,0.823098,0.797856,0.729864,0.749662


[I 2025-03-16 18:06:06,846] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.0004203360207916289, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 47, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0385,0.709647,0.750687,0.40988,0.394345,0.388656
2,0.2738,0.58026,0.8011,0.634652,0.573,0.592765
3,0.1385,0.570971,0.800183,0.725099,0.646279,0.669577
4,0.1025,0.534883,0.822181,0.800487,0.71526,0.742573
5,0.0872,0.543707,0.810266,0.799583,0.707028,0.737065
6,0.0787,0.529492,0.822181,0.820774,0.731506,0.76175
7,0.0735,0.546922,0.816682,0.803073,0.725683,0.751252
8,0.0713,0.537043,0.818515,0.814785,0.721225,0.751893
9,0.0705,0.52675,0.817599,0.80139,0.719984,0.746489
10,0.0671,0.534355,0.817599,0.807017,0.74423,0.762491


[I 2025-03-16 18:09:02,531] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 0.0007505646032787231, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 11, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7634,0.585056,0.79835,0.545422,0.538074,0.534309
2,0.1555,0.527326,0.822181,0.744787,0.65936,0.68762
3,0.0956,0.530679,0.820348,0.812329,0.717002,0.746096
4,0.0806,0.512635,0.833181,0.840888,0.737598,0.769538
5,0.0741,0.513386,0.831347,0.837598,0.739182,0.771127
6,0.0694,0.498852,0.831347,0.830537,0.746002,0.770568
7,0.0673,0.510094,0.831347,0.857651,0.753885,0.787814
8,0.0652,0.50598,0.837764,0.85223,0.758502,0.786713
9,0.0648,0.493599,0.842346,0.85839,0.759815,0.791955
10,0.064,0.500206,0.83868,0.831402,0.751841,0.776147


[I 2025-03-16 18:15:14,874] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.004809688838545827, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4113,0.520845,0.824931,0.822047,0.736071,0.758993
2,0.0909,0.534894,0.822181,0.801137,0.726314,0.747593
3,0.0778,0.537339,0.826764,0.811354,0.747576,0.767976
4,0.0735,0.551649,0.822181,0.810412,0.747798,0.767276
5,0.0708,0.54167,0.827681,0.843052,0.765313,0.79156
6,0.0702,0.542406,0.823098,0.831853,0.761086,0.785454
7,0.0696,0.573548,0.824015,0.851769,0.767669,0.796068
8,0.0687,0.566705,0.823098,0.835955,0.764555,0.786344
9,0.0671,0.562922,0.816682,0.833231,0.764961,0.786655
10,0.0658,0.604219,0.814849,0.833014,0.766846,0.785362


[I 2025-03-16 18:21:44,386] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.002326693838177691, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 27, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5795,0.525855,0.818515,0.726408,0.67102,0.686271
2,0.0965,0.510591,0.826764,0.777267,0.731287,0.74387
3,0.0758,0.493802,0.833181,0.807868,0.736171,0.7568
4,0.0715,0.508284,0.822181,0.812345,0.737882,0.760762
5,0.0682,0.509441,0.832264,0.803482,0.757038,0.768788
6,0.0669,0.507931,0.824931,0.819324,0.752837,0.772801
7,0.0663,0.516995,0.831347,0.823177,0.755308,0.771873
8,0.0652,0.521764,0.828598,0.829316,0.756581,0.777387
9,0.0647,0.512224,0.832264,0.802155,0.735534,0.754984
10,0.0637,0.505054,0.835014,0.816084,0.757061,0.771183


[I 2025-03-16 18:24:43,380] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.00035089984050994246, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0431,0.747857,0.731439,0.348667,0.35036,0.339883
2,0.332,0.594657,0.796517,0.586689,0.54009,0.543785
3,0.1646,0.570118,0.797434,0.691037,0.606934,0.631586
4,0.1146,0.562,0.808433,0.744706,0.669188,0.692061
5,0.0944,0.555085,0.810266,0.754224,0.680558,0.701107
6,0.0842,0.549393,0.814849,0.805196,0.715561,0.744679
7,0.0789,0.552203,0.811182,0.799041,0.723057,0.743626
8,0.074,0.548876,0.810266,0.789213,0.697803,0.724126
9,0.0709,0.540197,0.824015,0.825655,0.759284,0.780539
10,0.0701,0.550552,0.817599,0.80237,0.742347,0.757735


[I 2025-03-16 18:27:44,471] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 2.8092689649211085e-05, 'weight_decay': 0.002, 'adam_beta1': 0.97, 'warmup_steps': 12, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0054,1.688181,0.36022,0.036938,0.07315,0.046413
2,1.4807,1.422441,0.450046,0.079171,0.103638,0.077671
3,1.2848,1.30219,0.503208,0.134005,0.131853,0.107006
4,1.1647,1.212121,0.545371,0.155793,0.159003,0.139893
5,1.0621,1.13881,0.588451,0.168713,0.188367,0.168925
6,0.9785,1.083682,0.615032,0.187468,0.212204,0.185967
7,0.9039,1.0299,0.632447,0.219433,0.222728,0.200046
8,0.8368,0.988522,0.63978,0.249983,0.230881,0.210754
9,0.7778,0.952035,0.654445,0.241018,0.245298,0.227136
10,0.7253,0.92178,0.673694,0.30515,0.270997,0.265693


[I 2025-03-16 18:30:55,902] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.002273290973975607, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5155,0.535643,0.816682,0.764537,0.681439,0.705807
2,0.0948,0.520903,0.817599,0.801912,0.717099,0.741761
3,0.0764,0.503534,0.830431,0.787383,0.729266,0.747788
4,0.0715,0.515053,0.830431,0.823863,0.752421,0.773024
5,0.0698,0.520012,0.829514,0.819016,0.737971,0.764747
6,0.0676,0.529764,0.825848,0.809314,0.743023,0.763761
7,0.067,0.514103,0.830431,0.81444,0.753676,0.773575
8,0.0654,0.535171,0.822181,0.814042,0.737764,0.761156
9,0.0651,0.524929,0.827681,0.845459,0.753095,0.782514
10,0.0638,0.542184,0.826764,0.830846,0.742294,0.770717


[I 2025-03-16 18:37:15,564] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 0.004228166592729536, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4044,0.504242,0.830431,0.794021,0.733663,0.750574
2,0.0866,0.511297,0.829514,0.809269,0.755008,0.769534
3,0.0751,0.502399,0.83593,0.838411,0.767781,0.789458
4,0.0714,0.502549,0.83868,0.837249,0.763111,0.786018
5,0.0697,0.506096,0.836847,0.833537,0.768748,0.788752
6,0.0682,0.504135,0.839597,0.836454,0.768253,0.789457
7,0.0674,0.502717,0.847846,0.870657,0.789636,0.811046
8,0.0666,0.538485,0.829514,0.857263,0.780748,0.802803
9,0.0668,0.525859,0.834097,0.829669,0.77756,0.788791
10,0.0653,0.550899,0.829514,0.840685,0.771701,0.793334


[I 2025-03-16 18:46:29,177] Trial 101 finished with value: 0.8104994731958252 and parameters: {'learning_rate': 0.004228166592729536, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 51 with value: 0.8121965030181337.


Trial 102 with params: {'learning_rate': 0.002559054618277403, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4683,0.524402,0.820348,0.797816,0.696082,0.728972
2,0.0894,0.50509,0.827681,0.815906,0.748771,0.767313
3,0.0736,0.488252,0.83593,0.85069,0.759706,0.790625
4,0.0697,0.501687,0.828598,0.83055,0.743283,0.769658
5,0.069,0.519394,0.835014,0.798579,0.733733,0.75127
6,0.0666,0.510492,0.832264,0.83915,0.755378,0.780563
7,0.065,0.526938,0.827681,0.814033,0.735972,0.757198
8,0.0645,0.531179,0.835014,0.827083,0.756738,0.776867
9,0.063,0.514253,0.836847,0.845891,0.764888,0.790363
10,0.0627,0.533676,0.826764,0.836535,0.751576,0.777249


[I 2025-03-16 18:55:50,410] Trial 102 finished with value: 0.7981899135598972 and parameters: {'learning_rate': 0.002559054618277403, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 4.0}. Best is trial 51 with value: 0.8121965030181337.


Trial 103 with params: {'learning_rate': 3.900145029980524e-05, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 17, 'lambda_param': 0.8, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8553,1.536896,0.409716,0.066303,0.088747,0.058941
2,1.3462,1.317535,0.495875,0.135227,0.12784,0.103189
3,1.1622,1.200074,0.539872,0.158196,0.155606,0.135867
4,1.0283,1.09922,0.59945,0.171073,0.199734,0.178615
5,0.9101,1.024897,0.625115,0.212713,0.221028,0.195593


[I 2025-03-16 18:57:24,665] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.004549505615604967, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 6, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4241,0.496107,0.835014,0.794815,0.72434,0.745262
2,0.0892,0.523079,0.826764,0.825774,0.747785,0.770901
3,0.0766,0.508653,0.837764,0.855807,0.779571,0.80445
4,0.0723,0.521161,0.835014,0.86921,0.773042,0.803844
5,0.0707,0.518953,0.83593,0.854971,0.782625,0.803372
6,0.0693,0.534988,0.833181,0.840635,0.767368,0.789119
7,0.07,0.537626,0.830431,0.827523,0.77064,0.784536
8,0.07,0.549705,0.829514,0.813766,0.752112,0.767618
9,0.0667,0.543248,0.829514,0.838135,0.757619,0.782803
10,0.066,0.535834,0.828598,0.846221,0.760687,0.784208


[I 2025-03-16 19:00:37,483] Trial 104 pruned. 


Trial 105 with params: {'learning_rate': 0.004017392169670613, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 21, 'lambda_param': 0.4, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4492,0.512312,0.824015,0.785979,0.731876,0.747855
2,0.0869,0.510506,0.828598,0.812444,0.735347,0.75807
3,0.0739,0.520387,0.824931,0.822169,0.750241,0.773025
4,0.0704,0.53044,0.828598,0.837414,0.763939,0.78634
5,0.0685,0.514266,0.833181,0.831567,0.774614,0.790939
6,0.068,0.543334,0.820348,0.818755,0.749432,0.771285
7,0.0677,0.565614,0.822181,0.804995,0.740104,0.760414
8,0.0674,0.559281,0.824931,0.834424,0.754131,0.778629
9,0.065,0.577749,0.824015,0.842244,0.761358,0.786426
10,0.0649,0.563198,0.824015,0.840423,0.754452,0.779323


[I 2025-03-16 19:09:41,317] Trial 105 finished with value: 0.8024007404261739 and parameters: {'learning_rate': 0.004017392169670613, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 21, 'lambda_param': 0.4, 'temperature': 3.0}. Best is trial 51 with value: 0.8121965030181337.


Trial 106 with params: {'learning_rate': 0.000957289736022682, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 28, 'lambda_param': 0.2, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7223,0.565153,0.810266,0.645152,0.583751,0.597825
2,0.1292,0.520858,0.825848,0.814235,0.724743,0.753181
3,0.0863,0.520902,0.831347,0.80254,0.729193,0.753126
4,0.0758,0.498381,0.83593,0.831176,0.736232,0.766565
5,0.0704,0.50034,0.83868,0.827624,0.74239,0.770206
6,0.0667,0.512428,0.829514,0.823878,0.734645,0.763959
7,0.0658,0.497709,0.833181,0.845554,0.737886,0.774654
8,0.0635,0.512517,0.834097,0.838151,0.755503,0.782128
9,0.0638,0.501406,0.840513,0.823585,0.754424,0.773897
10,0.0645,0.511779,0.831347,0.823945,0.750026,0.770025


[I 2025-03-16 19:18:53,664] Trial 106 finished with value: 0.7889678575446838 and parameters: {'learning_rate': 0.000957289736022682, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 28, 'lambda_param': 0.2, 'temperature': 5.0}. Best is trial 51 with value: 0.8121965030181337.


Trial 107 with params: {'learning_rate': 0.0016029320835677379, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 36, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss


[W 2025-03-16 19:18:59,784] Trial 107 failed with parameters: {'learning_rate': 0.0016029320835677379, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 36, 'lambda_param': 0.0, 'temperature': 4.0} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 250, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2241, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2548, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", li

KeyboardInterrupt: 

In [71]:
print(best_trial4)

NameError: name 'best_trial4' is not defined

In [72]:
print("Best normal training score: ", best_trial)
print("Best distilation trianing score: ", best_trial2)
print("Best normal training score with augmentations: ", best_trial3)
print("Best distilation trianing score with augmentations: ",best_trial4)

Best normal training score:  BestRun(run_id='52', objective=0.7267722601596618, hyperparameters={'learning_rate': 0.004185238693319757, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4}, run_summary=None)
Best distilation trianing score:  BestRun(run_id='126', objective=0.7476729369943443, hyperparameters={'learning_rate': 0.004849961480952609, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.8, 'temperature': 5.0}, run_summary=None)
Best normal training score with augmentations:  BestRun(run_id='121', objective=0.7876336386869527, hyperparameters={'learning_rate': 0.004220661238506302, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 51}, run_summary=None)


NameError: name 'best_trial4' is not defined