In [25]:
from transformers import BasicTokenizer, EarlyStoppingCallback, Trainer
from datasets import concatenate_datasets, load_from_disk
import kagglehub
import optuna
import torch
import math
import base

In [26]:
base.reset_seed()

In [27]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [28]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [29]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "trec"

In [30]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_fine")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_fine")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_fine")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_fine")

all_data = concatenate_datasets([load_from_disk(file) for file in [f"~/data/{DATASET}/eval-logits_fine", f"~/data/{DATASET}/test-logits_fine", f"~/data/{DATASET}/train-logits-augmented_fine"]])
tokenizer = BasicTokenizer(do_lower_case=True)

In [31]:
train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), train_data))
eval_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), eval_data))
test_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), test_data))

all_train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_train_data))

all_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_data))

In [32]:
vocab = base.get_vocab(all_data_tokens)

In [33]:
word_index = dict(zip(vocab, range(len(vocab))))

In [34]:
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [35]:
print(len(vocab))
num_tokens = len(vocab) + 2
embedding_dim = 300

8766


In [36]:
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [37]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [38]:
train_padded_data = list(map(lambda x: base.padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: base.padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: base.padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: base.padd(x,60), all_train_data_index))

In [39]:
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [40]:
num_epochs = 30
batch_size = 128

In [41]:
#Nápočet epoch na steps
data_length = len(train_data)
min_r = math.ceil(data_length/batch_size)*5
max_r = math.ceil(data_length/batch_size)*num_epochs
warm_up = math.ceil(data_length/batch_size/10)

In [42]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "adam_beta1" : trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up)
    }   
    print(f"Trial {trial.number} with params: {params}")
    return params

In [43]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [44]:
def get_BiLSTM():
    return base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50)

In [45]:
base.reset_seed()

In [46]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base_fine_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-base_fine_hp-search", epochs=num_epochs, batch_size=batch_size)

In [47]:
trainer = Trainer(
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [48]:
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Test-base",
    n_trials=150
)

[I 2025-03-15 09:38:56,470] A new study created in memory with name: Test-base


Trial 0 with params: {'learning_rate': 0.0001025350969016849, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7574,3.453929,0.176902,0.003538,0.02,0.006012
2,3.2644,3.148399,0.176902,0.003538,0.02,0.006012
3,3.0994,3.006927,0.191567,0.024083,0.025549,0.014725
4,2.9168,2.841771,0.340972,0.027059,0.063758,0.037581
5,2.7997,2.690734,0.359303,0.037905,0.07061,0.044083
6,2.633,2.570346,0.378552,0.037064,0.080066,0.050215
7,2.5055,2.441238,0.405133,0.041689,0.08648,0.055721
8,2.4115,2.352818,0.417049,0.064615,0.09045,0.063007
9,2.3102,2.269185,0.43538,0.067741,0.098012,0.071546
10,2.2162,2.190419,0.461962,0.084365,0.112096,0.08638


[I 2025-03-15 09:39:42,963] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 2.6368755339723032e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.881,3.83861,0.176902,0.003538,0.02,0.006012
2,3.7795,3.69424,0.176902,0.003538,0.02,0.006012
3,3.551,3.36732,0.176902,0.003538,0.02,0.006012
4,3.265,3.199631,0.176902,0.003538,0.02,0.006012
5,3.2041,3.139016,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 09:40:05,590] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 0.00041917115166952007, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3961,2.923635,0.298808,0.035805,0.056288,0.034684
2,2.6401,2.368158,0.421632,0.056331,0.092828,0.063935
3,2.194,2.033916,0.48121,0.120576,0.120001,0.095275
4,1.9125,1.814344,0.550871,0.161849,0.169673,0.150859
5,1.6822,1.620763,0.581118,0.194995,0.199028,0.185277


[I 2025-03-15 09:40:28,809] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8768,2.205863,0.466544,0.100829,0.118321,0.094381
2,1.8903,1.715888,0.563703,0.232429,0.194116,0.181503
3,1.4289,1.331846,0.670944,0.333808,0.317516,0.311905
4,1.0475,1.233193,0.692942,0.397584,0.359473,0.360523
5,0.7734,1.100223,0.731439,0.480038,0.452971,0.454218
6,0.5533,1.060179,0.730522,0.555773,0.491313,0.506943
7,0.3796,1.131954,0.739688,0.625383,0.509905,0.538403
8,0.2644,1.120521,0.752521,0.63374,0.594377,0.595061
9,0.1659,1.188065,0.754354,0.634212,0.567363,0.583786
10,0.0983,1.205021,0.768103,0.649386,0.632123,0.627267


[I 2025-03-15 09:41:22,437] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 6.62431060594998e-05, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8186,3.65598,0.176902,0.003538,0.02,0.006012
2,3.3925,3.19707,0.176902,0.003538,0.02,0.006012
3,3.153,3.074793,0.176902,0.003538,0.02,0.006012
4,3.0107,2.952493,0.255729,0.028561,0.040992,0.028658
5,2.9156,2.818059,0.35747,0.037794,0.069965,0.044589


[I 2025-03-15 09:41:47,734] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.0004480975918214954, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3526,2.900689,0.311641,0.050915,0.058621,0.039463
2,2.6145,2.344055,0.429881,0.070778,0.096404,0.067819
3,2.1824,2.018939,0.488543,0.106089,0.128534,0.10022
4,1.8865,1.784999,0.547204,0.165762,0.169149,0.149664
5,1.649,1.612731,0.593034,0.221606,0.208149,0.189671
6,1.4214,1.518356,0.614115,0.254456,0.240655,0.227674
7,1.2722,1.401311,0.64253,0.342157,0.276293,0.275677
8,1.1398,1.321012,0.672777,0.324635,0.309545,0.305712
9,0.995,1.280856,0.665445,0.373919,0.323223,0.325064
10,0.8785,1.222526,0.684693,0.406687,0.33679,0.347677


[I 2025-03-15 09:42:34,783] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.582,3.166328,0.176902,0.003538,0.02,0.006012
2,3.0307,2.856562,0.340055,0.029577,0.064475,0.039115
3,2.745,2.598453,0.376719,0.039032,0.077235,0.049864
4,2.5009,2.387861,0.413382,0.058379,0.089016,0.06151
5,2.3247,2.209584,0.456462,0.081787,0.108211,0.082534


[I 2025-03-15 09:42:59,657] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 0.00039710847107924746, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3798,3.012966,0.206233,0.018918,0.031636,0.019412
2,2.7852,2.562267,0.36572,0.038868,0.073919,0.046737
3,2.3905,2.200275,0.451879,0.0935,0.104973,0.079752
4,2.0882,1.961124,0.504125,0.103065,0.137788,0.116007
5,1.8538,1.788925,0.570119,0.196503,0.18351,0.167452
6,1.6334,1.649984,0.588451,0.216603,0.208861,0.197046
7,1.4759,1.537108,0.597617,0.242775,0.220124,0.212875
8,1.3329,1.449189,0.637947,0.288265,0.260938,0.254366
9,1.1906,1.393275,0.653529,0.319103,0.273642,0.277852
10,1.0738,1.344083,0.658112,0.314038,0.299593,0.291022


[I 2025-03-15 09:43:46,033] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.498208643215546e-05, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8943,3.872589,0.176902,0.003584,0.02,0.006079
2,3.8539,3.834196,0.176902,0.003538,0.02,0.006012
3,3.8143,3.788391,0.176902,0.003538,0.02,0.006012
4,3.7588,3.728727,0.176902,0.003538,0.02,0.006012
5,3.6999,3.649857,0.176902,0.003538,0.02,0.006012
6,3.6038,3.551848,0.176902,0.003538,0.02,0.006012
7,3.4969,3.440398,0.176902,0.003538,0.02,0.006012
8,3.3973,3.334496,0.176902,0.003538,0.02,0.006012
9,3.308,3.258631,0.176902,0.003538,0.02,0.006012
10,3.2488,3.220245,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 09:44:32,639] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 6.639623079859457e-05, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8248,3.678069,0.176902,0.003538,0.02,0.006012
2,3.4328,3.219928,0.176902,0.003538,0.02,0.006012
3,3.1809,3.106549,0.176902,0.003538,0.02,0.006012
4,3.0628,2.989141,0.248396,0.031972,0.038925,0.025503
5,2.9638,2.858771,0.32539,0.047691,0.059542,0.037823


[I 2025-03-15 09:44:55,136] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.003327590120039613, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8472,1.994534,0.498625,0.111151,0.135479,0.110326
2,1.6995,1.484939,0.633364,0.293013,0.275846,0.259464
3,1.2189,1.221298,0.691109,0.385561,0.364727,0.350486
4,0.8115,1.058276,0.730522,0.422707,0.412204,0.408662
5,0.5033,0.982379,0.75802,0.575607,0.518112,0.526904
6,0.3015,1.163593,0.735105,0.599892,0.539442,0.548958
7,0.1602,1.31326,0.75802,0.632682,0.535106,0.559239
8,0.0936,1.275627,0.76352,0.62951,0.582174,0.593079
9,0.0408,1.362129,0.768103,0.647561,0.588962,0.601886
10,0.017,1.420301,0.776352,0.667011,0.632302,0.638629


[I 2025-03-15 09:46:33,574] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.0035221780342879175, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7261,1.96374,0.508708,0.137678,0.150694,0.128473
2,1.6317,1.39447,0.661778,0.33882,0.313397,0.302693
3,1.1405,1.143685,0.704858,0.367069,0.365754,0.354962
4,0.7425,1.107809,0.719523,0.534763,0.455745,0.474033
5,0.4707,1.037737,0.756187,0.579181,0.509712,0.528714
6,0.2776,1.055779,0.773602,0.637355,0.603105,0.604812
7,0.1621,1.204902,0.769936,0.689278,0.59567,0.613994
8,0.0851,1.285902,0.766269,0.659318,0.621234,0.624653
9,0.0416,1.376813,0.769019,0.66596,0.613286,0.624901
10,0.0263,1.401448,0.772686,0.678259,0.629443,0.641428


[I 2025-03-15 09:48:08,078] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.0012537553169436762, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9915,2.366005,0.418882,0.095091,0.099108,0.078017
2,2.0388,1.86406,0.536205,0.156459,0.17516,0.1512
3,1.61,1.478112,0.627864,0.2641,0.246353,0.239871
4,1.205,1.27145,0.681943,0.410859,0.344918,0.351107
5,0.921,1.161568,0.71494,0.42927,0.374835,0.379074
6,0.7062,1.082818,0.721357,0.473863,0.437454,0.43919
7,0.5285,1.147804,0.728689,0.529892,0.440793,0.459627
8,0.4179,1.123052,0.736939,0.572898,0.521394,0.530185
9,0.2949,1.136426,0.748854,0.646539,0.544293,0.57398
10,0.219,1.170268,0.746104,0.59805,0.568099,0.563358


[I 2025-03-15 09:49:38,878] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.004449518806372289, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6527,1.993032,0.507791,0.161805,0.159368,0.139973
2,1.6087,1.454402,0.644363,0.314821,0.299022,0.294208
3,1.088,1.126407,0.71769,0.404823,0.395322,0.390059
4,0.6989,1.069695,0.727773,0.542493,0.489192,0.498771
5,0.3754,1.003037,0.769019,0.584804,0.548462,0.552146
6,0.2151,1.113516,0.772686,0.620959,0.582512,0.587343
7,0.111,1.169671,0.767186,0.663055,0.60671,0.617495
8,0.0464,1.227151,0.787351,0.680046,0.635633,0.639823
9,0.0413,1.218992,0.787351,0.687688,0.64655,0.650925
10,0.0117,1.297936,0.783685,0.659362,0.632818,0.634809


[I 2025-03-15 09:52:12,932] Trial 13 finished with value: 0.6634038842579802 and parameters: {'learning_rate': 0.004449518806372289, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 13 with value: 0.6634038842579802.


Trial 14 with params: {'learning_rate': 0.0031154959032138716, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7894,2.038462,0.491292,0.137752,0.136232,0.114897
2,1.7103,1.553835,0.620532,0.269465,0.266605,0.25458
3,1.2481,1.186977,0.705775,0.421587,0.364969,0.371252
4,0.8215,1.095982,0.728689,0.501909,0.445456,0.462591
5,0.5326,1.015504,0.761687,0.617986,0.533212,0.554024
6,0.3282,1.102181,0.761687,0.641364,0.572279,0.587222
7,0.1962,1.177941,0.768103,0.676146,0.586436,0.612589
8,0.1158,1.242107,0.75802,0.673132,0.590145,0.608787
9,0.0595,1.25141,0.780018,0.681683,0.639212,0.648027
10,0.0265,1.357559,0.776352,0.663539,0.626731,0.630466


[I 2025-03-15 09:53:47,350] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.001402963633287756, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0431,2.37163,0.412466,0.065291,0.095501,0.069426
2,2.0189,1.847809,0.541705,0.182863,0.179847,0.156648
3,1.5564,1.41101,0.648029,0.283262,0.276261,0.268139
4,1.1236,1.279591,0.68011,0.402188,0.36065,0.362495
5,0.8603,1.170726,0.715857,0.467977,0.41121,0.416174
6,0.6588,1.084648,0.727773,0.479364,0.453853,0.453898
7,0.4602,1.150886,0.744271,0.525584,0.444112,0.463883
8,0.345,1.1989,0.72319,0.576008,0.517127,0.522728
9,0.239,1.182907,0.747021,0.643243,0.565382,0.581472
10,0.1588,1.328578,0.735105,0.63989,0.579274,0.58964


[I 2025-03-15 09:54:34,873] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.0032841533482735647, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7922,2.008636,0.497709,0.130439,0.139075,0.11455
2,1.6689,1.655465,0.608616,0.267798,0.26112,0.247115
3,1.1946,1.180022,0.703025,0.394563,0.359022,0.35452
4,0.7794,1.062104,0.730522,0.515021,0.476393,0.475253
5,0.4997,1.007099,0.767186,0.582591,0.52735,0.534212
6,0.292,1.144515,0.76077,0.650117,0.55862,0.585399
7,0.1695,1.242699,0.761687,0.651584,0.583897,0.60471
8,0.0928,1.287873,0.769936,0.641586,0.627636,0.618004
9,0.0518,1.310055,0.777269,0.640704,0.615804,0.616009
10,0.0248,1.344837,0.782768,0.655187,0.639541,0.630602


[I 2025-03-15 09:56:09,418] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.0019481663329480377, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9148,2.208241,0.441797,0.110241,0.106604,0.087949
2,1.8857,1.685688,0.580202,0.244938,0.207495,0.192782
3,1.3855,1.313174,0.670027,0.348484,0.318958,0.314172
4,0.9834,1.172277,0.715857,0.446111,0.400136,0.402361
5,0.7044,1.085839,0.734189,0.531475,0.463269,0.475091
6,0.4973,1.052357,0.737855,0.563927,0.489238,0.503769
7,0.3181,1.145932,0.747021,0.615782,0.493959,0.519065
8,0.2115,1.211597,0.747938,0.633914,0.568511,0.586651
9,0.1408,1.196618,0.765353,0.672851,0.60911,0.627603
10,0.0873,1.35544,0.75527,0.648961,0.611049,0.615806


[I 2025-03-15 09:56:59,123] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.004277248223462774, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.61,1.981408,0.497709,0.123829,0.145879,0.126815
2,1.6484,1.482729,0.624198,0.256776,0.265735,0.249466
3,1.1822,1.193339,0.698442,0.381129,0.378698,0.370732
4,0.7726,1.08477,0.718607,0.437477,0.416742,0.414381
5,0.4683,1.125993,0.752521,0.546042,0.486237,0.500179
6,0.2766,1.131662,0.757104,0.572906,0.539846,0.544164
7,0.1504,1.340906,0.772686,0.690974,0.645311,0.639213
8,0.0954,1.388718,0.765353,0.654438,0.590642,0.607732
9,0.0554,1.355216,0.773602,0.611452,0.616944,0.600714
10,0.0322,1.38421,0.789184,0.662446,0.614532,0.624601


[I 2025-03-15 09:58:36,572] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.0013513720151519398, 'weight_decay': 0.0, 'adam_beta1': 0.98, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1651,2.581485,0.383135,0.041761,0.08029,0.052645
2,2.2932,2.088705,0.48121,0.09889,0.125683,0.101425
3,1.9294,1.819657,0.542621,0.2073,0.167401,0.153076
4,1.6207,1.585532,0.604033,0.236683,0.236777,0.220286
5,1.3417,1.424665,0.646196,0.313669,0.306442,0.298779
6,1.0677,1.258569,0.689276,0.377651,0.344045,0.347307
7,0.8325,1.194153,0.706691,0.398699,0.377181,0.377745
8,0.6473,1.162445,0.724106,0.483895,0.443552,0.445929
9,0.4992,1.177384,0.736022,0.501738,0.465986,0.471457
10,0.3465,1.247689,0.743355,0.576644,0.517087,0.532283


[I 2025-03-15 10:00:13,776] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 0.0006416110044393996, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2413,2.784796,0.324473,0.035723,0.066534,0.041465
2,2.4133,2.143623,0.472961,0.090576,0.119683,0.095212
3,1.9522,1.800888,0.538038,0.18345,0.158936,0.145398
4,1.6246,1.573097,0.614115,0.246189,0.225757,0.216281
5,1.3577,1.400667,0.647113,0.311561,0.277106,0.268973
6,1.1187,1.287513,0.665445,0.383538,0.324058,0.322278
7,0.9433,1.243641,0.68011,0.348412,0.327803,0.32739
8,0.8259,1.16194,0.701192,0.428192,0.378353,0.387437
9,0.6927,1.10468,0.709441,0.458942,0.403797,0.410844
10,0.5703,1.141322,0.71494,0.460657,0.425182,0.433939


[I 2025-03-15 10:01:04,083] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.004685762019540236, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7547,2.012946,0.509624,0.14316,0.150133,0.130601
2,1.6693,1.527334,0.621448,0.285874,0.263705,0.24774
3,1.1606,1.175666,0.705775,0.40006,0.388505,0.376691
4,0.7468,1.139124,0.716774,0.49187,0.433018,0.446551
5,0.4412,1.1084,0.758937,0.621683,0.545745,0.559429
6,0.2576,1.097566,0.761687,0.659952,0.639057,0.63502
7,0.1297,1.28769,0.772686,0.646753,0.61913,0.609621
8,0.0784,1.233371,0.787351,0.694743,0.623926,0.632513
9,0.0423,1.264062,0.785518,0.66756,0.642993,0.641045
10,0.0232,1.309633,0.780935,0.670377,0.644622,0.642372


[I 2025-03-15 10:03:29,186] Trial 21 finished with value: 0.6515584960531714 and parameters: {'learning_rate': 0.004685762019540236, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 1}. Best is trial 13 with value: 0.6634038842579802.


Trial 22 with params: {'learning_rate': 0.0016316997568603284, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0318,2.378743,0.416132,0.06281,0.09715,0.071767
2,2.0582,1.835391,0.547204,0.188793,0.181749,0.163784
3,1.6016,1.478545,0.63428,0.290878,0.271133,0.266748
4,1.1702,1.270896,0.673694,0.355553,0.34658,0.339928
5,0.8794,1.159683,0.705775,0.417004,0.403057,0.396663
6,0.6467,1.156273,0.72044,0.491445,0.44217,0.451282
7,0.4727,1.170475,0.727773,0.531822,0.464479,0.471282
8,0.346,1.203109,0.731439,0.575683,0.530332,0.537285
9,0.222,1.222758,0.751604,0.632946,0.563711,0.583147
10,0.1511,1.2735,0.752521,0.643897,0.596755,0.605623


[I 2025-03-15 10:04:17,990] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.004656814533907507, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6351,1.94749,0.493126,0.159084,0.158546,0.145449
2,1.5947,1.434827,0.63978,0.315885,0.282743,0.276879
3,1.0846,1.15974,0.72044,0.391302,0.395708,0.384168
4,0.6634,1.077087,0.742438,0.569673,0.501862,0.522176
5,0.3742,1.128167,0.76077,0.662095,0.566959,0.595438
6,0.2139,1.157214,0.770852,0.650106,0.611234,0.618026
7,0.1088,1.301372,0.782768,0.666165,0.630966,0.631717
8,0.052,1.297086,0.788268,0.673478,0.63226,0.642967
9,0.0293,1.324946,0.791934,0.67434,0.637279,0.641556
10,0.0118,1.340872,0.79835,0.68496,0.656482,0.661206


[I 2025-03-15 10:06:45,798] Trial 23 finished with value: 0.6575556935782846 and parameters: {'learning_rate': 0.004656814533907507, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 2}. Best is trial 13 with value: 0.6634038842579802.


Trial 24 with params: {'learning_rate': 0.0010739061248478078, 'weight_decay': 0.007, 'adam_beta1': 0.96, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2204,2.647717,0.368469,0.040391,0.075958,0.049064
2,2.3222,2.069562,0.484876,0.088169,0.126954,0.100271
3,1.9005,1.767625,0.553621,0.188057,0.17189,0.160774
4,1.5484,1.526128,0.637947,0.285769,0.261912,0.252341
5,1.2457,1.322828,0.669111,0.339239,0.313462,0.305376
6,0.9819,1.208893,0.687443,0.379538,0.36132,0.356647
7,0.8024,1.180651,0.709441,0.417778,0.376974,0.383915
8,0.6385,1.159329,0.706691,0.470748,0.435577,0.43857
9,0.4734,1.137183,0.727773,0.520797,0.451398,0.46733
10,0.3537,1.160506,0.726856,0.555427,0.503929,0.515497


[I 2025-03-15 10:08:20,145] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.0038625880083034284, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9086,2.153297,0.461045,0.108098,0.126394,0.104458
2,1.7988,1.601592,0.593951,0.220227,0.230241,0.214529
3,1.2987,1.292662,0.686526,0.364771,0.344417,0.340787
4,0.8532,1.15167,0.71769,0.445584,0.41095,0.411927
5,0.5524,1.145808,0.741522,0.556367,0.489763,0.500178
6,0.3437,1.163364,0.752521,0.617206,0.567093,0.56786
7,0.1999,1.281355,0.752521,0.631376,0.574743,0.584802
8,0.116,1.290219,0.780018,0.643234,0.605928,0.611306
9,0.0646,1.359746,0.777269,0.647774,0.615697,0.619438
10,0.04,1.402978,0.774519,0.668835,0.620474,0.625315


[I 2025-03-15 10:09:55,428] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.0036190768273279663, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7243,1.971602,0.517874,0.134124,0.148997,0.126171
2,1.6653,1.520055,0.630614,0.302513,0.289018,0.274311
3,1.1815,1.219994,0.692942,0.357937,0.357595,0.349498
4,0.7418,1.052037,0.728689,0.545501,0.472771,0.490236
5,0.4561,1.118812,0.744271,0.616112,0.532568,0.548717
6,0.3002,1.125622,0.76077,0.640225,0.585694,0.596757
7,0.1586,1.301105,0.76077,0.659336,0.58707,0.610821
8,0.0752,1.280868,0.789184,0.69926,0.634003,0.653976
9,0.0442,1.430164,0.777269,0.69694,0.615832,0.637708
10,0.0306,1.340058,0.785518,0.671932,0.643548,0.643635


[I 2025-03-15 10:12:23,144] Trial 26 finished with value: 0.6573180097729155 and parameters: {'learning_rate': 0.0036190768273279663, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 3}. Best is trial 13 with value: 0.6634038842579802.


Trial 27 with params: {'learning_rate': 0.0035907128489555563, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1129,2.228058,0.434464,0.093874,0.111715,0.094559
2,1.8823,1.67584,0.563703,0.196257,0.204236,0.187044
3,1.3686,1.368391,0.666361,0.383579,0.339444,0.340657
4,0.9203,1.157368,0.725023,0.472441,0.423859,0.433303
5,0.5829,1.136327,0.746104,0.497429,0.466061,0.462756
6,0.3664,1.244094,0.735105,0.560394,0.501941,0.510458
7,0.2139,1.38664,0.749771,0.601193,0.533979,0.555026
8,0.1223,1.401547,0.762603,0.613405,0.574739,0.581502
9,0.0893,1.458025,0.765353,0.624338,0.586713,0.590815
10,0.0437,1.544074,0.768103,0.611961,0.587566,0.58816


[I 2025-03-15 10:14:00,253] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.003923558397956021, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7276,1.993542,0.514207,0.134875,0.154996,0.130079
2,1.6164,1.491174,0.643446,0.300176,0.294832,0.280804
3,1.1431,1.161484,0.708524,0.428558,0.38353,0.384215
4,0.7198,1.067185,0.736022,0.518263,0.484064,0.477481
5,0.4285,1.056981,0.753437,0.569582,0.528898,0.530951
6,0.2435,1.164257,0.761687,0.656619,0.575636,0.594388
7,0.1343,1.265847,0.771769,0.644791,0.595626,0.606295
8,0.0661,1.33121,0.777269,0.634994,0.594951,0.606063
9,0.0541,1.314449,0.783685,0.657851,0.619996,0.621825
10,0.0343,1.316426,0.781852,0.641653,0.620983,0.618329


[I 2025-03-15 10:16:28,781] Trial 28 finished with value: 0.6551136227386704 and parameters: {'learning_rate': 0.003923558397956021, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 3}. Best is trial 13 with value: 0.6634038842579802.


Trial 29 with params: {'learning_rate': 0.00042236241563950157, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4068,2.981815,0.206233,0.018253,0.031636,0.019185
2,2.6986,2.431978,0.411549,0.062028,0.088634,0.057531
3,2.2583,2.098728,0.472961,0.097236,0.120398,0.092804
4,1.9615,1.848448,0.527039,0.119237,0.153415,0.127193
5,1.7188,1.679079,0.582951,0.199448,0.201781,0.186679


[I 2025-03-15 10:16:53,400] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.004950079986516717, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6201,1.925979,0.51604,0.164203,0.173243,0.156602
2,1.5739,1.383891,0.652612,0.29438,0.297341,0.28493
3,1.0785,1.151551,0.71769,0.425774,0.407932,0.403112
4,0.6313,1.006225,0.756187,0.585517,0.532349,0.543912
5,0.3479,1.067366,0.776352,0.646735,0.575355,0.590383
6,0.2156,1.086447,0.788268,0.668277,0.615716,0.624215
7,0.114,1.183501,0.785518,0.674961,0.637035,0.640389
8,0.0497,1.274766,0.780018,0.685391,0.634649,0.642441
9,0.0197,1.330472,0.796517,0.697726,0.653539,0.661934
10,0.0109,1.410565,0.794684,0.698331,0.652603,0.658139


[I 2025-03-15 10:19:23,862] Trial 30 finished with value: 0.6473074087703958 and parameters: {'learning_rate': 0.004950079986516717, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 2}. Best is trial 13 with value: 0.6634038842579802.


Trial 31 with params: {'learning_rate': 0.0036788425482949683, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6971,1.975996,0.512374,0.134404,0.157072,0.133071
2,1.6055,1.447182,0.655362,0.311131,0.30633,0.29554
3,1.1272,1.18728,0.701192,0.407106,0.378374,0.370326
4,0.7224,1.070071,0.738772,0.532928,0.489845,0.497324
5,0.4283,1.014739,0.76352,0.608826,0.571669,0.568661
6,0.2397,1.182029,0.756187,0.6101,0.5291,0.552315
7,0.144,1.189407,0.769019,0.652616,0.607389,0.614557
8,0.0565,1.287912,0.782768,0.646809,0.620843,0.614942
9,0.0429,1.335428,0.774519,0.639305,0.622008,0.619514
10,0.0236,1.38502,0.782768,0.646632,0.614997,0.612544


[I 2025-03-15 10:21:49,338] Trial 31 finished with value: 0.6493525553755619 and parameters: {'learning_rate': 0.0036788425482949683, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3}. Best is trial 13 with value: 0.6634038842579802.


Trial 32 with params: {'learning_rate': 0.0014670123496554556, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0587,2.377501,0.414299,0.086435,0.097542,0.075903
2,2.0226,1.818751,0.548121,0.169622,0.185875,0.162541
3,1.5378,1.405384,0.653529,0.314455,0.282463,0.277436
4,1.1168,1.247088,0.68286,0.430653,0.37091,0.374501
5,0.8401,1.15565,0.714024,0.461383,0.421848,0.425376
6,0.6224,1.100499,0.721357,0.503152,0.465705,0.468051
7,0.4478,1.146885,0.735105,0.525112,0.433086,0.455059
8,0.3312,1.133073,0.751604,0.649925,0.570812,0.588072
9,0.2189,1.17456,0.764436,0.663795,0.593855,0.610832
10,0.1474,1.311686,0.742438,0.61725,0.582135,0.579336


[I 2025-03-15 10:23:27,763] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 1.2161047690501456e-05, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8973,3.879499,0.165903,0.005374,0.023653,0.008381
2,3.8646,3.849076,0.176902,0.003538,0.02,0.006012
3,3.8344,3.81571,0.176902,0.003538,0.02,0.006012
4,3.7953,3.775221,0.176902,0.003538,0.02,0.006012
5,3.757,3.724165,0.176902,0.003538,0.02,0.006012
6,3.6945,3.661227,0.176902,0.003538,0.02,0.006012
7,3.624,3.584752,0.176902,0.003538,0.02,0.006012
8,3.549,3.497572,0.176902,0.003538,0.02,0.006012
9,3.4616,3.408413,0.176902,0.003538,0.02,0.006012
10,3.374,3.328702,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 10:25:05,522] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.004498255185801358, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8974,2.112244,0.461962,0.130293,0.125312,0.109406
2,1.7386,1.625124,0.597617,0.225195,0.239224,0.217377
3,1.2364,1.171324,0.703025,0.385186,0.371417,0.364604
4,0.7347,1.093443,0.727773,0.46316,0.426738,0.429692
5,0.4612,1.067588,0.75802,0.608082,0.534172,0.535551
6,0.2588,1.1404,0.754354,0.650833,0.60978,0.609146
7,0.1262,1.208418,0.780018,0.661225,0.617565,0.623166
8,0.0598,1.331983,0.778185,0.659631,0.612929,0.611121
9,0.0413,1.441162,0.758937,0.643233,0.585677,0.595971
10,0.0373,1.369822,0.780018,0.650968,0.589727,0.604541


[I 2025-03-15 10:27:29,584] Trial 34 finished with value: 0.6572631211720212 and parameters: {'learning_rate': 0.004498255185801358, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 3}. Best is trial 13 with value: 0.6634038842579802.


Trial 35 with params: {'learning_rate': 0.002808771000788496, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9503,2.213175,0.455545,0.099625,0.120731,0.101608
2,1.8547,1.672663,0.575619,0.23986,0.226255,0.209416
3,1.3597,1.27641,0.681027,0.343666,0.339996,0.330899
4,0.9234,1.164816,0.705775,0.404785,0.388601,0.386639
5,0.6258,1.119442,0.739688,0.486482,0.45015,0.453709
6,0.4142,1.145497,0.754354,0.597676,0.547617,0.553549
7,0.2585,1.1722,0.765353,0.624989,0.552215,0.571875
8,0.1508,1.302304,0.762603,0.675496,0.569085,0.59972
9,0.0857,1.37919,0.769019,0.646447,0.592886,0.600782
10,0.0469,1.324645,0.780018,0.602906,0.628702,0.607323


[I 2025-03-15 10:29:15,333] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.001173177966493162, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0333,2.400553,0.406966,0.089454,0.093928,0.069982
2,2.0513,1.864251,0.537122,0.159753,0.170767,0.148574
3,1.5966,1.436992,0.63978,0.271537,0.265287,0.253475
4,1.1852,1.261579,0.678277,0.440721,0.351715,0.363982
5,0.9187,1.18469,0.704858,0.407376,0.377924,0.373562
6,0.717,1.143719,0.705775,0.439824,0.431301,0.421051
7,0.5359,1.177636,0.72594,0.506887,0.421111,0.442035
8,0.4436,1.135091,0.730522,0.586114,0.502935,0.521678
9,0.306,1.152613,0.730522,0.600931,0.528053,0.548608
10,0.2344,1.1954,0.740605,0.588843,0.560855,0.567441


[I 2025-03-15 10:30:00,642] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.004202166087142448, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8401,2.070999,0.482126,0.135641,0.135507,0.123072
2,1.7105,1.523271,0.612282,0.262264,0.255389,0.238438
3,1.1924,1.176612,0.701192,0.378208,0.351426,0.347727
4,0.7559,1.074416,0.733272,0.470728,0.434081,0.433755
5,0.4528,1.062991,0.759853,0.577151,0.521808,0.529664
6,0.2661,1.14612,0.769019,0.670986,0.595254,0.613998
7,0.1538,1.251599,0.780018,0.654719,0.611551,0.615673
8,0.069,1.312276,0.765353,0.688122,0.615125,0.63256
9,0.0364,1.427118,0.780935,0.660581,0.636095,0.634986
10,0.0268,1.498286,0.776352,0.660275,0.609598,0.618543


[I 2025-03-15 10:30:49,018] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 1.162626851313962e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8965,3.87882,0.168653,0.005415,0.023964,0.008463
2,3.8623,3.843514,0.176902,0.003538,0.02,0.006012
3,3.8222,3.793273,0.176902,0.003538,0.02,0.006012
4,3.756,3.713931,0.176902,0.003538,0.02,0.006012
5,3.6629,3.579274,0.176902,0.003538,0.02,0.006012
6,3.503,3.422344,0.176902,0.003538,0.02,0.006012
7,3.3623,3.30851,0.176902,0.003538,0.02,0.006012
8,3.2889,3.24642,0.176902,0.003538,0.02,0.006012
9,3.2456,3.210076,0.176902,0.003538,0.02,0.006012
10,3.2101,3.183092,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 10:32:25,073] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 2.744905812550546e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8803,3.838086,0.176902,0.003538,0.02,0.006012
2,3.7844,3.710993,0.176902,0.003538,0.02,0.006012
3,3.5932,3.428979,0.176902,0.003538,0.02,0.006012
4,3.3001,3.216631,0.176902,0.003538,0.02,0.006012
5,3.2178,3.149077,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 10:32:48,580] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.00015460941865464952, 'weight_decay': 0.006, 'adam_beta1': 0.99, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6195,3.23874,0.176902,0.003538,0.02,0.006012
2,3.2309,3.10914,0.296059,0.041684,0.053405,0.029611
3,3.0524,2.950392,0.232814,0.04942,0.039822,0.029837
4,2.8552,2.785614,0.330889,0.038408,0.062414,0.041669
5,2.7521,2.652324,0.341888,0.026188,0.066129,0.036662
6,2.5767,2.512161,0.377635,0.052732,0.079366,0.054752
7,2.4506,2.396564,0.427131,0.067008,0.095819,0.067572
8,2.3444,2.295419,0.431714,0.070354,0.097372,0.069746
9,2.2403,2.196162,0.458295,0.086597,0.110059,0.087721
10,2.1434,2.124208,0.472044,0.116025,0.119669,0.096523


[I 2025-03-15 10:33:34,417] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 2.180526226355731e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8842,3.850977,0.176902,0.003538,0.02,0.006012
2,3.814,3.767064,0.176902,0.003538,0.02,0.006012
3,3.6968,3.592859,0.176902,0.003538,0.02,0.006012
4,3.4611,3.336684,0.176902,0.003538,0.02,0.006012
5,3.2954,3.209079,0.176902,0.003538,0.02,0.006012
6,3.1923,3.156038,0.176902,0.003538,0.02,0.006012
7,3.1468,3.114551,0.176902,0.003538,0.02,0.006012
8,3.1213,3.072838,0.176902,0.003541,0.02,0.006017
9,3.0736,3.027232,0.2044,0.036845,0.027169,0.016233
10,3.0175,2.983699,0.303391,0.032921,0.053519,0.037212


[I 2025-03-15 10:34:23,023] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.0022199683893738276, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9366,2.211679,0.442713,0.105217,0.110902,0.094171
2,1.8759,1.677852,0.575619,0.248569,0.217744,0.204897
3,1.377,1.339225,0.67736,0.33774,0.332836,0.319341
4,0.9623,1.198912,0.712191,0.464439,0.395373,0.410304
5,0.6862,1.076707,0.742438,0.520314,0.467664,0.477246
6,0.4731,1.07672,0.751604,0.589047,0.523319,0.528947
7,0.314,1.138887,0.757104,0.65692,0.541837,0.573914
8,0.1936,1.24486,0.759853,0.673156,0.581423,0.594478
9,0.1079,1.231978,0.768103,0.68491,0.595821,0.620612
10,0.0632,1.342986,0.758937,0.622043,0.607343,0.589854


[I 2025-03-15 10:36:48,396] Trial 42 finished with value: 0.6684987437203125 and parameters: {'learning_rate': 0.0022199683893738276, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 2}. Best is trial 42 with value: 0.6684987437203125.


Trial 43 with params: {'learning_rate': 0.0031906114982619793, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8165,2.10472,0.472961,0.133134,0.129658,0.110733
2,1.7476,1.531677,0.618698,0.275332,0.269813,0.25511
3,1.2612,1.220806,0.697525,0.381803,0.355681,0.354375
4,0.8387,1.089503,0.722273,0.496261,0.444075,0.448063
5,0.5209,1.033191,0.75527,0.53562,0.520101,0.509118
6,0.3231,1.186007,0.754354,0.66113,0.570551,0.597185
7,0.2133,1.21931,0.762603,0.658062,0.561894,0.590603
8,0.1383,1.289846,0.761687,0.64018,0.575159,0.584044
9,0.0699,1.28708,0.779102,0.675259,0.619025,0.633497
10,0.0267,1.359889,0.781852,0.676951,0.639832,0.644243


[I 2025-03-15 10:38:31,313] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.00446260336530848, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6529,1.982313,0.507791,0.165263,0.161836,0.143697
2,1.6054,1.467992,0.63428,0.298388,0.289243,0.283046
3,1.0821,1.125092,0.719523,0.412569,0.391149,0.386968
4,0.6915,1.034454,0.738772,0.563912,0.486921,0.499437
5,0.3704,0.998317,0.769019,0.615916,0.561418,0.572888
6,0.1906,1.125665,0.787351,0.698891,0.629304,0.645264
7,0.1009,1.291173,0.769936,0.665238,0.608875,0.619676
8,0.0672,1.33232,0.786434,0.759276,0.650529,0.678984
9,0.0338,1.440898,0.787351,0.685285,0.629209,0.644049
10,0.0231,1.400149,0.794684,0.723634,0.667145,0.681455


[I 2025-03-15 10:40:55,012] Trial 44 finished with value: 0.6830693405840152 and parameters: {'learning_rate': 0.00446260336530848, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 44 with value: 0.6830693405840152.


Trial 45 with params: {'learning_rate': 0.004693546493886514, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7471,2.008055,0.497709,0.125454,0.145182,0.115002
2,1.6362,1.475837,0.627864,0.288341,0.291907,0.27785
3,1.1142,1.148596,0.706691,0.378197,0.380382,0.367003
4,0.7108,1.069064,0.729606,0.509869,0.463332,0.472446
5,0.404,1.050612,0.766269,0.596047,0.551381,0.555853
6,0.2072,1.180572,0.764436,0.680439,0.628796,0.631714
7,0.123,1.201459,0.767186,0.672877,0.623548,0.630669
8,0.0544,1.255827,0.787351,0.680806,0.647725,0.650674
9,0.0321,1.460495,0.779102,0.710456,0.622833,0.649953
10,0.0246,1.347429,0.79835,0.716858,0.674734,0.682794


[I 2025-03-15 10:43:20,926] Trial 45 finished with value: 0.7063463805279003 and parameters: {'learning_rate': 0.004693546493886514, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 46 with params: {'learning_rate': 0.0042205343874207575, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6413,2.009057,0.502291,0.176511,0.151097,0.129748
2,1.623,1.427798,0.651696,0.310431,0.298109,0.292881
3,1.0904,1.103771,0.716774,0.38969,0.395146,0.386673
4,0.6846,1.027703,0.733272,0.549131,0.492995,0.507287
5,0.3744,0.969822,0.775435,0.658611,0.605978,0.613684
6,0.1947,1.095164,0.776352,0.706292,0.641115,0.658397
7,0.1144,1.150254,0.787351,0.701659,0.64927,0.663192
8,0.0493,1.307291,0.790101,0.671063,0.645229,0.643742
9,0.0239,1.390137,0.794684,0.679795,0.649596,0.653674
10,0.0191,1.418196,0.79835,0.669749,0.65094,0.64861


[I 2025-03-15 10:46:05,435] Trial 46 finished with value: 0.6922543225371284 and parameters: {'learning_rate': 0.0042205343874207575, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 47 with params: {'learning_rate': 0.0027620499728312226, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.811,2.067997,0.488543,0.14274,0.137216,0.12121
2,1.7395,1.584941,0.621448,0.277827,0.261169,0.249165
3,1.269,1.220322,0.692942,0.382281,0.349553,0.349592
4,0.8552,1.060688,0.72319,0.520827,0.429811,0.445438
5,0.5478,1.094297,0.739688,0.572518,0.493903,0.502343
6,0.37,1.106312,0.743355,0.667114,0.549832,0.570036
7,0.2257,1.208146,0.773602,0.642311,0.544016,0.571166
8,0.1332,1.245543,0.770852,0.685941,0.598353,0.620593
9,0.0741,1.300753,0.774519,0.673652,0.621901,0.631487
10,0.048,1.410049,0.770852,0.67507,0.608721,0.621679


[I 2025-03-15 10:48:33,320] Trial 47 finished with value: 0.673361762799432 and parameters: {'learning_rate': 0.0027620499728312226, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 48 with params: {'learning_rate': 0.003235840249919905, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.733,1.970926,0.507791,0.125011,0.142835,0.122884
2,1.6373,1.517293,0.64528,0.304379,0.298727,0.282759
3,1.1347,1.178862,0.696609,0.402501,0.370518,0.369457
4,0.7402,1.056724,0.741522,0.497742,0.467806,0.471128
5,0.4572,1.061082,0.753437,0.579759,0.495102,0.515968
6,0.2836,1.101994,0.766269,0.653672,0.558816,0.577368
7,0.1432,1.303323,0.769936,0.657446,0.570195,0.587952
8,0.0803,1.206835,0.786434,0.673963,0.625101,0.632577
9,0.0499,1.261811,0.779102,0.683726,0.612852,0.628681
10,0.0423,1.363479,0.770852,0.67297,0.634973,0.638622


[I 2025-03-15 10:50:12,829] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.004812290828879115, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6935,2.087961,0.487626,0.111796,0.136529,0.117851
2,1.7991,1.663016,0.584785,0.209346,0.228062,0.21398
3,1.3882,1.390368,0.664528,0.338887,0.325039,0.323031
4,1.0216,1.300663,0.681027,0.360919,0.35186,0.348928
5,0.6902,1.201926,0.732356,0.476782,0.443174,0.445895
6,0.475,1.258763,0.736022,0.486114,0.439161,0.444875
7,0.2743,1.296922,0.76077,0.572252,0.558889,0.555897
8,0.1513,1.463067,0.774519,0.625661,0.597811,0.595798
9,0.0778,1.597326,0.780935,0.683215,0.636288,0.64446
10,0.0521,1.787339,0.778185,0.61458,0.599216,0.590557


[I 2025-03-15 10:52:38,418] Trial 49 finished with value: 0.6411098197323015 and parameters: {'learning_rate': 0.004812290828879115, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


Trial 50 with params: {'learning_rate': 0.0028121506137260203, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8126,2.062046,0.491292,0.137395,0.13834,0.121131
2,1.7401,1.593455,0.617782,0.280035,0.260682,0.250806
3,1.2658,1.198902,0.692942,0.412062,0.364868,0.369848
4,0.846,1.057738,0.731439,0.501664,0.432458,0.445167
5,0.5467,1.06337,0.747021,0.605424,0.500873,0.519769
6,0.3637,1.103459,0.747938,0.606746,0.561733,0.559002
7,0.2163,1.274373,0.757104,0.63435,0.554796,0.575713
8,0.1364,1.23726,0.771769,0.656806,0.601864,0.614581
9,0.0713,1.356754,0.764436,0.650566,0.594725,0.608391
10,0.0491,1.390758,0.753437,0.624805,0.606152,0.597716


[I 2025-03-15 10:54:13,860] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.0034601125039944586, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7009,2.010129,0.510541,0.138314,0.160733,0.13627
2,1.6754,1.496096,0.63428,0.277814,0.28464,0.271591
3,1.1346,1.16086,0.708524,0.414572,0.381991,0.379698
4,0.7336,1.128441,0.711274,0.480426,0.451588,0.444135
5,0.4621,1.013082,0.759853,0.588631,0.526872,0.534504
6,0.2725,1.132768,0.762603,0.634196,0.569204,0.578641
7,0.1648,1.26611,0.771769,0.665305,0.571015,0.59713
8,0.093,1.15175,0.791017,0.665466,0.63568,0.630685
9,0.0594,1.30899,0.771769,0.670921,0.618494,0.62648
10,0.0301,1.291427,0.791934,0.670304,0.636009,0.638999


[I 2025-03-15 10:56:39,192] Trial 51 finished with value: 0.6564561102946271 and parameters: {'learning_rate': 0.0034601125039944586, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 52 with params: {'learning_rate': 0.004161235253061019, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6263,1.976073,0.520623,0.176616,0.165731,0.150753
2,1.5872,1.437047,0.638863,0.304824,0.304183,0.29451
3,1.0791,1.089943,0.715857,0.38503,0.38436,0.376396
4,0.6626,1.055345,0.72319,0.509347,0.451796,0.465532
5,0.3824,1.043748,0.768103,0.654901,0.554696,0.583284
6,0.2221,1.112886,0.769936,0.641548,0.59085,0.597866
7,0.1247,1.237701,0.777269,0.682809,0.589475,0.6142
8,0.0582,1.29272,0.791934,0.689307,0.610331,0.628272
9,0.033,1.314514,0.794684,0.678792,0.627656,0.6432
10,0.0233,1.379144,0.79835,0.692018,0.64003,0.651784


[I 2025-03-15 10:58:15,653] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.004655862139629419, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7398,2.003862,0.497709,0.126924,0.147144,0.115262
2,1.6492,1.448773,0.636114,0.319547,0.291396,0.282155
3,1.1438,1.172887,0.700275,0.384629,0.374218,0.36834
4,0.7384,1.060228,0.735105,0.49931,0.455006,0.463112
5,0.4169,1.036773,0.761687,0.593149,0.581052,0.571901
6,0.2317,1.158319,0.753437,0.640131,0.581317,0.59109
7,0.1291,1.182741,0.777269,0.702872,0.614725,0.645369
8,0.0534,1.256455,0.791017,0.689153,0.611889,0.632041
9,0.029,1.218386,0.794684,0.695774,0.648343,0.659618
10,0.0076,1.299657,0.805683,0.690081,0.652109,0.659092


[I 2025-03-15 11:00:39,304] Trial 53 finished with value: 0.6724419074568008 and parameters: {'learning_rate': 0.004655862139629419, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 54 with params: {'learning_rate': 0.0015337235123643216, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.954,2.275911,0.426214,0.068825,0.101205,0.07883
2,1.9589,1.76677,0.553621,0.18564,0.186262,0.164323
3,1.4885,1.344158,0.670027,0.297876,0.309533,0.29414
4,1.0788,1.218298,0.690192,0.423446,0.370441,0.3737
5,0.8168,1.117183,0.718607,0.439323,0.389449,0.396404


[I 2025-03-15 11:01:04,994] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 0.004048147341586717, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6194,1.98513,0.507791,0.182771,0.164889,0.145062
2,1.5821,1.491147,0.64253,0.322065,0.289736,0.287328
3,1.0988,1.169939,0.692942,0.390633,0.37153,0.368491
4,0.6897,1.04365,0.731439,0.551114,0.485409,0.49312
5,0.3935,1.032809,0.764436,0.600985,0.539915,0.547152
6,0.2282,1.064638,0.781852,0.641398,0.584783,0.595366
7,0.112,1.182981,0.79835,0.689852,0.619976,0.626322
8,0.0478,1.260095,0.792851,0.672102,0.643306,0.644399
9,0.0341,1.317017,0.794684,0.691283,0.640484,0.651067
10,0.0187,1.293167,0.792851,0.686437,0.639664,0.647601


[I 2025-03-15 11:03:31,133] Trial 55 finished with value: 0.6737357145796721 and parameters: {'learning_rate': 0.004048147341586717, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 56 with params: {'learning_rate': 0.0048761667857604215, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7704,1.996424,0.499542,0.131576,0.145643,0.12111
2,1.6407,1.476916,0.627864,0.264831,0.279193,0.25788
3,1.121,1.139904,0.712191,0.408649,0.394494,0.385999
4,0.7059,1.106215,0.716774,0.492204,0.448569,0.457824
5,0.3939,1.045322,0.748854,0.577789,0.564215,0.558609
6,0.2134,1.19827,0.76077,0.674216,0.6394,0.630642
7,0.1336,1.190325,0.778185,0.670359,0.6435,0.642741
8,0.0565,1.277086,0.780018,0.705822,0.63437,0.6514
9,0.0343,1.355982,0.784601,0.712166,0.647529,0.664112
10,0.009,1.385277,0.7956,0.674141,0.658806,0.653738


[I 2025-03-15 11:06:06,470] Trial 56 finished with value: 0.694264487229523 and parameters: {'learning_rate': 0.0048761667857604215, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 57 with params: {'learning_rate': 0.003576079698140823, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7309,1.954808,0.506874,0.127551,0.145354,0.119066
2,1.6622,1.51834,0.624198,0.259591,0.273092,0.248715
3,1.1741,1.161383,0.708524,0.416339,0.378691,0.380502
4,0.7694,1.056899,0.735105,0.526181,0.457045,0.47158
5,0.4846,1.004373,0.759853,0.565055,0.501387,0.519524
6,0.3041,1.062491,0.758937,0.631377,0.606093,0.604157
7,0.165,1.215414,0.75527,0.649062,0.591647,0.603512
8,0.0897,1.255584,0.777269,0.645145,0.57202,0.590896
9,0.0521,1.229431,0.792851,0.67016,0.646119,0.650153
10,0.0238,1.299832,0.785518,0.668824,0.637279,0.641886


[I 2025-03-15 11:07:42,709] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.004972005147239882, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7768,2.007968,0.502291,0.166102,0.147631,0.123502
2,1.6279,1.483784,0.616865,0.267081,0.27013,0.245983
3,1.1132,1.129275,0.721357,0.417652,0.400685,0.393882
4,0.6822,1.083211,0.726856,0.482246,0.43682,0.443268
5,0.3796,1.058998,0.764436,0.6506,0.547604,0.57823
6,0.2086,1.136821,0.761687,0.694612,0.623974,0.635059
7,0.1003,1.172798,0.776352,0.668693,0.629124,0.628467
8,0.0359,1.353685,0.781852,0.735372,0.64526,0.658532
9,0.0189,1.301939,0.791017,0.736055,0.653441,0.671725
10,0.022,1.42277,0.781852,0.710247,0.629725,0.638285


[I 2025-03-15 11:10:15,693] Trial 58 finished with value: 0.688031218657222 and parameters: {'learning_rate': 0.004972005147239882, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 59 with params: {'learning_rate': 0.0002938083556090815, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4099,3.029434,0.197067,0.018319,0.028,0.016529
2,2.7981,2.587389,0.379468,0.037603,0.079069,0.04978
3,2.4325,2.268549,0.44363,0.066361,0.103763,0.075871
4,2.1518,2.027228,0.485793,0.097027,0.128395,0.103149
5,1.9462,1.872532,0.546288,0.138944,0.164709,0.14437


[I 2025-03-15 11:10:40,664] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.004599142558964877, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7333,2.025227,0.495875,0.123182,0.146736,0.114265
2,1.6421,1.43199,0.637947,0.315446,0.287943,0.2784
3,1.115,1.135143,0.718607,0.422836,0.392171,0.387933
4,0.7022,1.030751,0.748854,0.556429,0.487463,0.504624
5,0.3905,1.072023,0.75527,0.552195,0.501442,0.507965
6,0.2149,1.145037,0.76077,0.64169,0.591199,0.602176
7,0.0984,1.195834,0.780018,0.670554,0.641642,0.639908
8,0.0453,1.260698,0.772686,0.679617,0.621756,0.6334
9,0.0195,1.279914,0.791017,0.674862,0.637678,0.645807
10,0.0109,1.41654,0.794684,0.685412,0.636685,0.642803


[I 2025-03-15 11:12:20,280] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.0005255415183698524, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.99, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4185,3.005034,0.217232,0.021829,0.035874,0.017534
2,2.7936,2.616017,0.36022,0.040667,0.071968,0.046866
3,2.4501,2.311714,0.444546,0.06774,0.10462,0.077396
4,2.1933,2.109982,0.473877,0.104062,0.123627,0.104358
5,2.0068,1.932713,0.512374,0.135089,0.145869,0.124042
6,1.8349,1.840118,0.531622,0.161095,0.165381,0.1458
7,1.6977,1.711446,0.558203,0.179782,0.182891,0.169337
8,1.5758,1.627323,0.584785,0.235961,0.203571,0.192357
9,1.45,1.536212,0.628781,0.263055,0.245826,0.237051
10,1.3144,1.435688,0.652612,0.286543,0.276431,0.270073


[I 2025-03-15 11:13:08,651] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.004219587206180257, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.639,2.00221,0.508708,0.178365,0.155848,0.137035
2,1.6143,1.417707,0.654445,0.315695,0.303652,0.299594
3,1.0957,1.116432,0.715857,0.401251,0.388497,0.384733
4,0.691,1.034049,0.736022,0.495324,0.462997,0.471785
5,0.3861,0.985158,0.769936,0.649369,0.596511,0.596618
6,0.222,1.097199,0.779102,0.70975,0.615391,0.642563
7,0.1045,1.137866,0.7956,0.709239,0.629568,0.65118
8,0.0566,1.16645,0.786434,0.686181,0.661398,0.657181
9,0.0289,1.19336,0.80385,0.709818,0.664527,0.668859
10,0.009,1.315378,0.804766,0.711406,0.664226,0.676737


[I 2025-03-15 11:15:36,195] Trial 62 finished with value: 0.6874435594737384 and parameters: {'learning_rate': 0.004219587206180257, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 63 with params: {'learning_rate': 0.004587195068818531, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7337,2.01815,0.505041,0.124702,0.148098,0.115326
2,1.6506,1.4842,0.633364,0.301569,0.287381,0.276101
3,1.1394,1.138112,0.709441,0.404069,0.377087,0.372548
4,0.705,1.073587,0.72319,0.484454,0.439628,0.445076
5,0.4095,1.081363,0.743355,0.602444,0.540493,0.548352
6,0.2323,1.12967,0.767186,0.673313,0.621853,0.624644
7,0.1058,1.2185,0.773602,0.684435,0.633417,0.63887
8,0.0455,1.302765,0.791934,0.695004,0.642261,0.654197
9,0.0272,1.283791,0.793767,0.673247,0.64877,0.648753
10,0.0132,1.402506,0.788268,0.666119,0.64,0.641725


[I 2025-03-15 11:17:10,606] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 1.6488779238415115e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8917,3.86657,0.176902,0.003538,0.02,0.006012
2,3.8401,3.808048,0.176902,0.003538,0.02,0.006012
3,3.7631,3.697543,0.176902,0.003538,0.02,0.006012
4,3.6008,3.491711,0.176902,0.003538,0.02,0.006012
5,3.4116,3.298414,0.176902,0.003538,0.02,0.006012
6,3.2599,3.216775,0.176902,0.003538,0.02,0.006012
7,3.2022,3.172623,0.176902,0.003538,0.02,0.006012
8,3.1792,3.140635,0.176902,0.003538,0.02,0.006012
9,3.1507,3.111925,0.176902,0.003538,0.02,0.006012
10,3.1153,3.081209,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 11:18:48,788] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.004191458310489591, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5522,1.94826,0.52154,0.174794,0.168445,0.15009
2,1.572,1.37651,0.659028,0.319718,0.301163,0.285576
3,1.0916,1.131038,0.712191,0.42456,0.390976,0.391433
4,0.7002,1.007297,0.738772,0.484496,0.47268,0.462323
5,0.3942,1.005809,0.752521,0.58856,0.525847,0.541997
6,0.2185,1.115684,0.76352,0.663149,0.586773,0.60117
7,0.1201,1.219148,0.781852,0.671923,0.624723,0.635911
8,0.0503,1.326648,0.780935,0.670092,0.610183,0.620485
9,0.0324,1.459389,0.785518,0.724473,0.640604,0.665046
10,0.0173,1.447704,0.792851,0.667641,0.635286,0.637806


[I 2025-03-15 11:21:16,737] Trial 65 finished with value: 0.6725581080660935 and parameters: {'learning_rate': 0.004191458310489591, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


Trial 66 with params: {'learning_rate': 0.004621271672252057, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6436,1.978197,0.493126,0.173066,0.156198,0.143091
2,1.5702,1.470097,0.637947,0.334727,0.285941,0.27908
3,1.0512,1.124523,0.718607,0.433127,0.393082,0.393009
4,0.6239,0.99932,0.756187,0.608583,0.521489,0.541623
5,0.3402,1.03191,0.759853,0.64717,0.584124,0.597666
6,0.1817,1.137007,0.780018,0.686009,0.58963,0.614529
7,0.0887,1.154813,0.802016,0.740272,0.672502,0.687688
8,0.0445,1.244041,0.783685,0.69524,0.649995,0.654022
9,0.0242,1.203651,0.79835,0.751983,0.675597,0.697569
10,0.0082,1.325035,0.79835,0.721788,0.670032,0.677219


[I 2025-03-15 11:23:46,134] Trial 66 finished with value: 0.6843052440964152 and parameters: {'learning_rate': 0.004621271672252057, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 45 with value: 0.7063463805279003.


Trial 67 with params: {'learning_rate': 0.003935499666314333, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7975,2.048577,0.472961,0.126297,0.128592,0.105344
2,1.6491,1.526074,0.623281,0.2924,0.269593,0.261846
3,1.1498,1.182802,0.705775,0.418568,0.374252,0.372836
4,0.7329,1.111918,0.72044,0.472796,0.458883,0.453817
5,0.4522,1.08849,0.756187,0.58734,0.496128,0.514715
6,0.2818,1.170735,0.753437,0.619826,0.565382,0.571089
7,0.1477,1.292475,0.767186,0.63837,0.57281,0.590856
8,0.0911,1.265814,0.780018,0.639011,0.602766,0.604524
9,0.0467,1.353039,0.780935,0.626471,0.608564,0.605434
10,0.0182,1.424036,0.782768,0.637069,0.621926,0.615003


[I 2025-03-15 11:24:35,209] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.0016922008475512538, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9414,2.251145,0.432631,0.118352,0.10386,0.086173
2,1.921,1.724224,0.562786,0.243347,0.194663,0.182924
3,1.4289,1.310253,0.679193,0.325056,0.313023,0.303141
4,1.0241,1.208412,0.701192,0.437716,0.389524,0.394324
5,0.7605,1.086381,0.727773,0.470901,0.427841,0.432919
6,0.5374,1.055884,0.730522,0.537305,0.487241,0.490139
7,0.3605,1.17318,0.737855,0.582318,0.457849,0.488259
8,0.259,1.182585,0.741522,0.613023,0.524535,0.542882
9,0.1709,1.195454,0.743355,0.638774,0.549508,0.568704
10,0.1069,1.315109,0.748854,0.648888,0.578892,0.596608


[I 2025-03-15 11:25:22,129] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.0024127986062345056, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8113,2.072264,0.485793,0.134441,0.133233,0.111345
2,1.7609,1.574099,0.626031,0.273061,0.273036,0.258677
3,1.2534,1.206417,0.691109,0.386821,0.331701,0.33522
4,0.8648,1.068295,0.732356,0.507923,0.442292,0.447778
5,0.574,1.005606,0.75527,0.612116,0.514601,0.539551
6,0.3708,1.172415,0.739688,0.60447,0.519565,0.528075
7,0.2382,1.246238,0.753437,0.680707,0.55162,0.590064
8,0.1487,1.2288,0.76352,0.670774,0.574368,0.598374
9,0.0974,1.226138,0.775435,0.670497,0.630346,0.630376
10,0.0493,1.30874,0.773602,0.678477,0.634652,0.633455


[I 2025-03-15 11:27:49,139] Trial 69 finished with value: 0.671980491366742 and parameters: {'learning_rate': 0.0024127986062345056, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 45 with value: 0.7063463805279003.


Trial 70 with params: {'learning_rate': 5.44745444986144e-05, 'weight_decay': 0.001, 'adam_beta1': 0.99, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8435,3.747354,0.176902,0.003538,0.02,0.006012
2,3.6014,3.418635,0.176902,0.003538,0.02,0.006012
3,3.2985,3.227197,0.176902,0.003538,0.02,0.006012
4,3.2152,3.176625,0.176902,0.003538,0.02,0.006012
5,3.1781,3.097197,0.228231,0.02285,0.034989,0.019705
6,3.085,3.039439,0.187901,0.019807,0.023826,0.012246
7,3.0194,2.979611,0.216315,0.056579,0.034107,0.025218
8,2.9658,2.902244,0.333639,0.050589,0.064002,0.049311
9,2.8798,2.844018,0.353804,0.038446,0.067905,0.042286
10,2.816,2.789917,0.350137,0.026739,0.066872,0.038133


[I 2025-03-15 11:29:30,652] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.004907829002882974, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.768,2.007273,0.496792,0.143187,0.14399,0.118442
2,1.6208,1.462395,0.627864,0.297818,0.286863,0.269323
3,1.1123,1.132387,0.712191,0.38887,0.393024,0.382257
4,0.6877,1.105919,0.730522,0.529945,0.45932,0.476726
5,0.3897,1.038112,0.773602,0.608531,0.582391,0.584484
6,0.1963,1.205044,0.761687,0.673884,0.614956,0.628757
7,0.1112,1.262974,0.779102,0.722063,0.663429,0.678338
8,0.0497,1.384657,0.769019,0.648849,0.632809,0.627866
9,0.0367,1.34393,0.776352,0.67882,0.655709,0.649161
10,0.014,1.411914,0.788268,0.661818,0.657427,0.646613


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-15 11:31:43,508] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.004081166220460519, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6255,1.988606,0.515124,0.184768,0.160968,0.143751
2,1.5883,1.418163,0.653529,0.31785,0.316542,0.308528
3,1.0798,1.104838,0.72044,0.424074,0.396734,0.394113
4,0.6769,1.048933,0.736939,0.531905,0.498544,0.49835
5,0.3652,1.017147,0.768103,0.63749,0.563508,0.576285
6,0.1972,1.147141,0.778185,0.663665,0.609789,0.612749
7,0.1057,1.232292,0.784601,0.673886,0.602191,0.624813
8,0.0592,1.284177,0.784601,0.678069,0.618408,0.633863
9,0.0279,1.348324,0.79835,0.731545,0.647976,0.670217
10,0.0106,1.358567,0.805683,0.710571,0.660278,0.672127


[I 2025-03-15 11:34:12,376] Trial 72 finished with value: 0.67466637364978 and parameters: {'learning_rate': 0.004081166220460519, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 73 with params: {'learning_rate': 4.81823673316245e-05, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8423,3.737549,0.176902,0.003538,0.02,0.006012
2,3.5214,3.269752,0.176902,0.003538,0.02,0.006012
3,3.2014,3.126932,0.176902,0.003538,0.02,0.006012
4,3.089,3.019292,0.222731,0.02778,0.032243,0.022488
5,3.0071,2.915634,0.295142,0.049655,0.051486,0.035608
6,2.8764,2.824597,0.332722,0.034464,0.062886,0.03872
7,2.7889,2.753312,0.338222,0.035283,0.064563,0.040648
8,2.731,2.685697,0.35472,0.037029,0.071163,0.046087
9,2.6698,2.63545,0.36297,0.037035,0.073507,0.047727
10,2.611,2.597606,0.372136,0.038645,0.077752,0.050189


[I 2025-03-15 11:35:03,208] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.0025582497840849327, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8064,2.107639,0.47846,0.139244,0.131712,0.114656
2,1.765,1.590726,0.604033,0.270294,0.242059,0.237959
3,1.2858,1.262246,0.673694,0.366483,0.328396,0.324242
4,0.8822,1.09668,0.721357,0.540056,0.452376,0.467361
5,0.6126,1.014427,0.754354,0.570769,0.518033,0.530131
6,0.3923,1.068894,0.745188,0.607911,0.520249,0.540583
7,0.2419,1.203367,0.748854,0.647745,0.54719,0.577767
8,0.1394,1.295446,0.768103,0.65103,0.57267,0.584519
9,0.0809,1.28901,0.773602,0.693127,0.620067,0.638526
10,0.0551,1.352673,0.76352,0.636841,0.610652,0.608008


[I 2025-03-15 11:35:52,277] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 0.0025244426144617137, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7574,2.039215,0.496792,0.124958,0.141227,0.118818
2,1.717,1.518333,0.635197,0.259119,0.279412,0.2568
3,1.1948,1.216667,0.683776,0.363493,0.337141,0.33404
4,0.8295,1.098555,0.732356,0.538299,0.458742,0.478448
5,0.5423,1.036093,0.76077,0.594323,0.516701,0.52441
6,0.3504,1.086735,0.754354,0.580731,0.513354,0.524163
7,0.2241,1.220371,0.756187,0.656201,0.557265,0.589036
8,0.1382,1.243067,0.765353,0.646675,0.610622,0.611406
9,0.0753,1.352096,0.75527,0.668438,0.589459,0.606078
10,0.0588,1.319685,0.779102,0.694136,0.614338,0.639686


[I 2025-03-15 11:38:21,827] Trial 75 finished with value: 0.6672486105480107 and parameters: {'learning_rate': 0.0025244426144617137, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 45 with value: 0.7063463805279003.


Trial 76 with params: {'learning_rate': 0.004371290245858929, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6485,2.036927,0.489459,0.171441,0.161136,0.147539
2,1.5782,1.404212,0.658112,0.333496,0.304374,0.302466
3,1.0423,1.128299,0.725023,0.444861,0.431165,0.424275
4,0.6498,1.023967,0.737855,0.536134,0.49977,0.501025
5,0.3565,1.09671,0.749771,0.604965,0.527277,0.543022
6,0.1976,1.251723,0.758937,0.699302,0.605323,0.63365
7,0.1152,1.202707,0.776352,0.701435,0.634452,0.648773
8,0.0557,1.299979,0.784601,0.700054,0.63716,0.647786
9,0.0271,1.297296,0.791934,0.698772,0.65477,0.663627
10,0.015,1.397684,0.789184,0.694904,0.637856,0.650093


[I 2025-03-15 11:40:57,374] Trial 76 finished with value: 0.6668109630653086 and parameters: {'learning_rate': 0.004371290245858929, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 45 with value: 0.7063463805279003.


Trial 77 with params: {'learning_rate': 0.00022309840089248744, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.576,3.265531,0.176902,0.003538,0.02,0.006012
2,3.1549,3.00612,0.245646,0.052849,0.044744,0.029708
3,2.9115,2.796508,0.36297,0.043037,0.071794,0.048801
4,2.6926,2.624916,0.351054,0.038055,0.069215,0.03983
5,2.5574,2.445514,0.401467,0.056349,0.086937,0.061995


[I 2025-03-15 11:41:22,200] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 0.00016778463957892002, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5978,3.168915,0.176902,0.003538,0.02,0.006012
2,3.0296,2.854913,0.324473,0.028244,0.059587,0.037227
3,2.7378,2.601469,0.378552,0.038754,0.079374,0.050815
4,2.4958,2.400144,0.401467,0.062905,0.084495,0.058061
5,2.3294,2.217533,0.454629,0.080085,0.107014,0.082641


[I 2025-03-15 11:41:50,018] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.004783493217024564, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7516,2.006475,0.503208,0.135081,0.143071,0.113455
2,1.6229,1.462121,0.617782,0.30044,0.281459,0.261049
3,1.1185,1.139382,0.715857,0.392595,0.401813,0.387681
4,0.6992,1.081414,0.71769,0.509153,0.453258,0.463868
5,0.3956,1.080009,0.752521,0.579746,0.543605,0.546191
6,0.2248,1.093767,0.774519,0.649542,0.595684,0.609477
7,0.1266,1.300879,0.773602,0.693012,0.616619,0.6333
8,0.0485,1.359537,0.779102,0.705961,0.639448,0.651137
9,0.028,1.353575,0.780018,0.67033,0.622405,0.63487
10,0.0137,1.428111,0.786434,0.65789,0.64928,0.640547


[I 2025-03-15 11:44:23,886] Trial 79 finished with value: 0.6723461568506022 and parameters: {'learning_rate': 0.004783493217024564, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 80 with params: {'learning_rate': 0.004438355735864292, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6251,1.947631,0.501375,0.169754,0.169951,0.155233
2,1.5198,1.45078,0.64528,0.325556,0.297057,0.288072
3,1.0241,1.102531,0.729606,0.409411,0.41229,0.395503
4,0.6276,1.045867,0.745188,0.560854,0.503954,0.517488
5,0.3461,1.05955,0.770852,0.663596,0.577112,0.591736
6,0.2071,1.093386,0.777269,0.711,0.623941,0.646525
7,0.106,1.22851,0.785518,0.661328,0.604278,0.615886
8,0.0558,1.278749,0.785518,0.680872,0.649144,0.651205
9,0.0363,1.369265,0.784601,0.684662,0.638062,0.64953
10,0.0183,1.315226,0.790101,0.686136,0.615939,0.631414


[I 2025-03-15 11:45:14,155] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.004359103511338755, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6445,1.993476,0.517874,0.169666,0.16413,0.14473
2,1.6212,1.478746,0.640697,0.316037,0.28699,0.281401
3,1.1194,1.220374,0.698442,0.404714,0.383685,0.373696
4,0.6913,1.071425,0.731439,0.510978,0.46883,0.474491
5,0.4122,1.055397,0.749771,0.586656,0.520578,0.530451
6,0.2315,1.13215,0.766269,0.64107,0.609027,0.61304
7,0.1262,1.281813,0.783685,0.673168,0.630865,0.637789
8,0.0596,1.374584,0.773602,0.709887,0.64409,0.657147
9,0.031,1.354982,0.791934,0.737602,0.643209,0.673897
10,0.0207,1.523307,0.773602,0.664207,0.605508,0.617145


[I 2025-03-15 11:46:56,327] Trial 81 pruned. 


Trial 82 with params: {'learning_rate': 0.003622058398317237, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6752,1.944445,0.514207,0.161399,0.160088,0.141493
2,1.6039,1.44548,0.654445,0.31314,0.291715,0.285654
3,1.1102,1.118819,0.708524,0.398831,0.368583,0.362414
4,0.6969,1.04788,0.729606,0.537181,0.507209,0.501079
5,0.4305,1.03541,0.747938,0.568,0.527284,0.532487
6,0.2499,1.084289,0.754354,0.646447,0.593865,0.60003
7,0.1367,1.263939,0.752521,0.659815,0.588119,0.602433
8,0.0714,1.218871,0.773602,0.664259,0.621547,0.626931
9,0.0388,1.420225,0.765353,0.618543,0.578688,0.588408
10,0.0268,1.370801,0.773602,0.649969,0.607713,0.61807


[I 2025-03-15 11:49:26,522] Trial 82 finished with value: 0.666249687462892 and parameters: {'learning_rate': 0.003622058398317237, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


Trial 83 with params: {'learning_rate': 0.004123293698958752, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6255,1.984881,0.506874,0.16874,0.15592,0.135737
2,1.5797,1.439687,0.644363,0.318058,0.310855,0.30513
3,1.0705,1.114003,0.705775,0.395888,0.386487,0.378019
4,0.6622,0.995875,0.744271,0.522576,0.484358,0.48872
5,0.3744,0.995134,0.767186,0.625368,0.567777,0.571107
6,0.2208,1.091887,0.777269,0.681593,0.63049,0.643208
7,0.1112,1.160905,0.797434,0.687316,0.64352,0.651569
8,0.0423,1.212605,0.787351,0.666477,0.619699,0.631189
9,0.0333,1.31994,0.788268,0.704048,0.630304,0.647328
10,0.0298,1.414852,0.780935,0.680591,0.609223,0.628214


[I 2025-03-15 11:51:10,210] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 0.00035402800746304916, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3992,3.031965,0.2044,0.019054,0.030909,0.018967
2,2.8081,2.591084,0.36297,0.038203,0.073387,0.045902
3,2.4162,2.235425,0.44088,0.066449,0.100366,0.074663
4,2.1142,1.995994,0.497709,0.098888,0.136153,0.112272
5,1.8863,1.83078,0.55912,0.1614,0.177641,0.153575


[I 2025-03-15 11:51:52,975] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.002298943120377134, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7903,2.07206,0.483043,0.13239,0.132427,0.112506
2,1.7575,1.584264,0.611366,0.265953,0.252332,0.245733
3,1.2776,1.267332,0.67736,0.352921,0.327665,0.325214
4,0.8897,1.097467,0.727773,0.512277,0.438981,0.453382
5,0.6147,1.02115,0.747938,0.541067,0.498432,0.508939
6,0.4165,1.097381,0.731439,0.621748,0.527725,0.538193
7,0.2634,1.230897,0.753437,0.663857,0.558592,0.586105
8,0.1626,1.226808,0.769019,0.682378,0.592359,0.617166
9,0.0946,1.26103,0.770852,0.642628,0.616214,0.610859
10,0.0678,1.306553,0.773602,0.660014,0.623311,0.627703


[I 2025-03-15 11:52:41,815] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 0.0026530564028649476, 'weight_decay': 0.005, 'adam_beta1': 0.98, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8545,2.227153,0.450962,0.081893,0.113664,0.08871
2,1.9498,1.775885,0.55912,0.174962,0.199851,0.172318
3,1.5559,1.528434,0.638863,0.285866,0.273108,0.265898
4,1.2043,1.318037,0.681943,0.328978,0.341491,0.326477
5,0.862,1.249269,0.709441,0.409234,0.386874,0.389241
6,0.6263,1.199872,0.71769,0.509534,0.429964,0.446294
7,0.4154,1.227633,0.742438,0.540634,0.502388,0.509155
8,0.263,1.272703,0.754354,0.604091,0.541639,0.555768
9,0.1581,1.461892,0.744271,0.616929,0.56379,0.574407
10,0.0937,1.570063,0.754354,0.641757,0.584531,0.59719


[I 2025-03-15 11:53:29,430] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.0015951275477887948, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8677,2.211309,0.450962,0.087207,0.113419,0.09323
2,1.906,1.740937,0.55637,0.210624,0.182035,0.164214
3,1.4501,1.341375,0.662695,0.311101,0.300717,0.288781
4,1.065,1.224125,0.692026,0.42854,0.374824,0.377674
5,0.7908,1.111555,0.729606,0.433214,0.410547,0.412673
6,0.5809,1.08187,0.725023,0.498231,0.458522,0.464662
7,0.4085,1.163234,0.735105,0.589311,0.476491,0.50273
8,0.2791,1.149486,0.744271,0.592239,0.527301,0.53757
9,0.1774,1.252714,0.740605,0.602281,0.545826,0.559877
10,0.1198,1.320512,0.747021,0.621297,0.600072,0.598967


[I 2025-03-15 11:55:05,719] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 0.004046513561944527, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6207,1.990595,0.512374,0.186746,0.163924,0.145722
2,1.5821,1.420117,0.652612,0.347323,0.302619,0.301869
3,1.0868,1.138095,0.713107,0.393022,0.389577,0.383666
4,0.6937,1.067083,0.716774,0.479304,0.450072,0.453656
5,0.3969,1.037107,0.762603,0.622156,0.555079,0.568998
6,0.227,1.132959,0.774519,0.71507,0.623063,0.643741
7,0.1153,1.21209,0.779102,0.725486,0.605236,0.638817
8,0.0511,1.295279,0.785518,0.697132,0.633678,0.645642
9,0.0372,1.275372,0.780018,0.676551,0.641937,0.64224
10,0.0133,1.358218,0.790101,0.692208,0.656398,0.662151


[I 2025-03-15 11:57:39,926] Trial 88 finished with value: 0.6897905643254393 and parameters: {'learning_rate': 0.004046513561944527, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 89 with params: {'learning_rate': 0.0049310047640707564, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7746,2.009837,0.497709,0.13195,0.146817,0.119784
2,1.6364,1.495482,0.617782,0.291001,0.283057,0.269505
3,1.1168,1.148588,0.712191,0.392249,0.393124,0.385553
4,0.6898,1.026233,0.727773,0.490496,0.446591,0.448232
5,0.3776,1.059259,0.751604,0.551371,0.53036,0.528376
6,0.1963,1.143771,0.765353,0.66121,0.58622,0.602647
7,0.1191,1.271441,0.781852,0.70239,0.610981,0.635655
8,0.0647,1.287715,0.785518,0.679316,0.63822,0.645015
9,0.036,1.280738,0.783685,0.70893,0.637147,0.657741
10,0.0184,1.463322,0.779102,0.687987,0.627574,0.639837


[I 2025-03-15 12:00:05,945] Trial 89 finished with value: 0.6661443871716736 and parameters: {'learning_rate': 0.0049310047640707564, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 90 with params: {'learning_rate': 0.0024455680105008697, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8114,2.115949,0.48121,0.125529,0.132039,0.109881
2,1.7759,1.58566,0.6022,0.275372,0.242069,0.240247
3,1.3074,1.255033,0.688359,0.377947,0.334564,0.332332
4,0.9081,1.09084,0.715857,0.513218,0.429158,0.444299
5,0.6226,1.0324,0.750687,0.537678,0.493732,0.501257
6,0.4185,1.079028,0.747021,0.602992,0.537434,0.551162
7,0.2716,1.213544,0.754354,0.63315,0.531685,0.553797
8,0.1708,1.202792,0.766269,0.681328,0.584571,0.604506
9,0.098,1.282765,0.771769,0.659582,0.594416,0.611876
10,0.0614,1.314222,0.764436,0.662186,0.632852,0.628042


[I 2025-03-15 12:00:55,884] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.003756446836091649, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6552,1.969514,0.52429,0.154476,0.164305,0.14163
2,1.6359,1.42471,0.664528,0.330178,0.310764,0.30524
3,1.1146,1.136845,0.71494,0.393813,0.383877,0.377159
4,0.7218,1.059422,0.722273,0.536699,0.472197,0.486612
5,0.4219,1.014785,0.770852,0.627541,0.542503,0.56317
6,0.2417,1.094582,0.765353,0.661479,0.594735,0.604237
7,0.1218,1.219996,0.784601,0.709544,0.608902,0.63556
8,0.0648,1.296314,0.783685,0.701068,0.622286,0.638443
9,0.0391,1.268316,0.786434,0.680703,0.652151,0.655824
10,0.018,1.370807,0.785518,0.692118,0.647999,0.653447


[I 2025-03-15 12:03:26,842] Trial 91 finished with value: 0.6697459122235678 and parameters: {'learning_rate': 0.003756446836091649, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 92 with params: {'learning_rate': 0.003419172044318328, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6824,1.98126,0.516957,0.142118,0.162242,0.139272
2,1.6443,1.408098,0.660862,0.352838,0.29489,0.289281
3,1.1365,1.163277,0.703941,0.391627,0.376863,0.368888
4,0.7379,1.03894,0.735105,0.518602,0.47101,0.478882
5,0.4438,1.015904,0.750687,0.594661,0.51335,0.529374
6,0.275,1.098363,0.749771,0.639688,0.57674,0.580786
7,0.1398,1.166571,0.776352,0.656509,0.590439,0.607053
8,0.0715,1.24211,0.791934,0.696083,0.63216,0.650365
9,0.0458,1.326145,0.780018,0.708161,0.644842,0.655062
10,0.0368,1.357871,0.784601,0.69847,0.627343,0.640437


[I 2025-03-15 12:05:00,643] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 0.0027984324326431743, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8391,2.067158,0.484876,0.122312,0.133272,0.111734
2,1.7324,1.530807,0.621448,0.279714,0.269748,0.258259
3,1.2145,1.200787,0.692942,0.422202,0.35021,0.358284
4,0.8387,1.125491,0.714024,0.496714,0.427373,0.446058
5,0.5533,1.06315,0.75527,0.539279,0.478608,0.487216


[I 2025-03-15 12:05:26,308] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.004499643753494019, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6574,1.988463,0.515124,0.186387,0.164942,0.148235
2,1.6017,1.453969,0.654445,0.319107,0.303553,0.297207
3,1.0937,1.128989,0.713107,0.412445,0.389266,0.388944
4,0.6985,1.063862,0.719523,0.560837,0.489436,0.499612
5,0.3832,1.026808,0.762603,0.6331,0.54557,0.56278
6,0.213,1.08185,0.783685,0.675418,0.61894,0.628451
7,0.1123,1.161327,0.792851,0.753939,0.642215,0.678859
8,0.0475,1.211635,0.773602,0.665328,0.650958,0.637612
9,0.0279,1.245626,0.796517,0.728214,0.651322,0.672274
10,0.0082,1.307853,0.8011,0.709576,0.657531,0.668201


[I 2025-03-15 12:08:06,275] Trial 94 finished with value: 0.6957462825373973 and parameters: {'learning_rate': 0.004499643753494019, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 95 with params: {'learning_rate': 0.0007096725961486922, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1953,2.692621,0.330889,0.036111,0.068708,0.041636
2,2.3349,2.080409,0.492209,0.103717,0.131013,0.104812
3,1.8826,1.72832,0.550871,0.189911,0.169693,0.156388
4,1.5293,1.488599,0.63978,0.284034,0.259903,0.254595
5,1.2604,1.338531,0.661778,0.329766,0.299191,0.295287


[I 2025-03-15 12:08:32,588] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.004376266806261626, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6423,2.032132,0.488543,0.173597,0.159555,0.145849
2,1.5801,1.390657,0.654445,0.338372,0.305491,0.303739
3,1.0414,1.134556,0.700275,0.379357,0.389164,0.374241
4,0.6438,1.076991,0.72594,0.515397,0.481842,0.482601
5,0.3671,1.03894,0.770852,0.628931,0.557331,0.570059
6,0.2104,1.200173,0.771769,0.660975,0.602115,0.615753
7,0.1081,1.197075,0.791934,0.718315,0.642303,0.653542
8,0.0546,1.21766,0.783685,0.697223,0.640082,0.651132
9,0.0252,1.260682,0.791934,0.726105,0.648365,0.670641
10,0.0085,1.379495,0.792851,0.731416,0.639766,0.6692


[I 2025-03-15 12:10:12,493] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 1.0626063505848356e-05, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8981,3.8822,0.111824,0.007623,0.022435,0.009515
2,3.8679,3.852218,0.176902,0.003538,0.02,0.006012
3,3.8356,3.81352,0.176902,0.003538,0.02,0.006012
4,3.7872,3.758817,0.176902,0.003538,0.02,0.006012
5,3.7268,3.672849,0.176902,0.003538,0.02,0.006012
6,3.6168,3.550586,0.176902,0.003538,0.02,0.006012
7,3.482,3.415722,0.176902,0.003538,0.02,0.006012
8,3.3739,3.31588,0.176902,0.003538,0.02,0.006012
9,3.2996,3.257548,0.176902,0.003538,0.02,0.006012
10,3.2498,3.221058,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 12:10:59,705] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.0040943879908314895, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.729,2.006084,0.494042,0.174862,0.152886,0.130801
2,1.6409,1.526869,0.620532,0.260525,0.275414,0.255041
3,1.1366,1.178938,0.707608,0.383809,0.388049,0.375766
4,0.7132,1.090434,0.730522,0.48796,0.456698,0.455082
5,0.4137,1.058499,0.759853,0.62005,0.582153,0.57355
6,0.246,1.1366,0.771769,0.669599,0.580846,0.602575
7,0.1368,1.238026,0.781852,0.672368,0.61241,0.628428
8,0.0612,1.234538,0.786434,0.656334,0.632192,0.617531
9,0.0328,1.375846,0.773602,0.620202,0.601271,0.593231
10,0.0225,1.369985,0.785518,0.655503,0.616084,0.616


[I 2025-03-15 12:11:50,400] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.00016746164512569846, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6158,3.228316,0.176902,0.003538,0.02,0.006012
2,3.1332,3.005228,0.2044,0.038669,0.03064,0.019648
3,2.8858,2.740725,0.329056,0.024874,0.062371,0.034447
4,2.6251,2.521542,0.388634,0.038347,0.081198,0.05185
5,2.4542,2.345482,0.434464,0.06483,0.098177,0.069983
6,2.2616,2.190346,0.458295,0.089093,0.108853,0.083467
7,2.1264,2.130132,0.454629,0.097378,0.110125,0.091861
8,2.0269,2.004963,0.488543,0.102803,0.132477,0.109101
9,1.9185,1.904534,0.507791,0.13418,0.139013,0.118635
10,1.8216,1.85068,0.527956,0.169744,0.158968,0.139766


[I 2025-03-15 12:13:31,387] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.004698538915849571, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7461,1.991863,0.496792,0.125288,0.144988,0.115448
2,1.6528,1.488302,0.632447,0.294679,0.283303,0.271447
3,1.1423,1.173582,0.693859,0.394281,0.370167,0.363432
4,0.7321,1.081545,0.725023,0.512708,0.457316,0.468754
5,0.414,1.023079,0.747938,0.546053,0.517524,0.511377
6,0.2276,1.139388,0.764436,0.708306,0.613782,0.635417
7,0.1115,1.23992,0.76077,0.680062,0.631109,0.63982
8,0.0441,1.292789,0.784601,0.661047,0.639274,0.639153
9,0.0347,1.383139,0.774519,0.663471,0.598175,0.617311
10,0.0188,1.321021,0.791934,0.728845,0.643261,0.667606


[I 2025-03-15 12:16:21,175] Trial 100 finished with value: 0.6628245623526107 and parameters: {'learning_rate': 0.004698538915849571, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 101 with params: {'learning_rate': 0.004599513628875362, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7328,2.028615,0.504125,0.127121,0.149887,0.116249
2,1.6435,1.426443,0.64528,0.333595,0.291827,0.28105
3,1.1187,1.146921,0.709441,0.438916,0.397418,0.393427
4,0.712,1.008837,0.745188,0.551481,0.479799,0.497511
5,0.4041,1.030756,0.75527,0.600767,0.544271,0.546098
6,0.2208,1.140253,0.759853,0.63442,0.563327,0.576791
7,0.1246,1.173185,0.781852,0.693662,0.63351,0.646005
8,0.0488,1.336215,0.778185,0.714282,0.627116,0.649518
9,0.0247,1.321267,0.779102,0.706216,0.637419,0.648491
10,0.022,1.337928,0.788268,0.659309,0.644571,0.633889


[I 2025-03-15 12:18:51,169] Trial 101 finished with value: 0.670480262319323 and parameters: {'learning_rate': 0.004599513628875362, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 102 with params: {'learning_rate': 0.004935362396004043, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7782,1.999005,0.505041,0.126137,0.147939,0.119596
2,1.6407,1.471856,0.633364,0.27855,0.284402,0.269051
3,1.1332,1.159454,0.708524,0.387019,0.386409,0.375975
4,0.6933,1.123559,0.732356,0.494305,0.448593,0.454405
5,0.4006,1.107831,0.744271,0.602228,0.559923,0.560537
6,0.209,1.134248,0.771769,0.650921,0.605743,0.610868
7,0.1025,1.260644,0.779102,0.713596,0.6322,0.658391
8,0.0479,1.390273,0.778185,0.71368,0.648713,0.662077
9,0.0371,1.334083,0.788268,0.696036,0.650028,0.660753
10,0.0244,1.405199,0.776352,0.694267,0.653684,0.661862


[I 2025-03-15 12:21:27,849] Trial 102 finished with value: 0.6803455641250064 and parameters: {'learning_rate': 0.004935362396004043, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 103 with params: {'learning_rate': 1.546855136785054e-05, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8909,3.868233,0.176902,0.003538,0.02,0.006012
2,3.848,3.825982,0.176902,0.003538,0.02,0.006012
3,3.8022,3.770625,0.176902,0.003538,0.02,0.006012
4,3.7323,3.691834,0.176902,0.003538,0.02,0.006012
5,3.6497,3.57822,0.176902,0.003538,0.02,0.006012
6,3.5123,3.438864,0.176902,0.003538,0.02,0.006012
7,3.3722,3.309788,0.176902,0.003538,0.02,0.006012
8,3.2823,3.233778,0.176902,0.003538,0.02,0.006012
9,3.2335,3.195913,0.176902,0.003538,0.02,0.006012
10,3.1959,3.166861,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 12:22:23,152] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.004378397142067045, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5783,1.94138,0.525206,0.175034,0.163354,0.143921
2,1.5797,1.433771,0.638863,0.288342,0.291406,0.275733
3,1.1351,1.139643,0.72319,0.423536,0.38297,0.389981
4,0.7213,1.017325,0.746104,0.539589,0.490651,0.501183
5,0.4186,1.014088,0.764436,0.601203,0.507433,0.528149
6,0.2335,1.123001,0.768103,0.620243,0.584477,0.589389
7,0.1195,1.22032,0.786434,0.699759,0.625243,0.64204
8,0.0631,1.300211,0.781852,0.630264,0.584117,0.591963
9,0.0311,1.385754,0.788268,0.666972,0.597332,0.615681
10,0.0292,1.396206,0.783685,0.688369,0.639982,0.649264


[I 2025-03-15 12:24:52,393] Trial 104 finished with value: 0.6704014981639291 and parameters: {'learning_rate': 0.004378397142067045, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


Trial 105 with params: {'learning_rate': 1.4771448129559609e-05, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8934,3.871855,0.176902,0.003558,0.02,0.006041
2,3.8528,3.832362,0.176902,0.003538,0.02,0.006012
3,3.8107,3.782138,0.176902,0.003538,0.02,0.006012
4,3.7478,3.711859,0.176902,0.003538,0.02,0.006012
5,3.6744,3.610821,0.176902,0.003538,0.02,0.006012
6,3.5503,3.481625,0.176902,0.003538,0.02,0.006012
7,3.4143,3.349291,0.176902,0.003538,0.02,0.006012
8,3.3127,3.256881,0.176902,0.003538,0.02,0.006012
9,3.2502,3.211156,0.176902,0.003538,0.02,0.006012
10,3.2096,3.180583,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 12:25:39,764] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 0.0038254576962951626, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5839,1.941699,0.507791,0.161202,0.162135,0.144087
2,1.5681,1.432685,0.652612,0.316028,0.301073,0.288791
3,1.0881,1.121439,0.706691,0.391546,0.369944,0.3684
4,0.7035,1.058591,0.734189,0.536586,0.479577,0.489936
5,0.4129,1.007675,0.758937,0.659779,0.566926,0.587092
6,0.2367,1.142841,0.75802,0.646286,0.600591,0.609471
7,0.147,1.238531,0.779102,0.675454,0.609815,0.622482
8,0.0662,1.344309,0.774519,0.660789,0.623133,0.625786
9,0.0273,1.41305,0.778185,0.727752,0.660779,0.67079
10,0.023,1.333697,0.780018,0.700119,0.656866,0.662512


[I 2025-03-15 12:28:12,050] Trial 106 finished with value: 0.6718224733240867 and parameters: {'learning_rate': 0.0038254576962951626, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


Trial 107 with params: {'learning_rate': 0.0027482815173207665, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8279,2.13714,0.484876,0.139884,0.138891,0.118962
2,1.7803,1.584759,0.609533,0.260346,0.247425,0.239605
3,1.3037,1.272892,0.676444,0.364185,0.338054,0.334613
4,0.8925,1.109533,0.727773,0.484164,0.416358,0.430918
5,0.5974,1.042187,0.747938,0.515856,0.495983,0.489065
6,0.3905,1.073755,0.750687,0.585553,0.539479,0.542892
7,0.2438,1.288449,0.765353,0.688231,0.557458,0.585407
8,0.153,1.204159,0.777269,0.678058,0.59728,0.619484
9,0.0658,1.328146,0.771769,0.690159,0.612667,0.626215
10,0.053,1.416958,0.759853,0.653307,0.622358,0.616622


[I 2025-03-15 12:30:49,398] Trial 107 finished with value: 0.6597827707850081 and parameters: {'learning_rate': 0.0027482815173207665, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 108 with params: {'learning_rate': 0.0001529301871840071, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.631,3.200088,0.176902,0.003538,0.02,0.006012
2,3.0721,2.900932,0.332722,0.049879,0.061734,0.03989
3,2.7903,2.659437,0.36297,0.037173,0.074381,0.047286
4,2.5562,2.449979,0.4033,0.043138,0.085224,0.056294
5,2.3932,2.278528,0.449129,0.076154,0.104043,0.078354


[I 2025-03-15 12:31:18,277] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.0011125498816045374, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1166,2.541171,0.380385,0.052479,0.081145,0.056835
2,2.2155,1.958448,0.530706,0.180814,0.158686,0.138331
3,1.772,1.640237,0.579285,0.208394,0.198549,0.186303
4,1.3913,1.361116,0.67736,0.341793,0.321766,0.322969
5,1.09,1.283664,0.68011,0.351336,0.339941,0.32691
6,0.8616,1.179123,0.704858,0.462571,0.41679,0.419152
7,0.6689,1.14923,0.706691,0.452561,0.376804,0.394149
8,0.5338,1.136999,0.722273,0.561565,0.476917,0.492765
9,0.3887,1.10998,0.734189,0.528689,0.472578,0.488177
10,0.271,1.14244,0.740605,0.558706,0.548545,0.543776


[I 2025-03-15 12:32:55,921] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.004577363645732319, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7319,2.027637,0.504125,0.144294,0.15045,0.118232
2,1.6496,1.49828,0.631531,0.286127,0.278099,0.263756
3,1.1404,1.158809,0.707608,0.369168,0.36883,0.35732
4,0.714,1.104219,0.718607,0.467111,0.425961,0.433665
5,0.4137,1.037354,0.753437,0.597025,0.533314,0.546924
6,0.2346,1.18229,0.754354,0.660452,0.608062,0.623034
7,0.1215,1.171921,0.785518,0.718966,0.653109,0.670709
8,0.0563,1.238168,0.784601,0.662514,0.639113,0.642908
9,0.027,1.266539,0.785518,0.678379,0.657604,0.658151
10,0.011,1.407666,0.786434,0.704407,0.634524,0.656236


[I 2025-03-15 12:35:25,926] Trial 110 finished with value: 0.6628906270151073 and parameters: {'learning_rate': 0.004577363645732319, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 111 with params: {'learning_rate': 0.00015060199441375714, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6641,3.232409,0.176902,0.003538,0.02,0.006012
2,3.1683,3.046278,0.186068,0.02219,0.023547,0.01205
3,2.944,2.812189,0.346471,0.04702,0.065695,0.038688
4,2.6948,2.598215,0.370302,0.038367,0.075123,0.049042
5,2.5386,2.415592,0.413382,0.062635,0.089192,0.05878
6,2.3354,2.257409,0.450046,0.072734,0.104411,0.077525
7,2.1945,2.174076,0.453712,0.092084,0.108441,0.088782
8,2.0902,2.060163,0.483043,0.101839,0.127962,0.105794
9,1.9841,1.95851,0.493126,0.105429,0.132135,0.11051
10,1.8875,1.903952,0.517874,0.148808,0.149959,0.129247


[I 2025-03-15 12:37:03,568] Trial 111 pruned. 


Trial 112 with params: {'learning_rate': 0.004930401491637061, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7787,1.998243,0.507791,0.130354,0.14906,0.121104
2,1.6463,1.470306,0.633364,0.306858,0.285087,0.269849
3,1.1272,1.149499,0.713107,0.380416,0.391823,0.375409
4,0.7101,1.14545,0.71494,0.510419,0.44064,0.447304
5,0.4039,1.101225,0.756187,0.640509,0.585281,0.592961
6,0.2154,1.180003,0.772686,0.673497,0.622271,0.629061
7,0.1088,1.271929,0.770852,0.688677,0.633958,0.644517
8,0.0576,1.306052,0.784601,0.665433,0.613579,0.624588
9,0.0293,1.373992,0.784601,0.713318,0.666634,0.671456
10,0.015,1.386595,0.789184,0.71768,0.653045,0.672201


[I 2025-03-15 12:39:37,570] Trial 112 finished with value: 0.6777770179630592 and parameters: {'learning_rate': 0.004930401491637061, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 113 with params: {'learning_rate': 0.0007894793537912772, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1747,2.651324,0.355637,0.055243,0.07785,0.048499
2,2.2862,2.022517,0.510541,0.10899,0.140813,0.114958
3,1.8359,1.690765,0.56187,0.225008,0.179084,0.167984
4,1.4672,1.440298,0.638863,0.328168,0.268782,0.267937
5,1.1858,1.306678,0.673694,0.376949,0.32427,0.321663
6,0.9597,1.184284,0.689276,0.415228,0.380882,0.381332
7,0.7772,1.196219,0.706691,0.46847,0.369956,0.390983
8,0.6631,1.13359,0.715857,0.481906,0.417374,0.427356
9,0.526,1.074445,0.718607,0.510337,0.439929,0.456314
10,0.4163,1.127487,0.715857,0.477338,0.462521,0.46319


[I 2025-03-15 12:41:23,541] Trial 113 pruned. 


Trial 114 with params: {'learning_rate': 4.0648800446916785e-05, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8573,3.787249,0.176902,0.003538,0.02,0.006012
2,3.6764,3.520954,0.176902,0.003538,0.02,0.006012
3,3.3598,3.220643,0.176902,0.003538,0.02,0.006012
4,3.1795,3.134573,0.176902,0.003538,0.02,0.006012
5,3.1398,3.064902,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 12:41:48,381] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.0017927430972615225, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9567,2.256494,0.428964,0.069746,0.101816,0.079865
2,1.932,1.741885,0.557287,0.23257,0.201056,0.185716
3,1.4594,1.340052,0.669111,0.304301,0.307956,0.296176
4,1.0359,1.193151,0.692942,0.437368,0.381982,0.38993
5,0.7634,1.078715,0.734189,0.451733,0.426538,0.429228
6,0.5321,1.094274,0.735105,0.556803,0.480725,0.499466
7,0.3644,1.140443,0.745188,0.635875,0.496195,0.526766
8,0.2474,1.201703,0.75527,0.642257,0.565775,0.580223
9,0.1712,1.204475,0.757104,0.654605,0.566969,0.59051
10,0.1096,1.335727,0.747021,0.613441,0.576708,0.572934


[I 2025-03-15 12:44:19,870] Trial 115 finished with value: 0.6488425291340052 and parameters: {'learning_rate': 0.0017927430972615225, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 116 with params: {'learning_rate': 0.004900796715879104, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.771,1.991009,0.504125,0.129618,0.146343,0.119026
2,1.646,1.469444,0.633364,0.29688,0.284839,0.272748
3,1.1271,1.152038,0.706691,0.394209,0.38488,0.375951
4,0.7103,1.084908,0.72319,0.502081,0.448587,0.457142
5,0.395,1.047174,0.750687,0.583884,0.570816,0.554768
6,0.2218,1.18523,0.762603,0.636586,0.595143,0.597466
7,0.1151,1.211609,0.790101,0.710844,0.633784,0.654082
8,0.0577,1.269523,0.786434,0.697708,0.643634,0.65383
9,0.0289,1.322902,0.780935,0.688818,0.639561,0.652328
10,0.0166,1.389688,0.784601,0.700666,0.634345,0.648816


[I 2025-03-15 12:46:57,653] Trial 116 finished with value: 0.6961633008655267 and parameters: {'learning_rate': 0.004900796715879104, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 117 with params: {'learning_rate': 0.004379967818196696, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6521,1.988565,0.517874,0.163266,0.158198,0.137736
2,1.6258,1.46047,0.637947,0.317697,0.281958,0.273272
3,1.1249,1.194131,0.696609,0.367923,0.369111,0.355998
4,0.7015,1.075193,0.733272,0.509847,0.468674,0.474673
5,0.4015,1.091485,0.766269,0.676867,0.577394,0.598196
6,0.2498,1.166956,0.766269,0.70256,0.606725,0.632612
7,0.1475,1.160164,0.784601,0.690057,0.633873,0.645939
8,0.0671,1.350363,0.762603,0.661743,0.61957,0.617661
9,0.0291,1.383964,0.777269,0.694188,0.634638,0.651564
10,0.0138,1.410655,0.784601,0.705631,0.662705,0.673834


[I 2025-03-15 12:49:25,914] Trial 117 finished with value: 0.6717800221339172 and parameters: {'learning_rate': 0.004379967818196696, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 118 with params: {'learning_rate': 0.004289383498314753, 'weight_decay': 0.006, 'adam_beta1': 0.99, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.871,2.184252,0.474794,0.113082,0.134774,0.112358
2,1.9083,1.737296,0.55912,0.209646,0.202239,0.186504
3,1.5074,1.49437,0.638863,0.29057,0.279726,0.275459
4,1.1352,1.341391,0.67736,0.363546,0.33484,0.335032
5,0.8129,1.305374,0.704858,0.389284,0.391937,0.383191
6,0.582,1.318481,0.719523,0.426035,0.387144,0.393819
7,0.3878,1.394235,0.730522,0.504677,0.465439,0.468121
8,0.2473,1.549507,0.742438,0.555833,0.512405,0.51909
9,0.1271,1.66264,0.751604,0.607832,0.562129,0.569894
10,0.0787,1.839875,0.746104,0.579196,0.57037,0.565513


[I 2025-03-15 12:51:01,539] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.0035124182582184174, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7745,1.974759,0.501375,0.114498,0.137294,0.115133
2,1.6704,1.526983,0.615949,0.260533,0.267457,0.24749
3,1.1765,1.167666,0.704858,0.390883,0.385789,0.37949
4,0.7702,1.105928,0.729606,0.459348,0.417533,0.427205
5,0.4903,1.034939,0.753437,0.559708,0.499779,0.514779
6,0.3075,1.093491,0.75802,0.596259,0.544754,0.550841
7,0.1639,1.183344,0.762603,0.638273,0.597258,0.604498
8,0.078,1.28966,0.765353,0.606801,0.580141,0.574439
9,0.0419,1.344811,0.771769,0.653252,0.61758,0.620311
10,0.0277,1.422369,0.776352,0.664729,0.618366,0.627139


[I 2025-03-15 12:52:45,722] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 0.004733920108509699, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7528,1.992695,0.488543,0.12155,0.141383,0.113648
2,1.6449,1.4821,0.63978,0.300211,0.291722,0.278965
3,1.1382,1.12962,0.711274,0.392096,0.385307,0.37893
4,0.7262,1.059614,0.727773,0.488332,0.45681,0.463234
5,0.4133,1.047926,0.76352,0.564076,0.543177,0.53045
6,0.2366,1.086283,0.772686,0.653976,0.617596,0.621605
7,0.1056,1.256636,0.786434,0.660672,0.625741,0.62662
8,0.0457,1.385317,0.773602,0.63828,0.617121,0.609717
9,0.0307,1.384539,0.770852,0.699659,0.639432,0.654082
10,0.021,1.360389,0.789184,0.67403,0.627997,0.640377


[I 2025-03-15 12:54:24,700] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 0.0038097641056598717, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6859,1.992396,0.503208,0.121445,0.151434,0.12326
2,1.6646,1.531932,0.628781,0.301233,0.280613,0.273448
3,1.1304,1.129596,0.728689,0.41668,0.393985,0.393996
4,0.7325,1.104844,0.714024,0.516358,0.456004,0.464349
5,0.4525,1.00081,0.768103,0.601584,0.53344,0.549079
6,0.2697,1.177489,0.765353,0.642573,0.594169,0.607785
7,0.1449,1.180083,0.775435,0.681526,0.592355,0.61706
8,0.0562,1.312509,0.785518,0.670068,0.627009,0.632895
9,0.0371,1.26662,0.794684,0.70793,0.652037,0.665874
10,0.0165,1.429938,0.800183,0.716097,0.661209,0.6742


[I 2025-03-15 12:56:59,782] Trial 121 finished with value: 0.674861245085289 and parameters: {'learning_rate': 0.0038097641056598717, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 122 with params: {'learning_rate': 0.00469882327309934, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7477,1.99592,0.498625,0.126581,0.144494,0.115809
2,1.6567,1.479614,0.638863,0.298962,0.287496,0.275814
3,1.1434,1.174829,0.710357,0.40374,0.396124,0.385007
4,0.7381,1.137556,0.725023,0.51055,0.453526,0.466641
5,0.4183,1.147986,0.751604,0.620076,0.543466,0.560399
6,0.238,1.184018,0.766269,0.66692,0.615773,0.623531
7,0.1243,1.302726,0.770852,0.682079,0.615642,0.630414
8,0.0544,1.373408,0.780018,0.680583,0.633173,0.63908
9,0.0354,1.354371,0.778185,0.691744,0.636906,0.649491
10,0.0232,1.467089,0.783685,0.6899,0.620517,0.63078


[I 2025-03-15 12:59:31,897] Trial 122 finished with value: 0.663335571202401 and parameters: {'learning_rate': 0.00469882327309934, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 123 with params: {'learning_rate': 0.004989858783112163, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.78,2.003407,0.502291,0.159831,0.150826,0.128615
2,1.6271,1.472621,0.624198,0.27884,0.274121,0.252395
3,1.1138,1.146298,0.706691,0.378015,0.377853,0.365946
4,0.6705,1.074794,0.733272,0.494767,0.449427,0.457247
5,0.3695,1.076048,0.759853,0.633074,0.583206,0.587639
6,0.2003,1.16909,0.774519,0.675787,0.625932,0.62523
7,0.0953,1.228568,0.785518,0.734437,0.643446,0.65786
8,0.036,1.296008,0.784601,0.703643,0.626843,0.638102
9,0.0245,1.404435,0.780018,0.724626,0.627304,0.651207
10,0.0116,1.466158,0.791934,0.736365,0.6441,0.655593


[I 2025-03-15 13:02:07,370] Trial 123 finished with value: 0.6685091785186159 and parameters: {'learning_rate': 0.004989858783112163, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 124 with params: {'learning_rate': 0.0032767762952947825, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7412,2.004579,0.503208,0.145911,0.15067,0.127412
2,1.6658,1.453179,0.650779,0.289977,0.290211,0.280193
3,1.1619,1.171387,0.703025,0.412516,0.380327,0.378111
4,0.7447,1.023779,0.731439,0.483668,0.46091,0.462986
5,0.4677,1.030254,0.749771,0.548328,0.493108,0.499156
6,0.2883,1.201539,0.729606,0.621011,0.542808,0.55645
7,0.1635,1.211161,0.772686,0.672539,0.583602,0.610001
8,0.0816,1.213294,0.777269,0.703484,0.62847,0.641882
9,0.0515,1.317716,0.766269,0.61813,0.621861,0.606958
10,0.0487,1.368488,0.773602,0.664258,0.623336,0.627078


[I 2025-03-15 13:04:39,050] Trial 124 finished with value: 0.6681196556925351 and parameters: {'learning_rate': 0.0032767762952947825, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 125 with params: {'learning_rate': 0.0019945300271932286, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9527,2.253634,0.43538,0.069896,0.103527,0.080644
2,1.9216,1.747144,0.553621,0.210555,0.195193,0.177093
3,1.4531,1.360393,0.665445,0.319758,0.31004,0.301316
4,1.0229,1.233093,0.684693,0.433399,0.370633,0.375559
5,0.7551,1.111707,0.725023,0.481504,0.435374,0.439672
6,0.5338,1.091494,0.733272,0.538109,0.460819,0.478282
7,0.371,1.1334,0.75527,0.58633,0.48004,0.505448
8,0.2549,1.238331,0.746104,0.656101,0.566048,0.585349
9,0.1697,1.187321,0.758937,0.632168,0.581572,0.594859
10,0.098,1.321563,0.75802,0.61682,0.587129,0.588607


[I 2025-03-15 13:05:31,914] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.0015693663764601963, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9755,2.274714,0.428964,0.115213,0.105091,0.088342
2,1.942,1.732885,0.566453,0.213017,0.190598,0.172917
3,1.455,1.343894,0.670027,0.302641,0.304356,0.294346
4,1.0557,1.212472,0.691109,0.443531,0.379336,0.389363
5,0.7788,1.09875,0.724106,0.42463,0.404343,0.402805
6,0.5738,1.164821,0.718607,0.534929,0.469669,0.476647
7,0.4107,1.196373,0.744271,0.573309,0.462404,0.486329
8,0.2932,1.20951,0.72319,0.56271,0.490034,0.509036
9,0.1898,1.203979,0.741522,0.647913,0.557675,0.583646
10,0.1173,1.403811,0.718607,0.594662,0.545976,0.552763


[I 2025-03-15 13:06:21,939] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.004737922867630483, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7563,1.995079,0.508708,0.148623,0.150268,0.128698
2,1.6755,1.55479,0.607699,0.273186,0.252808,0.238741
3,1.1704,1.189412,0.701192,0.407231,0.386061,0.379684
4,0.7398,1.138687,0.714024,0.529771,0.433971,0.459129
5,0.4202,1.103592,0.746104,0.633753,0.565241,0.574949
6,0.2563,1.142332,0.775435,0.676678,0.625289,0.63089
7,0.1178,1.293732,0.772686,0.65967,0.607792,0.613872
8,0.0538,1.401299,0.773602,0.664503,0.61327,0.623746
9,0.0312,1.419418,0.782768,0.662457,0.634026,0.634663
10,0.016,1.531893,0.779102,0.645687,0.627341,0.629531


[I 2025-03-15 13:09:00,470] Trial 127 finished with value: 0.6650192182827627 and parameters: {'learning_rate': 0.004737922867630483, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 128 with params: {'learning_rate': 0.0035483114695468044, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8692,2.102672,0.471127,0.13878,0.129184,0.112349
2,1.7493,1.543992,0.617782,0.258324,0.257507,0.246088
3,1.2465,1.237623,0.691109,0.392929,0.344196,0.345838
4,0.8076,1.081621,0.725023,0.462685,0.429546,0.434233
5,0.5077,1.030719,0.757104,0.605529,0.53108,0.53561
6,0.3147,1.086185,0.768103,0.622878,0.538973,0.550521
7,0.1883,1.238108,0.764436,0.67579,0.559927,0.586141
8,0.0943,1.23702,0.774519,0.654884,0.642229,0.626354
9,0.0571,1.377212,0.774519,0.681667,0.618183,0.628954
10,0.0345,1.306366,0.776352,0.650756,0.617609,0.617823


[I 2025-03-15 13:09:50,464] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.00492882802187531, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.596,1.889061,0.516957,0.184564,0.173696,0.162184
2,1.5381,1.376139,0.660862,0.305696,0.308676,0.294667
3,1.05,1.086166,0.730522,0.414808,0.393066,0.392171
4,0.614,1.009736,0.75527,0.605411,0.527058,0.546931
5,0.3438,1.003797,0.781852,0.705768,0.626854,0.635865
6,0.1869,1.14471,0.784601,0.66616,0.621027,0.623417
7,0.1012,1.256911,0.790101,0.698559,0.61656,0.632943
8,0.0493,1.265342,0.784601,0.661872,0.630015,0.622717
9,0.0245,1.3552,0.783685,0.690398,0.61704,0.633983
10,0.0168,1.352106,0.792851,0.695484,0.633213,0.649729


[I 2025-03-15 13:12:46,761] Trial 129 finished with value: 0.671978400159619 and parameters: {'learning_rate': 0.00492882802187531, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 2}. Best is trial 45 with value: 0.7063463805279003.


Trial 130 with params: {'learning_rate': 7.239391724904791e-05, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8126,3.60766,0.176902,0.003538,0.02,0.006012
2,3.327,3.150076,0.176902,0.003538,0.02,0.006012
3,3.1049,3.001305,0.255729,0.031886,0.040853,0.025286
4,2.9222,2.845073,0.32264,0.036492,0.059785,0.035624
5,2.8246,2.73294,0.351054,0.036642,0.069509,0.045275


[I 2025-03-15 13:13:13,329] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.004875281469428505, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7718,1.992546,0.495875,0.132416,0.142918,0.12032
2,1.6416,1.498714,0.621448,0.28371,0.277514,0.257992
3,1.1265,1.123063,0.715857,0.410221,0.399383,0.388834
4,0.7029,1.105485,0.722273,0.508682,0.449121,0.460046
5,0.3968,1.02051,0.748854,0.556826,0.534357,0.531263
6,0.2036,1.208066,0.773602,0.672213,0.651796,0.645203
7,0.1126,1.215698,0.774519,0.658483,0.618176,0.619682
8,0.0484,1.353181,0.796517,0.709992,0.63063,0.64595
9,0.0302,1.350565,0.788268,0.697559,0.639227,0.653261
10,0.0135,1.452025,0.786434,0.723652,0.645142,0.662233


[I 2025-03-15 13:15:47,007] Trial 131 finished with value: 0.655464831484364 and parameters: {'learning_rate': 0.004875281469428505, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 132 with params: {'learning_rate': 0.0015835457643377092, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9439,2.264873,0.427131,0.077123,0.100544,0.081101
2,1.9422,1.751848,0.554537,0.212425,0.185914,0.169719
3,1.4599,1.348012,0.670027,0.332154,0.313631,0.302048
4,1.0608,1.228294,0.692026,0.404764,0.359244,0.360511
5,0.7907,1.087852,0.729606,0.467906,0.422471,0.430167
6,0.5691,1.08918,0.72594,0.57563,0.493767,0.507812
7,0.3973,1.142802,0.746104,0.571488,0.47365,0.497251
8,0.2843,1.113197,0.750687,0.628061,0.558088,0.572643
9,0.1848,1.209002,0.748854,0.628111,0.567106,0.579974
10,0.1259,1.307987,0.742438,0.598101,0.584123,0.575075


[I 2025-03-15 13:17:25,958] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 0.003493366013803945, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6775,1.968656,0.52429,0.161516,0.165114,0.145862
2,1.6545,1.485119,0.644363,0.296324,0.283815,0.272971
3,1.1373,1.148139,0.710357,0.391719,0.376877,0.372451
4,0.7371,1.112689,0.718607,0.509733,0.451439,0.464042
5,0.4396,1.002539,0.771769,0.606207,0.554402,0.556136
6,0.2655,1.11568,0.758937,0.650248,0.597363,0.603266
7,0.1599,1.205748,0.787351,0.710741,0.618636,0.641112
8,0.0761,1.198614,0.782768,0.657591,0.624256,0.623427
9,0.042,1.291156,0.789184,0.725873,0.635917,0.660505
10,0.0249,1.331774,0.799267,0.67017,0.652954,0.646188


[I 2025-03-15 13:19:55,647] Trial 133 finished with value: 0.6725050242096973 and parameters: {'learning_rate': 0.003493366013803945, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 134 with params: {'learning_rate': 0.004676433965451635, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7434,1.996328,0.499542,0.133688,0.146291,0.116096
2,1.6531,1.471895,0.629698,0.29086,0.278792,0.263037
3,1.1426,1.191286,0.698442,0.414675,0.376155,0.369403
4,0.7445,1.073683,0.721357,0.498841,0.457349,0.461891
5,0.4239,1.034613,0.751604,0.591272,0.539912,0.545911
6,0.2459,1.118163,0.769936,0.68953,0.625004,0.633972
7,0.133,1.219625,0.772686,0.680801,0.616217,0.629127
8,0.0623,1.209908,0.781852,0.665021,0.646707,0.642037
9,0.0224,1.273609,0.789184,0.683272,0.638047,0.647099
10,0.0111,1.334355,0.802016,0.693275,0.659513,0.662194


[I 2025-03-15 13:22:52,826] Trial 134 finished with value: 0.6526871230445599 and parameters: {'learning_rate': 0.004676433965451635, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 135 with params: {'learning_rate': 0.0019818536537312926, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9105,2.220546,0.448213,0.103757,0.11254,0.093562
2,1.8835,1.664756,0.576535,0.23492,0.204994,0.190986
3,1.377,1.28098,0.67736,0.350741,0.324427,0.315868
4,0.9774,1.159221,0.711274,0.43302,0.391476,0.396752
5,0.7003,1.08911,0.728689,0.499486,0.442044,0.452903
6,0.4923,1.07837,0.737855,0.551094,0.476572,0.488937
7,0.3227,1.189924,0.748854,0.63225,0.528472,0.555155
8,0.221,1.171182,0.762603,0.635635,0.563156,0.573013
9,0.1291,1.2461,0.761687,0.612144,0.544371,0.562818
10,0.0955,1.257114,0.765353,0.61938,0.579612,0.583992


[I 2025-03-15 13:23:49,070] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.00488327709777692, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7709,2.007538,0.494959,0.131767,0.144811,0.117581
2,1.6382,1.477835,0.618698,0.28158,0.274193,0.251548
3,1.1354,1.157226,0.705775,0.40362,0.391755,0.383329
4,0.7036,1.081154,0.72594,0.511788,0.464914,0.471571
5,0.4003,1.048226,0.76077,0.616191,0.543964,0.559362
6,0.2074,1.119075,0.764436,0.666113,0.599069,0.610905
7,0.1056,1.274306,0.778185,0.658897,0.633164,0.632352
8,0.0578,1.30084,0.775435,0.690742,0.616971,0.634709
9,0.038,1.265368,0.791934,0.689908,0.661999,0.665242
10,0.0254,1.419694,0.790101,0.680198,0.665259,0.652165


[I 2025-03-15 13:26:18,045] Trial 136 finished with value: 0.6731544767758147 and parameters: {'learning_rate': 0.00488327709777692, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 45 with value: 0.7063463805279003.


Trial 137 with params: {'learning_rate': 0.001957945431120213, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8246,2.161829,0.467461,0.11542,0.125676,0.105246
2,1.8316,1.649973,0.595784,0.275251,0.223859,0.216813
3,1.3355,1.329469,0.671861,0.365207,0.320109,0.316079
4,0.9773,1.157105,0.710357,0.464524,0.399178,0.409875
5,0.6902,1.049052,0.736022,0.513464,0.459399,0.469267


[I 2025-03-15 13:26:42,403] Trial 137 pruned. 


Trial 138 with params: {'learning_rate': 0.004882328381869567, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7696,1.997092,0.500458,0.129131,0.144779,0.118204
2,1.6331,1.461612,0.635197,0.311211,0.292867,0.274206
3,1.1167,1.131586,0.713107,0.380447,0.396627,0.382139
4,0.7121,1.120612,0.724106,0.523813,0.475267,0.481209
5,0.4085,1.050973,0.751604,0.601526,0.554028,0.562984
6,0.2406,1.107113,0.766269,0.685936,0.634971,0.637872
7,0.113,1.227862,0.76077,0.656192,0.606336,0.610267
8,0.0568,1.262671,0.787351,0.69885,0.635219,0.647258
9,0.0342,1.327915,0.794684,0.752105,0.665588,0.686008
10,0.0218,1.391058,0.794684,0.724791,0.630779,0.658195


[I 2025-03-15 13:28:25,466] Trial 138 pruned. 


Trial 139 with params: {'learning_rate': 1.1619982946199605e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8978,3.880602,0.143905,0.005967,0.021166,0.008749
2,3.8653,3.848582,0.176902,0.003538,0.02,0.006012
3,3.831,3.807546,0.176902,0.003538,0.02,0.006012
4,3.7792,3.748745,0.176902,0.003538,0.02,0.006012
5,3.7147,3.657263,0.176902,0.003538,0.02,0.006012
6,3.5987,3.530484,0.176902,0.003538,0.02,0.006012
7,3.4603,3.392405,0.176902,0.003538,0.02,0.006012
8,3.3513,3.292524,0.176902,0.003538,0.02,0.006012
9,3.2787,3.236827,0.176902,0.003538,0.02,0.006012
10,3.2313,3.202632,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 13:29:16,479] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.004886519689952504, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6184,1.932185,0.519707,0.160383,0.168611,0.148239
2,1.5566,1.421702,0.652612,0.310682,0.303831,0.292093
3,1.0754,1.15603,0.71494,0.397901,0.377245,0.37674
4,0.6866,1.042004,0.743355,0.521406,0.46486,0.475285
5,0.3756,1.084328,0.76352,0.584687,0.550112,0.548123
6,0.2105,1.091584,0.779102,0.694976,0.636615,0.650751
7,0.0937,1.290398,0.775435,0.673054,0.612118,0.617373
8,0.0641,1.342402,0.778185,0.677574,0.644697,0.64677
9,0.037,1.333704,0.781852,0.664356,0.629697,0.634022
10,0.015,1.362982,0.791017,0.690742,0.639279,0.648708


[I 2025-03-15 13:31:01,774] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 0.004410266415109446, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6224,1.935166,0.503208,0.168321,0.160832,0.147619
2,1.5213,1.431209,0.659945,0.345556,0.306431,0.29838
3,1.0319,1.106211,0.71769,0.394216,0.391489,0.377579
4,0.6231,1.054891,0.745188,0.525658,0.491301,0.495357
5,0.37,1.010559,0.769019,0.640967,0.580374,0.589408
6,0.195,1.106771,0.775435,0.701333,0.631428,0.647135
7,0.0919,1.249722,0.785518,0.692665,0.626102,0.633102
8,0.0636,1.210166,0.788268,0.675596,0.631078,0.635613
9,0.0299,1.294893,0.793767,0.713368,0.644694,0.663898
10,0.0102,1.313719,0.80385,0.694292,0.661849,0.663508


[I 2025-03-15 13:33:42,744] Trial 141 finished with value: 0.6633875146191716 and parameters: {'learning_rate': 0.004410266415109446, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 45 with value: 0.7063463805279003.


Trial 142 with params: {'learning_rate': 0.004550685279788728, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5597,1.935506,0.51604,0.153305,0.169404,0.146793
2,1.5176,1.383181,0.649863,0.300149,0.3058,0.290305
3,1.024,1.105689,0.71494,0.40269,0.379703,0.371959
4,0.644,1.003513,0.729606,0.499605,0.486783,0.478796
5,0.3668,0.999944,0.771769,0.643579,0.593958,0.605458
6,0.1971,1.127312,0.771769,0.640132,0.609903,0.616067
7,0.104,1.239891,0.784601,0.71158,0.610581,0.636635
8,0.06,1.216932,0.791934,0.667392,0.643878,0.643973
9,0.0276,1.308205,0.793767,0.677084,0.650511,0.653617
10,0.0225,1.49432,0.779102,0.664551,0.643234,0.636693


[I 2025-03-15 13:36:20,612] Trial 142 finished with value: 0.6789188953655726 and parameters: {'learning_rate': 0.004550685279788728, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


Trial 143 with params: {'learning_rate': 0.001635898960934995, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9501,2.255111,0.437214,0.1246,0.106062,0.089689
2,1.9232,1.733209,0.567369,0.243338,0.192873,0.178332
3,1.4265,1.322052,0.675527,0.324881,0.312801,0.305005
4,1.0346,1.193569,0.696609,0.439678,0.38715,0.396611
5,0.7486,1.104438,0.722273,0.440954,0.414496,0.416637
6,0.5438,1.129849,0.719523,0.531236,0.470288,0.474608
7,0.3841,1.171964,0.740605,0.557214,0.477452,0.498019
8,0.2671,1.212256,0.738772,0.619424,0.514112,0.544467
9,0.1736,1.23302,0.743355,0.59336,0.538333,0.544484
10,0.1181,1.330022,0.756187,0.628353,0.578495,0.583698


[I 2025-03-15 13:37:14,634] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 0.004400483600138653, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5395,1.928945,0.525206,0.165906,0.180535,0.158108
2,1.5126,1.433661,0.650779,0.3184,0.314904,0.298948
3,1.0337,1.133874,0.71494,0.425082,0.408538,0.401949
4,0.6581,1.029431,0.742438,0.516228,0.515413,0.504221
5,0.3743,1.058912,0.764436,0.662726,0.596903,0.611584
6,0.2066,1.072924,0.780018,0.652823,0.627426,0.624602
7,0.1002,1.228753,0.783685,0.710182,0.619355,0.646917
8,0.0557,1.276719,0.786434,0.698039,0.646109,0.655769
9,0.0279,1.289714,0.799267,0.719896,0.668425,0.681166
10,0.011,1.302416,0.796517,0.703992,0.66008,0.670077


[I 2025-03-15 13:39:50,914] Trial 144 finished with value: 0.6894933408230783 and parameters: {'learning_rate': 0.004400483600138653, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


Trial 145 with params: {'learning_rate': 0.0022949276006876585, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7337,2.101801,0.48121,0.123394,0.134468,0.113327
2,1.7579,1.603027,0.610449,0.270889,0.252073,0.247125
3,1.2645,1.260126,0.678277,0.386435,0.331321,0.332293
4,0.8921,1.090622,0.722273,0.51907,0.431426,0.447507
5,0.5978,1.027739,0.753437,0.563723,0.455817,0.484107
6,0.4126,1.052373,0.740605,0.591929,0.528945,0.535834
7,0.2605,1.274338,0.751604,0.638082,0.548022,0.568355
8,0.1819,1.151897,0.766269,0.658826,0.616851,0.624267
9,0.11,1.304677,0.750687,0.678792,0.596589,0.615159
10,0.0632,1.315064,0.768103,0.646581,0.622738,0.618508


[I 2025-03-15 13:42:28,049] Trial 145 finished with value: 0.666461345545925 and parameters: {'learning_rate': 0.0022949276006876585, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


Trial 146 with params: {'learning_rate': 0.0022889697101519772, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7412,2.108612,0.48121,0.130039,0.134186,0.114361
2,1.7648,1.602448,0.60495,0.259728,0.247859,0.238706
3,1.2758,1.296556,0.655362,0.352278,0.324141,0.318041
4,0.9143,1.118142,0.716774,0.486555,0.408207,0.421144
5,0.617,1.036282,0.745188,0.53135,0.460761,0.473361
6,0.4172,1.144109,0.728689,0.615569,0.527596,0.54037
7,0.2687,1.199336,0.757104,0.674803,0.565964,0.590299
8,0.1775,1.207604,0.750687,0.615507,0.573424,0.569495
9,0.1049,1.32069,0.747938,0.662425,0.613647,0.619841
10,0.0763,1.446273,0.741522,0.596717,0.569452,0.569339


[I 2025-03-15 13:44:04,554] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 0.0043196653618385135, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6437,1.991803,0.513291,0.165431,0.16033,0.140679
2,1.6033,1.433165,0.647113,0.298433,0.300473,0.289411
3,1.0929,1.140252,0.711274,0.385021,0.393686,0.381536
4,0.6878,1.033208,0.732356,0.534134,0.488316,0.49614
5,0.3704,1.009041,0.769019,0.619698,0.573731,0.581598
6,0.2162,1.090417,0.774519,0.673656,0.608458,0.620344
7,0.1192,1.224636,0.770852,0.604601,0.577743,0.572418
8,0.0524,1.249322,0.784601,0.666143,0.63326,0.636133
9,0.0337,1.313802,0.7956,0.700453,0.653024,0.658779
10,0.0202,1.403015,0.788268,0.687655,0.634891,0.633209


[I 2025-03-15 13:44:53,864] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.002935639351709915, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8523,2.047154,0.484876,0.127981,0.132485,0.109434
2,1.7505,1.578015,0.619615,0.256707,0.260953,0.24477
3,1.2645,1.251011,0.691109,0.370479,0.345837,0.340974
4,0.8796,1.108392,0.719523,0.404599,0.395968,0.392613
5,0.5856,1.063087,0.739688,0.513084,0.466081,0.478874
6,0.3754,1.109674,0.740605,0.57199,0.517911,0.525634
7,0.2281,1.275394,0.738772,0.611113,0.54592,0.559414
8,0.1275,1.321396,0.746104,0.630569,0.594938,0.596063
9,0.0705,1.373526,0.758937,0.64895,0.596238,0.608902
10,0.041,1.395149,0.764436,0.689878,0.632526,0.643305


[I 2025-03-15 13:46:35,009] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.004424007612483057, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5424,1.925231,0.530706,0.184072,0.177954,0.157629
2,1.52,1.448884,0.635197,0.294916,0.301013,0.283389
3,1.0454,1.126776,0.716774,0.409522,0.396076,0.387605
4,0.6402,1.010656,0.744271,0.535237,0.49713,0.500991
5,0.379,1.044941,0.768103,0.644904,0.553565,0.578707
6,0.2022,1.086422,0.791017,0.692517,0.612063,0.634344
7,0.1073,1.264515,0.777269,0.663789,0.611132,0.623494
8,0.0517,1.307805,0.779102,0.648638,0.596894,0.609669
9,0.0378,1.318783,0.785518,0.699646,0.654805,0.66647
10,0.0267,1.301515,0.780018,0.692817,0.63976,0.65002


[I 2025-03-15 13:48:59,825] Trial 149 finished with value: 0.6925547327118019 and parameters: {'learning_rate': 0.004424007612483057, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 45 with value: 0.7063463805279003.


In [49]:
print(best_trial)

BestRun(run_id='45', objective=0.7063463805279003, hyperparameters={'learning_rate': 0.004693546493886514, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}, run_summary=None)


In [50]:
base.reset_seed()

In [51]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_hp-search", remove_unused_columns=False, epochs=num_epochs, batch_size=batch_size)

In [52]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "adam_beta1" : trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
        "lambda_param": trial.suggest_float("lambda_param",0,1,step=.1),
        "temperature": trial.suggest_float("temperature", 2,7, step=.5)
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [53]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [54]:
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=eval_data,
    eval_dataset=test_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [55]:
best_trial2 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Test-destilace",
    n_trials=150
)

[I 2025-03-15 13:49:00,152] A new study created in memory with name: Test-destilace


Trial 0 with params: {'learning_rate': 0.0001025350969016849, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4665,2.428533,0.11,0.002619,0.02381,0.004719
2,2.4188,2.383934,0.11,0.002619,0.02381,0.004719
3,2.356,2.319232,0.11,0.002619,0.02381,0.004719
4,2.2677,2.240144,0.11,0.002619,0.02381,0.004719
5,2.1664,2.188566,0.11,0.002619,0.02381,0.004719
6,2.1283,2.206135,0.11,0.002619,0.02381,0.004719
7,2.1257,2.192723,0.11,0.002619,0.02381,0.004719
8,2.0938,2.15741,0.11,0.002619,0.02381,0.004719
9,2.0737,2.13115,0.11,0.002619,0.02381,0.004719
10,2.0527,2.112982,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 13:50:29,574] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 1.4347159517201392e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4781,2.45793,0.006,0.000143,0.02381,0.000284
2,2.4719,2.452051,0.006,0.000143,0.02381,0.000284
3,2.4662,2.446572,0.104,0.006763,0.037518,0.010427
4,2.4607,2.441354,0.11,0.00264,0.02381,0.004753
5,2.4543,2.436206,0.11,0.002619,0.02381,0.004719
6,2.4478,2.430971,0.11,0.002619,0.02381,0.004719
7,2.4394,2.425689,0.11,0.002619,0.02381,0.004719
8,2.4353,2.420377,0.11,0.002619,0.02381,0.004719
9,2.427,2.414921,0.11,0.002619,0.02381,0.004719
10,2.4242,2.409295,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 13:52:48,363] Trial 1 finished with value: 0.004719004719004719 and parameters: {'learning_rate': 1.4347159517201392e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 7.0}. Best is trial 1 with value: 0.004719004719004719.


Trial 2 with params: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2381,2.110317,0.184,0.033333,0.050149,0.029819
2,1.9131,1.837479,0.436,0.038529,0.097945,0.052628
3,1.7026,1.601798,0.446,0.045188,0.109346,0.06078
4,1.5138,1.482169,0.468,0.07589,0.119373,0.070246
5,1.3843,1.324928,0.536,0.074462,0.134996,0.090895
6,1.283,1.242908,0.556,0.096591,0.142288,0.100648
7,1.1832,1.194882,0.56,0.107265,0.160526,0.116387
8,1.0614,1.184429,0.566,0.124619,0.170904,0.132328
9,0.9712,1.116035,0.59,0.16874,0.204271,0.156017
10,0.8456,1.047175,0.608,0.156194,0.213032,0.163425


[I 2025-03-15 13:55:01,226] Trial 2 finished with value: 0.33802696888160705 and parameters: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}. Best is trial 2 with value: 0.33802696888160705.


Trial 3 with params: {'learning_rate': 0.0001464895513280072, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4514,2.40085,0.11,0.002619,0.02381,0.004719
2,2.3635,2.305907,0.11,0.002619,0.02381,0.004719
3,2.23,2.196046,0.11,0.002619,0.02381,0.004719
4,2.1313,2.20241,0.11,0.002619,0.02381,0.004719
5,2.1116,2.168751,0.11,0.002619,0.02381,0.004719
6,2.0783,2.126848,0.11,0.002619,0.02381,0.004719
7,2.0505,2.097728,0.11,0.002619,0.02381,0.004719
8,2.0177,2.063048,0.11,0.002624,0.02381,0.004728
9,1.9884,2.003593,0.316,0.05949,0.051386,0.03858
10,1.9318,1.9482,0.376,0.043129,0.069779,0.047036


[I 2025-03-15 13:57:20,809] Trial 3 finished with value: 0.0598280546393387 and parameters: {'learning_rate': 0.0001464895513280072, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}. Best is trial 2 with value: 0.33802696888160705.


Trial 4 with params: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4583,2.404075,0.11,0.002619,0.02381,0.004719
2,2.3582,2.284832,0.11,0.002619,0.02381,0.004719
3,2.1892,2.187175,0.11,0.002619,0.02381,0.004719
4,2.1156,2.160493,0.11,0.002619,0.02381,0.004719
5,2.0715,2.113143,0.11,0.002619,0.02381,0.004719
6,2.0363,2.052437,0.11,0.002619,0.02381,0.004719
7,1.9771,1.970946,0.372,0.042303,0.064488,0.043873
8,1.9006,1.903239,0.42,0.035537,0.080255,0.048198
9,1.8565,1.822506,0.396,0.034017,0.074541,0.045592
10,1.7967,1.780495,0.412,0.038189,0.087653,0.052817


[I 2025-03-15 13:58:53,443] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.00043625993625605574, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4367,2.299795,0.11,0.002619,0.02381,0.004719
2,2.1702,2.184024,0.11,0.002619,0.02381,0.004719
3,2.0632,2.07801,0.11,0.002624,0.02381,0.004728
4,1.9567,1.903186,0.392,0.044249,0.071419,0.049963
5,1.819,1.783978,0.42,0.040888,0.078086,0.053046
6,1.7276,1.690301,0.442,0.040861,0.110563,0.05737
7,1.6423,1.608997,0.438,0.041003,0.105272,0.056074
8,1.5495,1.572696,0.452,0.075272,0.106761,0.067315
9,1.5077,1.488538,0.48,0.076628,0.117949,0.078523
10,1.434,1.440841,0.512,0.07043,0.123399,0.08041


[I 2025-03-15 14:01:09,181] Trial 5 finished with value: 0.12441890354325505 and parameters: {'learning_rate': 0.00043625993625605574, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 2 with value: 0.33802696888160705.


Trial 6 with params: {'learning_rate': 6.639623079859457e-05, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4709,2.439915,0.11,0.002624,0.02381,0.004728
2,2.4408,2.414218,0.11,0.002619,0.02381,0.004719
3,2.4083,2.382324,0.11,0.002619,0.02381,0.004719
4,2.3657,2.340319,0.11,0.002619,0.02381,0.004719
5,2.3066,2.287723,0.11,0.002619,0.02381,0.004719
6,2.24,2.231997,0.11,0.002619,0.02381,0.004719
7,2.1713,2.194,0.11,0.002619,0.02381,0.004719
8,2.1247,2.189498,0.11,0.002619,0.02381,0.004719
9,2.1157,2.189164,0.11,0.002619,0.02381,0.004719
10,2.0991,2.170096,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 14:02:41,117] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 1.2382649697023537e-05, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4783,2.458604,0.006,0.000143,0.02381,0.000284
2,2.4731,2.453514,0.006,0.000143,0.02381,0.000284
3,2.4683,2.448607,0.05,0.022924,0.033333,0.013729
4,2.4635,2.443897,0.11,0.003326,0.031313,0.005951
5,2.4577,2.439221,0.11,0.002624,0.02381,0.004728
6,2.4517,2.434395,0.11,0.002619,0.02381,0.004719
7,2.4438,2.429493,0.11,0.002619,0.02381,0.004719
8,2.4402,2.424681,0.11,0.002619,0.02381,0.004719
9,2.4323,2.419592,0.11,0.002619,0.02381,0.004719
10,2.4301,2.414389,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 14:04:56,489] Trial 7 finished with value: 0.004719004719004719 and parameters: {'learning_rate': 1.2382649697023537e-05, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}. Best is trial 2 with value: 0.33802696888160705.


Trial 8 with params: {'learning_rate': 0.00029891977384598987, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4481,2.363546,0.11,0.002619,0.02381,0.004719
2,2.2604,2.188855,0.11,0.002619,0.02381,0.004719
3,2.1392,2.228231,0.11,0.002619,0.02381,0.004719
4,2.1018,2.13652,0.11,0.002619,0.02381,0.004719
5,2.0441,2.09356,0.118,0.018524,0.034392,0.017469
6,2.0148,2.045232,0.182,0.071077,0.049845,0.042417
7,1.9601,1.982468,0.344,0.07462,0.065807,0.055953
8,1.8959,1.909971,0.412,0.047158,0.076181,0.055031
9,1.8588,1.863749,0.432,0.046734,0.081029,0.058088
10,1.8091,1.829727,0.442,0.063036,0.097008,0.069519


[I 2025-03-15 14:07:17,222] Trial 8 finished with value: 0.07208223034205392 and parameters: {'learning_rate': 0.00029891977384598987, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 6.5}. Best is trial 2 with value: 0.33802696888160705.


Trial 9 with params: {'learning_rate': 0.00041087915453240814, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3851,2.213947,0.11,0.002619,0.02381,0.004719
2,2.1238,2.149519,0.11,0.002619,0.02381,0.004719
3,2.0396,2.059207,0.112,0.014534,0.026455,0.009065
4,1.9205,1.902651,0.406,0.032762,0.079048,0.042886
5,1.8031,1.766116,0.42,0.039395,0.078086,0.051941
6,1.715,1.667978,0.442,0.042067,0.106224,0.058683
7,1.6359,1.60854,0.43,0.039663,0.101198,0.053874
8,1.5556,1.585102,0.448,0.051745,0.105657,0.062895
9,1.5189,1.490988,0.484,0.077228,0.118858,0.078862
10,1.4437,1.457116,0.5,0.072202,0.120358,0.077968


[I 2025-03-15 14:09:33,171] Trial 9 finished with value: 0.12151530248380986 and parameters: {'learning_rate': 0.00041087915453240814, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 2 with value: 0.33802696888160705.


Trial 10 with params: {'learning_rate': 0.002041934417684722, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2043,2.072624,0.142,0.029309,0.044444,0.028226
2,1.8453,1.742371,0.424,0.040781,0.104477,0.057327
3,1.6343,1.529938,0.464,0.06965,0.113779,0.064503
4,1.4579,1.431311,0.484,0.075823,0.119286,0.077165
5,1.3244,1.261316,0.548,0.101844,0.139079,0.097506
6,1.2116,1.205865,0.552,0.090938,0.147454,0.102359
7,1.1151,1.168127,0.566,0.12247,0.165247,0.124343
8,0.985,1.116856,0.586,0.147414,0.184466,0.138347
9,0.8799,1.077881,0.606,0.186184,0.230682,0.181198
10,0.7563,1.026437,0.618,0.18411,0.240936,0.19065


[I 2025-03-15 14:11:54,113] Trial 10 finished with value: 0.3494718015484391 and parameters: {'learning_rate': 0.002041934417684722, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 10 with value: 0.3494718015484391.


Trial 11 with params: {'learning_rate': 0.00318176128710325, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1426,1.943944,0.216,0.032018,0.066575,0.039893
2,1.7175,1.575402,0.466,0.067522,0.114298,0.062007
3,1.4813,1.399076,0.502,0.079388,0.126268,0.085097
4,1.3135,1.228468,0.546,0.088021,0.137752,0.097209
5,1.18,1.205554,0.556,0.101078,0.163863,0.112534
6,1.0538,1.125273,0.578,0.160807,0.173183,0.133452
7,0.9054,1.05466,0.606,0.192291,0.22047,0.179228
8,0.782,1.07035,0.624,0.236428,0.245725,0.215236
9,0.6675,0.979785,0.636,0.261699,0.285504,0.231438
10,0.5423,0.994234,0.664,0.29244,0.325112,0.276821


[I 2025-03-15 14:14:08,502] Trial 11 finished with value: 0.3957648826536272 and parameters: {'learning_rate': 0.00318176128710325, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 11 with value: 0.3957648826536272.


Trial 12 with params: {'learning_rate': 0.0030035251452626105, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3162,2.141594,0.168,0.032697,0.039832,0.018982
2,1.886,1.787531,0.416,0.045391,0.092944,0.053979
3,1.6454,1.525721,0.492,0.069128,0.120871,0.074827
4,1.4552,1.400403,0.49,0.082574,0.128422,0.08579
5,1.2991,1.247072,0.56,0.105325,0.151017,0.111097
6,1.1768,1.169453,0.564,0.117458,0.161493,0.119269
7,1.0735,1.156762,0.574,0.144802,0.181124,0.145679
8,0.944,1.132883,0.588,0.208492,0.20854,0.171487
9,0.839,1.049893,0.61,0.186173,0.235293,0.193157
10,0.7048,1.01598,0.622,0.227638,0.259207,0.211802


[I 2025-03-15 14:16:31,833] Trial 12 finished with value: 0.36229630290606646 and parameters: {'learning_rate': 0.0030035251452626105, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 11 with value: 0.3957648826536272.


Trial 13 with params: {'learning_rate': 0.0013883069209569172, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3396,2.171528,0.116,0.003994,0.025238,0.006735
2,2.0088,1.916487,0.418,0.04881,0.104805,0.061505
3,1.7814,1.688216,0.4,0.045676,0.084925,0.058465
4,1.6105,1.556272,0.468,0.070388,0.114614,0.064034
5,1.4799,1.437164,0.5,0.078838,0.127691,0.085926
6,1.3754,1.341805,0.546,0.102447,0.144475,0.103289
7,1.2789,1.265986,0.554,0.096863,0.148762,0.105971
8,1.1758,1.226803,0.554,0.106655,0.14138,0.107583
9,1.1102,1.187725,0.56,0.107361,0.154544,0.113865
10,1.0092,1.150531,0.584,0.14241,0.180744,0.138522


[I 2025-03-15 14:18:47,777] Trial 13 finished with value: 0.2934652345367953 and parameters: {'learning_rate': 0.0013883069209569172, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 5.5}. Best is trial 11 with value: 0.3957648826536272.


Trial 14 with params: {'learning_rate': 0.003879925621399434, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2753,2.025818,0.138,0.025671,0.043492,0.026662
2,1.7903,1.664112,0.45,0.042536,0.110667,0.058783
3,1.5606,1.448712,0.498,0.075348,0.119701,0.0815
4,1.3856,1.310532,0.534,0.092252,0.145564,0.103178
5,1.2223,1.193554,0.564,0.101732,0.155938,0.112449
6,1.0981,1.142804,0.58,0.17186,0.192885,0.153662
7,0.9746,1.106936,0.594,0.160687,0.203848,0.163677
8,0.8333,1.040474,0.636,0.231348,0.262821,0.231923
9,0.7075,0.98739,0.626,0.245007,0.26586,0.225442
10,0.5881,1.000976,0.644,0.288131,0.293802,0.263608


[I 2025-03-15 14:20:59,448] Trial 14 finished with value: 0.4253194679448775 and parameters: {'learning_rate': 0.003879925621399434, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 14 with value: 0.4253194679448775.


Trial 15 with params: {'learning_rate': 0.0011785292970990639, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3184,2.150988,0.112,0.010571,0.024286,0.005643
2,2.0233,1.960784,0.39,0.052914,0.096252,0.065238
3,1.8075,1.72047,0.392,0.046015,0.083021,0.057078
4,1.6382,1.588871,0.458,0.046475,0.112203,0.062836
5,1.5059,1.461563,0.484,0.069337,0.118801,0.0741
6,1.399,1.367846,0.532,0.076153,0.134211,0.091378
7,1.307,1.297208,0.546,0.085351,0.135291,0.094234
8,1.2159,1.266376,0.552,0.112453,0.140856,0.106831
9,1.157,1.198859,0.56,0.104671,0.154267,0.113062
10,1.0547,1.155219,0.576,0.137465,0.171984,0.129585


[I 2025-03-15 14:23:14,432] Trial 15 finished with value: 0.2878921684587183 and parameters: {'learning_rate': 0.0011785292970990639, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 14 with value: 0.4253194679448775.


Trial 16 with params: {'learning_rate': 0.002980078004962928, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.14,1.975699,0.204,0.038364,0.061765,0.041818
2,1.7512,1.664118,0.446,0.042222,0.110716,0.057852
3,1.5208,1.403955,0.496,0.079702,0.130733,0.085666
4,1.3448,1.244141,0.546,0.089318,0.136279,0.09488
5,1.2044,1.185385,0.554,0.098803,0.162337,0.108382
6,1.0772,1.132838,0.56,0.144966,0.161282,0.123869
7,0.9479,1.058292,0.592,0.161419,0.206335,0.161859
8,0.8186,1.089634,0.64,0.27515,0.2609,0.238266
9,0.7174,0.97963,0.644,0.27194,0.290419,0.244793
10,0.5893,1.022339,0.636,0.289986,0.281025,0.248289


[I 2025-03-15 14:25:41,834] Trial 16 finished with value: 0.38768863359051975 and parameters: {'learning_rate': 0.002980078004962928, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 7.0}. Best is trial 14 with value: 0.4253194679448775.


Trial 17 with params: {'learning_rate': 0.0029594436387712733, 'weight_decay': 0.002, 'adam_beta1': 0.97, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1463,1.973226,0.212,0.034061,0.063454,0.039738
2,1.777,1.709517,0.416,0.043839,0.104703,0.056669
3,1.5676,1.48354,0.5,0.067067,0.120116,0.078114
4,1.4274,1.393652,0.496,0.086042,0.135799,0.092999
5,1.3089,1.270269,0.552,0.112665,0.15522,0.113273
6,1.2073,1.200794,0.554,0.098537,0.151072,0.10883
7,1.1167,1.187225,0.572,0.119537,0.183238,0.128415
8,1.0039,1.149671,0.562,0.160857,0.194703,0.143955
9,0.9324,1.130026,0.592,0.170752,0.21987,0.169845
10,0.8325,1.046048,0.618,0.206088,0.239249,0.183809


[I 2025-03-15 14:27:58,225] Trial 17 finished with value: 0.32871105685223473 and parameters: {'learning_rate': 0.0029594436387712733, 'weight_decay': 0.002, 'adam_beta1': 0.97, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.0}. Best is trial 14 with value: 0.4253194679448775.


Trial 18 with params: {'learning_rate': 0.003958309427236694, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2608,2.007659,0.146,0.026674,0.047566,0.02932
2,1.7898,1.681016,0.444,0.041104,0.108913,0.057499
3,1.5764,1.43703,0.524,0.071371,0.131662,0.086912
4,1.406,1.375006,0.49,0.089752,0.132682,0.09136
5,1.2627,1.24858,0.556,0.125957,0.154029,0.117258
6,1.145,1.154855,0.572,0.165821,0.183787,0.139653
7,1.0325,1.126868,0.594,0.185247,0.19933,0.17497
8,0.9058,1.106545,0.606,0.189836,0.223443,0.192467
9,0.7928,1.020567,0.628,0.283928,0.263918,0.229497
10,0.6713,0.997414,0.646,0.25989,0.293551,0.247223


[I 2025-03-15 14:30:16,383] Trial 18 finished with value: 0.39042725057137706 and parameters: {'learning_rate': 0.003958309427236694, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 5.5}. Best is trial 14 with value: 0.4253194679448775.


Trial 19 with params: {'learning_rate': 0.00025720788256592157, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4281,2.334639,0.11,0.002619,0.02381,0.004719
2,2.2231,2.190979,0.11,0.002619,0.02381,0.004719
3,2.1222,2.177653,0.11,0.002619,0.02381,0.004719
4,2.0686,2.118934,0.11,0.002619,0.02381,0.004719
5,2.0173,2.053105,0.112,0.026434,0.035714,0.020601
6,1.9555,1.94886,0.386,0.047617,0.07216,0.050644
7,1.882,1.896752,0.416,0.057649,0.092857,0.06145
8,1.8073,1.815909,0.43,0.058443,0.094108,0.063756
9,1.7668,1.748077,0.4,0.037873,0.084796,0.051665
10,1.7067,1.709618,0.43,0.038996,0.103367,0.054634


[I 2025-03-15 14:32:37,430] Trial 19 finished with value: 0.08581929935303678 and parameters: {'learning_rate': 0.00025720788256592157, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 14 with value: 0.4253194679448775.


Trial 20 with params: {'learning_rate': 8.464343745077094e-05, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4714,2.437675,0.11,0.002624,0.02381,0.004728
2,2.4341,2.402607,0.11,0.002619,0.02381,0.004719
3,2.3853,2.351338,0.11,0.002619,0.02381,0.004719
4,2.3108,2.277242,0.11,0.002619,0.02381,0.004719
5,2.2064,2.204405,0.11,0.002619,0.02381,0.004719
6,2.1357,2.188982,0.11,0.002619,0.02381,0.004719
7,2.1145,2.172418,0.11,0.002619,0.02381,0.004719
8,2.0861,2.145189,0.11,0.002619,0.02381,0.004719
9,2.0718,2.120405,0.11,0.002619,0.02381,0.004719
10,2.0482,2.098574,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 14:34:28,392] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.002425187341911066, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3559,2.158966,0.206,0.035879,0.061938,0.039029
2,1.9825,1.909618,0.428,0.04271,0.110185,0.058822
3,1.7407,1.690397,0.426,0.075432,0.102727,0.066478
4,1.5521,1.513913,0.488,0.071821,0.119797,0.074863
5,1.4211,1.353609,0.528,0.079357,0.13042,0.092292
6,1.3063,1.267851,0.552,0.10545,0.148993,0.104596
7,1.2049,1.218409,0.556,0.108793,0.162551,0.113605
8,1.0842,1.175218,0.57,0.15063,0.178978,0.128699
9,1.0049,1.128744,0.586,0.165048,0.197441,0.150365
10,0.8919,1.081082,0.616,0.175153,0.230641,0.179338


[I 2025-03-15 14:36:45,682] Trial 21 finished with value: 0.3337877958696962 and parameters: {'learning_rate': 0.002425187341911066, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.0}. Best is trial 14 with value: 0.4253194679448775.


Trial 22 with params: {'learning_rate': 0.003617279293760323, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.216,1.985042,0.19,0.028329,0.060255,0.034919
2,1.7694,1.707377,0.438,0.041497,0.108528,0.056825
3,1.5605,1.458623,0.51,0.072117,0.128156,0.083577
4,1.3964,1.412401,0.488,0.092666,0.130521,0.090178
5,1.2616,1.250094,0.554,0.098332,0.149268,0.109159
6,1.1417,1.174173,0.56,0.15234,0.168246,0.119683
7,1.0375,1.121544,0.588,0.151972,0.185652,0.148115
8,0.9144,1.102502,0.602,0.180685,0.230868,0.185726
9,0.8086,1.041845,0.62,0.230288,0.26358,0.212502
10,0.6873,1.007933,0.636,0.282785,0.280724,0.242581


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-15 14:41:04,409] Trial 22 finished with value: 0.43313539056363315 and parameters: {'learning_rate': 0.003617279293760323, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 22 with value: 0.43313539056363315.


Trial 23 with params: {'learning_rate': 0.002338453380503679, 'weight_decay': 0.004, 'adam_beta1': 0.98, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2811,2.135033,0.16,0.03197,0.037927,0.017951
2,1.9523,1.907804,0.33,0.051848,0.103679,0.057217
3,1.7428,1.687468,0.406,0.073395,0.097935,0.062061
4,1.5802,1.556998,0.5,0.068262,0.122958,0.079334
5,1.4785,1.458691,0.49,0.087943,0.125873,0.087973
6,1.3769,1.331879,0.538,0.083557,0.139205,0.096494
7,1.2987,1.290695,0.548,0.093411,0.14817,0.105162
8,1.2064,1.235305,0.562,0.102302,0.145921,0.109778
9,1.1416,1.179315,0.568,0.107664,0.183164,0.119642
10,1.061,1.164143,0.564,0.129402,0.187061,0.131564


[I 2025-03-15 14:43:20,560] Trial 23 finished with value: 0.28412997382251676 and parameters: {'learning_rate': 0.002338453380503679, 'weight_decay': 0.004, 'adam_beta1': 0.98, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.0}. Best is trial 22 with value: 0.43313539056363315.


Trial 24 with params: {'learning_rate': 0.0015249758381195417, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3036,2.138958,0.176,0.009074,0.039524,0.014654
2,2.0011,1.91858,0.45,0.044531,0.112707,0.062395
3,1.7762,1.712118,0.412,0.054947,0.099168,0.067634
4,1.6031,1.554221,0.462,0.044973,0.113155,0.061728
5,1.4765,1.438668,0.498,0.079686,0.127268,0.08621
6,1.3723,1.336692,0.55,0.104369,0.145943,0.105454
7,1.2857,1.28363,0.552,0.092799,0.146643,0.103571
8,1.1719,1.215059,0.558,0.108231,0.149887,0.114439
9,1.0985,1.172641,0.564,0.112607,0.169582,0.121819
10,1.0013,1.152327,0.57,0.111238,0.167267,0.123238


[I 2025-03-15 14:45:35,261] Trial 24 finished with value: 0.3109008768763665 and parameters: {'learning_rate': 0.0015249758381195417, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 6.0}. Best is trial 22 with value: 0.43313539056363315.


Trial 25 with params: {'learning_rate': 0.002869753778279075, 'weight_decay': 0.002, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.29,2.137305,0.19,0.036223,0.053086,0.03281
2,1.8995,1.820979,0.42,0.04299,0.093853,0.054644
3,1.6893,1.608657,0.458,0.063833,0.11285,0.073904
4,1.5046,1.460691,0.508,0.090775,0.132744,0.092738
5,1.4021,1.376435,0.514,0.09136,0.13738,0.097696
6,1.3023,1.264691,0.56,0.099176,0.142087,0.102403
7,1.2181,1.198516,0.556,0.11217,0.166573,0.116494
8,1.1131,1.177371,0.558,0.094405,0.172949,0.111964
9,1.0515,1.154706,0.572,0.161279,0.19894,0.149151
10,0.9562,1.102038,0.604,0.198256,0.229977,0.185164


[I 2025-03-15 14:48:02,375] Trial 25 finished with value: 0.33983301980424946 and parameters: {'learning_rate': 0.002869753778279075, 'weight_decay': 0.002, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 2.5}. Best is trial 22 with value: 0.43313539056363315.


Trial 26 with params: {'learning_rate': 0.0046281371099103985, 'weight_decay': 0.0, 'adam_beta1': 0.97, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3312,2.107511,0.124,0.025355,0.040159,0.023895
2,1.8245,1.673584,0.43,0.043101,0.105623,0.059657
3,1.5911,1.472255,0.506,0.066354,0.124417,0.079593
4,1.432,1.425972,0.494,0.087882,0.141643,0.093095
5,1.3015,1.286003,0.538,0.089186,0.153386,0.102573
6,1.1861,1.183715,0.568,0.151021,0.18501,0.134601
7,1.0895,1.177693,0.57,0.165781,0.197566,0.152198
8,0.9761,1.109503,0.606,0.179343,0.219386,0.181836
9,0.8913,1.102331,0.6,0.190543,0.23841,0.188819
10,0.7777,1.049905,0.602,0.186136,0.24152,0.195841


[I 2025-03-15 14:50:17,169] Trial 26 finished with value: 0.3538936525964921 and parameters: {'learning_rate': 0.0046281371099103985, 'weight_decay': 0.0, 'adam_beta1': 0.97, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 22 with value: 0.43313539056363315.


Trial 27 with params: {'learning_rate': 0.0019901230990337034, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3162,2.152279,0.184,0.010098,0.041472,0.01597
2,1.9412,1.846192,0.43,0.041577,0.096473,0.05604
3,1.6997,1.596263,0.452,0.067403,0.110805,0.061341
4,1.5039,1.431499,0.488,0.072462,0.120067,0.075988
5,1.3441,1.286288,0.544,0.10127,0.13811,0.096151
6,1.2215,1.193861,0.556,0.094453,0.151463,0.105427
7,1.1109,1.151632,0.568,0.115923,0.169812,0.124305
8,0.9774,1.123234,0.582,0.163471,0.194674,0.148309
9,0.8765,1.03751,0.612,0.184576,0.223533,0.183074
10,0.7399,1.012403,0.622,0.234827,0.251706,0.201415


[I 2025-03-15 14:52:35,189] Trial 27 finished with value: 0.36983276782427127 and parameters: {'learning_rate': 0.0019901230990337034, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 22 with value: 0.43313539056363315.


Trial 28 with params: {'learning_rate': 0.0030484546421435488, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1421,1.966715,0.206,0.036244,0.062198,0.040742
2,1.759,1.670482,0.438,0.043987,0.108246,0.059028
3,1.5391,1.420367,0.506,0.0775,0.130214,0.087269
4,1.3819,1.3448,0.522,0.080911,0.134538,0.092225
5,1.2494,1.239184,0.542,0.120497,0.155599,0.110677
6,1.1259,1.150714,0.558,0.09721,0.155455,0.109284
7,1.0158,1.118664,0.578,0.160184,0.186821,0.14377
8,0.8923,1.08218,0.604,0.182221,0.222308,0.17989
9,0.7846,1.017556,0.62,0.192488,0.265263,0.201647
10,0.6607,0.992367,0.644,0.246968,0.274342,0.234945


[I 2025-03-15 14:54:55,467] Trial 28 finished with value: 0.3908329351311741 and parameters: {'learning_rate': 0.0030484546421435488, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 5.0}. Best is trial 22 with value: 0.43313539056363315.


Trial 29 with params: {'learning_rate': 0.0033426295053184175, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2047,1.955462,0.212,0.032093,0.06341,0.038194
2,1.7413,1.605145,0.456,0.044162,0.112096,0.061122
3,1.4925,1.414984,0.486,0.076143,0.119364,0.07847
4,1.3276,1.271541,0.548,0.093756,0.144288,0.10469
5,1.1751,1.178289,0.568,0.103594,0.164156,0.11397
6,1.0572,1.126769,0.582,0.158617,0.184596,0.142047
7,0.9293,1.033698,0.61,0.158378,0.219051,0.170712
8,0.7812,1.060074,0.63,0.231679,0.259028,0.226291
9,0.6746,0.976703,0.638,0.255156,0.286948,0.236035
10,0.5505,0.992571,0.638,0.263844,0.293581,0.250259


[I 2025-03-15 14:57:16,928] Trial 29 finished with value: 0.36041145418361853 and parameters: {'learning_rate': 0.0033426295053184175, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 6.5}. Best is trial 22 with value: 0.43313539056363315.


Trial 30 with params: {'learning_rate': 0.00033060529420710517, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.429,2.309459,0.11,0.002619,0.02381,0.004719
2,2.1835,2.21658,0.11,0.002619,0.02381,0.004719
3,2.0877,2.116827,0.11,0.002619,0.02381,0.004719
4,2.0113,1.994289,0.362,0.069335,0.05829,0.042553
5,1.892,1.873824,0.428,0.059534,0.093545,0.064726
6,1.8059,1.771865,0.43,0.040323,0.094021,0.05531
7,1.7249,1.688437,0.412,0.037727,0.099038,0.052608
8,1.6381,1.637741,0.45,0.047257,0.108412,0.062525
9,1.5997,1.556238,0.45,0.049051,0.108215,0.064598
10,1.5337,1.538591,0.454,0.068014,0.109229,0.061557


[I 2025-03-15 14:58:45,410] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.002230062879340359, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2733,2.134086,0.146,0.030631,0.03455,0.016
2,1.9014,1.808136,0.42,0.04016,0.094135,0.053306
3,1.6772,1.567562,0.47,0.062449,0.115286,0.066005
4,1.4834,1.448008,0.484,0.077408,0.125008,0.080028
5,1.3379,1.278346,0.548,0.101261,0.139926,0.099027
6,1.2268,1.19805,0.556,0.090144,0.150636,0.103579
7,1.1239,1.170847,0.564,0.150514,0.167801,0.127379
8,0.9979,1.131833,0.584,0.192745,0.196938,0.160697
9,0.9046,1.070025,0.61,0.187934,0.233311,0.18909
10,0.7766,1.035374,0.612,0.186634,0.242539,0.191115


[I 2025-03-15 15:01:01,357] Trial 31 finished with value: 0.3649686984617929 and parameters: {'learning_rate': 0.002230062879340359, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 22 with value: 0.43313539056363315.


Trial 32 with params: {'learning_rate': 0.0030131554715445443, 'weight_decay': 0.006, 'adam_beta1': 0.97, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1443,1.968421,0.208,0.035957,0.058206,0.037433
2,1.7748,1.698529,0.424,0.043426,0.10576,0.057035
3,1.5638,1.474175,0.5,0.070012,0.120086,0.078844
4,1.4216,1.389479,0.49,0.084977,0.132832,0.091137
5,1.3034,1.257319,0.55,0.099936,0.144134,0.105409
6,1.2,1.195,0.558,0.131903,0.157855,0.119109
7,1.1047,1.185384,0.564,0.145642,0.174424,0.131294
8,0.9882,1.145656,0.558,0.155726,0.191957,0.138623
9,0.9144,1.105999,0.602,0.168345,0.221397,0.167993
10,0.8122,1.040955,0.618,0.259706,0.243098,0.19493


[I 2025-03-15 15:03:16,596] Trial 32 finished with value: 0.34794175907114827 and parameters: {'learning_rate': 0.0030131554715445443, 'weight_decay': 0.006, 'adam_beta1': 0.97, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 4.0}. Best is trial 22 with value: 0.43313539056363315.


Trial 33 with params: {'learning_rate': 0.0016370495724040907, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2684,2.12724,0.152,0.030132,0.048995,0.031303
2,1.9807,1.921588,0.446,0.047227,0.114102,0.062977
3,1.7601,1.686667,0.398,0.050318,0.095835,0.060588
4,1.5826,1.524768,0.462,0.046027,0.113199,0.06285
5,1.4642,1.421818,0.51,0.085651,0.133065,0.091414
6,1.3626,1.324036,0.55,0.091145,0.145406,0.098849
7,1.2773,1.255889,0.562,0.097983,0.153133,0.110092
8,1.1636,1.195833,0.568,0.109408,0.160314,0.119475
9,1.0911,1.165525,0.572,0.113668,0.165449,0.122986
10,0.989,1.124937,0.578,0.106494,0.168738,0.123407


[I 2025-03-15 15:05:43,414] Trial 33 finished with value: 0.33827712647678815 and parameters: {'learning_rate': 0.0016370495724040907, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 5.0}. Best is trial 22 with value: 0.43313539056363315.


Trial 34 with params: {'learning_rate': 0.0030840127103672625, 'weight_decay': 0.005, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1437,1.962554,0.212,0.034797,0.063497,0.040215
2,1.7644,1.683722,0.43,0.045355,0.106341,0.058803
3,1.5546,1.447945,0.496,0.075144,0.127944,0.083432
4,1.4081,1.400545,0.482,0.081209,0.127374,0.085592
5,1.2867,1.242193,0.544,0.086769,0.138509,0.096478
6,1.1829,1.194931,0.556,0.104862,0.157256,0.113406
7,1.0847,1.184907,0.564,0.154466,0.178545,0.139311
8,0.9608,1.128328,0.584,0.161507,0.206908,0.157738
9,0.872,1.080636,0.598,0.179944,0.221685,0.174214
10,0.763,1.010957,0.62,0.188818,0.24762,0.191309


[I 2025-03-15 15:07:56,064] Trial 34 finished with value: 0.33551962982040584 and parameters: {'learning_rate': 0.0030840127103672625, 'weight_decay': 0.005, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 22 with value: 0.43313539056363315.


Trial 35 with params: {'learning_rate': 0.0038956717793760523, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2065,1.976997,0.202,0.037412,0.050183,0.028319
2,1.7692,1.714685,0.438,0.042413,0.108246,0.057706
3,1.5489,1.455575,0.506,0.071495,0.121553,0.080889
4,1.3882,1.34553,0.532,0.084638,0.14327,0.097825
5,1.2552,1.219794,0.55,0.117858,0.141654,0.109414
6,1.1437,1.161011,0.56,0.113083,0.154315,0.115363
7,1.0328,1.10935,0.592,0.152015,0.197406,0.146368
8,0.8977,1.120931,0.598,0.172028,0.219689,0.175076
9,0.793,1.036036,0.63,0.229224,0.253529,0.219009
10,0.6795,1.015153,0.638,0.290655,0.264133,0.234137


[I 2025-03-15 15:10:21,918] Trial 35 finished with value: 0.44430764628951386 and parameters: {'learning_rate': 0.0038956717793760523, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 4.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 36 with params: {'learning_rate': 0.004823920299638799, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8166,2.24604,0.13,0.036015,0.075878,0.038889
2,1.9455,1.806665,0.422,0.041686,0.094416,0.055094
3,1.64,1.520436,0.472,0.072059,0.115714,0.067732
4,1.4325,1.346033,0.516,0.086973,0.130696,0.086593
5,1.2564,1.182918,0.566,0.095342,0.153904,0.10989
6,1.1185,1.152556,0.574,0.124195,0.178001,0.13068
7,0.9958,1.137857,0.582,0.146828,0.203854,0.149499
8,0.8565,1.092245,0.614,0.262977,0.250313,0.222558
9,0.7271,1.020041,0.638,0.299007,0.293311,0.260883
10,0.5897,1.01161,0.632,0.298506,0.286992,0.261094


[I 2025-03-15 15:12:42,815] Trial 36 finished with value: 0.39031948694784896 and parameters: {'learning_rate': 0.004823920299638799, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 37 with params: {'learning_rate': 2.197945691935017e-05, 'weight_decay': 0.007, 'adam_beta1': 0.97, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4771,2.455299,0.006,0.000143,0.02381,0.000284
2,2.4674,2.446641,0.106,0.005974,0.037951,0.009523
3,2.4585,2.43867,0.11,0.002624,0.02381,0.004728
4,2.4498,2.430665,0.11,0.002619,0.02381,0.004719
5,2.44,2.422428,0.11,0.002619,0.02381,0.004719
6,2.4299,2.413776,0.11,0.002619,0.02381,0.004719
7,2.4175,2.404676,0.11,0.002619,0.02381,0.004719
8,2.4082,2.395032,0.11,0.002619,0.02381,0.004719
9,2.3944,2.384563,0.11,0.002619,0.02381,0.004719
10,2.3847,2.373398,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 15:14:59,557] Trial 37 finished with value: 0.004719004719004719 and parameters: {'learning_rate': 2.197945691935017e-05, 'weight_decay': 0.007, 'adam_beta1': 0.97, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 4.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 38 with params: {'learning_rate': 0.0005011383935846559, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3631,2.190398,0.11,0.002619,0.02381,0.004719
2,2.1081,2.13656,0.11,0.002619,0.02381,0.004719
3,2.0159,2.031859,0.126,0.029911,0.035166,0.01822
4,1.8912,1.872883,0.4,0.060072,0.086878,0.060065
5,1.7811,1.761788,0.438,0.047955,0.095969,0.063658
6,1.7028,1.683593,0.442,0.040471,0.108719,0.056161
7,1.6223,1.589206,0.446,0.046507,0.107177,0.062047
8,1.5362,1.538816,0.472,0.071882,0.115879,0.069089
9,1.5018,1.464851,0.482,0.071052,0.118234,0.071693
10,1.4225,1.429809,0.514,0.072213,0.126769,0.082722


[I 2025-03-15 15:16:26,989] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.0010319599653348635, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3636,2.219572,0.11,0.002619,0.02381,0.004719
2,2.0651,2.054651,0.118,0.034402,0.043651,0.026208
3,1.873,1.807346,0.396,0.058246,0.086013,0.061027
4,1.7137,1.696712,0.43,0.042614,0.103324,0.057643
5,1.5863,1.555759,0.46,0.049139,0.112809,0.065533
6,1.4969,1.476768,0.508,0.070311,0.122337,0.078058
7,1.4078,1.397212,0.532,0.075878,0.128232,0.088811
8,1.3203,1.327497,0.546,0.09788,0.140785,0.101103
9,1.2703,1.289638,0.54,0.077612,0.135896,0.092728
10,1.1774,1.232309,0.558,0.108673,0.159978,0.11844


[I 2025-03-15 15:17:55,149] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.0012246803234831542, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.278,2.130311,0.128,0.029017,0.030265,0.013493
2,2.0005,1.938605,0.438,0.044446,0.11098,0.060708
3,1.7795,1.712716,0.398,0.049702,0.095878,0.062714
4,1.6091,1.562592,0.464,0.047134,0.113675,0.064296
5,1.4858,1.437293,0.51,0.071861,0.125296,0.080223
6,1.3784,1.355816,0.54,0.089817,0.134232,0.095208
7,1.2901,1.293642,0.544,0.094628,0.13494,0.09731
8,1.1961,1.248272,0.554,0.118523,0.14167,0.110465
9,1.1316,1.188167,0.572,0.113009,0.169428,0.125089
10,1.0299,1.151995,0.578,0.121798,0.173635,0.131967


[I 2025-03-15 15:20:17,673] Trial 40 finished with value: 0.31119480898383395 and parameters: {'learning_rate': 0.0012246803234831542, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 41 with params: {'learning_rate': 0.0031548188564358977, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1452,1.958526,0.214,0.034779,0.06393,0.040338
2,1.759,1.685717,0.438,0.042091,0.108246,0.057466
3,1.5559,1.439677,0.494,0.073112,0.127143,0.082395
4,1.4054,1.405244,0.484,0.089169,0.13282,0.091097
5,1.2879,1.245203,0.554,0.094672,0.139907,0.099574
6,1.1934,1.215776,0.548,0.089809,0.150799,0.103497
7,1.0867,1.181519,0.562,0.156111,0.180372,0.138064
8,0.961,1.139959,0.568,0.160085,0.196894,0.1466
9,0.8765,1.08261,0.596,0.168222,0.222095,0.169766
10,0.7653,1.013255,0.612,0.192032,0.24597,0.191319


[I 2025-03-15 15:22:35,180] Trial 41 finished with value: 0.34153267140446153 and parameters: {'learning_rate': 0.0031548188564358977, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 42 with params: {'learning_rate': 0.0035157192340374947, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1672,1.964058,0.208,0.038961,0.058163,0.037902
2,1.761,1.688583,0.448,0.043588,0.112274,0.060797
3,1.5498,1.446276,0.514,0.074225,0.132324,0.086922
4,1.3906,1.368341,0.518,0.084199,0.136505,0.0939
5,1.2609,1.215387,0.546,0.097503,0.141415,0.105796
6,1.1545,1.167748,0.566,0.104001,0.16765,0.116564
7,1.0478,1.10506,0.588,0.147368,0.191001,0.142609
8,0.9178,1.159327,0.582,0.178184,0.218781,0.170004
9,0.8197,1.035885,0.616,0.204232,0.245298,0.200463
10,0.7149,1.025516,0.628,0.218179,0.25509,0.216071


[I 2025-03-15 15:24:49,207] Trial 42 finished with value: 0.38603419100009123 and parameters: {'learning_rate': 0.0035157192340374947, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 43 with params: {'learning_rate': 0.00015448517085097122, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4652,2.419309,0.11,0.002619,0.02381,0.004719
2,2.3939,2.344229,0.11,0.002619,0.02381,0.004719
3,2.2849,2.238938,0.11,0.002619,0.02381,0.004719
4,2.1614,2.188971,0.11,0.002619,0.02381,0.004719
5,2.1262,2.230998,0.11,0.002619,0.02381,0.004719
6,2.1358,2.193555,0.11,0.002619,0.02381,0.004719
7,2.0925,2.142396,0.11,0.002619,0.02381,0.004719
8,2.0535,2.11383,0.11,0.002619,0.02381,0.004719
9,2.0427,2.094004,0.11,0.002624,0.02381,0.004728
10,2.0216,2.074417,0.112,0.01455,0.026455,0.009091


[I 2025-03-15 15:26:18,824] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.00478019581429169, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5008,2.112802,0.128,0.029067,0.030265,0.013571
2,1.8227,1.71144,0.432,0.054778,0.094818,0.059517
3,1.5956,1.531531,0.474,0.076875,0.11505,0.072849
4,1.4366,1.364554,0.496,0.091688,0.129501,0.090043
5,1.2928,1.250432,0.554,0.105914,0.154989,0.114287
6,1.1622,1.161733,0.564,0.12966,0.165297,0.118535
7,1.0453,1.118036,0.58,0.155739,0.186592,0.144497
8,0.9128,1.117,0.59,0.166893,0.219497,0.169797
9,0.8064,1.049945,0.598,0.219534,0.240783,0.210832
10,0.6751,1.035486,0.608,0.223687,0.23644,0.205594


[I 2025-03-15 15:28:34,694] Trial 44 finished with value: 0.3448905785069116 and parameters: {'learning_rate': 0.00478019581429169, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 5.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 45 with params: {'learning_rate': 0.004277769917039058, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.286,2.077592,0.128,0.025358,0.046961,0.026038
2,1.7957,1.659711,0.464,0.064239,0.113901,0.066551
3,1.556,1.465582,0.49,0.076181,0.12069,0.079238
4,1.3762,1.328063,0.514,0.095911,0.143163,0.101485
5,1.2185,1.201703,0.562,0.124393,0.16853,0.118531
6,1.097,1.144199,0.584,0.170108,0.177616,0.148368
7,0.9948,1.130607,0.6,0.195275,0.224471,0.180929
8,0.8569,1.058639,0.604,0.198153,0.238636,0.199663
9,0.7439,1.04044,0.618,0.239316,0.253261,0.220668
10,0.611,1.034938,0.626,0.275281,0.292608,0.252426


[I 2025-03-15 15:30:57,822] Trial 45 finished with value: 0.4147199573870624 and parameters: {'learning_rate': 0.004277769917039058, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 46 with params: {'learning_rate': 0.004526358004577521, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2021,1.938836,0.254,0.039641,0.071249,0.035236
2,1.6953,1.528607,0.514,0.064302,0.123617,0.079027
3,1.4476,1.377032,0.514,0.088276,0.13865,0.096368
4,1.2454,1.160606,0.57,0.137535,0.165809,0.130905
5,1.0955,1.108184,0.58,0.156861,0.178632,0.14352
6,0.9434,1.060323,0.608,0.205187,0.230854,0.193856
7,0.8002,0.990141,0.64,0.265388,0.276519,0.236903
8,0.6534,0.961296,0.664,0.339253,0.339398,0.30044
9,0.5391,0.938052,0.672,0.324617,0.338893,0.304983
10,0.4311,0.979773,0.646,0.30737,0.35103,0.301635


[I 2025-03-15 15:33:13,856] Trial 46 finished with value: 0.38853827973777516 and parameters: {'learning_rate': 0.004526358004577521, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 3.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 47 with params: {'learning_rate': 0.004652492967552257, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4683,2.121951,0.13,0.040957,0.042169,0.02545
2,1.8198,1.694981,0.432,0.041719,0.094714,0.057079
3,1.5869,1.517745,0.476,0.072267,0.114216,0.071577
4,1.4236,1.351272,0.498,0.085727,0.131168,0.091592
5,1.2813,1.251347,0.55,0.10784,0.151723,0.115519
6,1.1537,1.159727,0.566,0.102855,0.162104,0.116116
7,1.0441,1.107885,0.588,0.131714,0.191933,0.143153
8,0.9121,1.11805,0.59,0.162681,0.21067,0.162041
9,0.8077,1.066876,0.606,0.18809,0.242644,0.195804
10,0.6867,1.016271,0.618,0.248775,0.258513,0.220614


[I 2025-03-15 15:35:27,583] Trial 47 finished with value: 0.37219814116678324 and parameters: {'learning_rate': 0.004652492967552257, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 48 with params: {'learning_rate': 0.003778213760209093, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3288,2.083565,0.134,0.02905,0.038201,0.023429
2,1.8282,1.708235,0.43,0.042437,0.096407,0.057514
3,1.6013,1.478885,0.512,0.066898,0.126011,0.080402
4,1.4228,1.377434,0.502,0.085106,0.128531,0.088164
5,1.2723,1.252141,0.548,0.098887,0.148422,0.108737
6,1.149,1.152522,0.582,0.152422,0.181996,0.141105
7,1.04,1.137686,0.584,0.159375,0.197393,0.154286
8,0.9049,1.155533,0.604,0.21563,0.233076,0.201941
9,0.7902,1.04403,0.62,0.228501,0.248213,0.215257
10,0.6645,1.026911,0.648,0.262272,0.292826,0.247176


[I 2025-03-15 15:37:45,650] Trial 48 finished with value: 0.34858505138408097 and parameters: {'learning_rate': 0.003778213760209093, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 49 with params: {'learning_rate': 0.00073268109441029, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.358,2.202659,0.11,0.002619,0.02381,0.004719
2,2.0807,2.088099,0.11,0.002624,0.02381,0.004728
3,1.9244,1.865834,0.388,0.05909,0.084151,0.05906
4,1.7796,1.73003,0.414,0.042438,0.088129,0.056763
5,1.6566,1.636451,0.448,0.047357,0.107696,0.062919
6,1.5685,1.531344,0.488,0.067889,0.120214,0.07234
7,1.4886,1.4517,0.486,0.07376,0.115086,0.07477
8,1.4092,1.428505,0.52,0.089575,0.134854,0.094825
9,1.3685,1.356367,0.526,0.071602,0.126646,0.084371
10,1.2814,1.311179,0.542,0.093873,0.144432,0.100193


[I 2025-03-15 15:39:14,294] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 1.4151885822385198e-05, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4781,2.457989,0.006,0.000143,0.02381,0.000284
2,2.472,2.452209,0.006,0.000143,0.02381,0.000284
3,2.4665,2.446676,0.1,0.00927,0.036652,0.01295
4,2.4608,2.441272,0.11,0.002651,0.02381,0.004771
5,2.4542,2.435787,0.11,0.002619,0.02381,0.004719
6,2.4473,2.430119,0.11,0.002619,0.02381,0.004719
7,2.4384,2.424281,0.11,0.002619,0.02381,0.004719
8,2.4336,2.418484,0.11,0.002619,0.02381,0.004719
9,2.4244,2.412179,0.11,0.002619,0.02381,0.004719
10,2.4205,2.405586,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 15:40:45,419] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 9.804013238495092e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4702,2.43407,0.11,0.002619,0.02381,0.004719
2,2.4266,2.391912,0.11,0.002619,0.02381,0.004719
3,2.366,2.327566,0.11,0.002619,0.02381,0.004719
4,2.2733,2.241824,0.11,0.002619,0.02381,0.004719
5,2.1619,2.188903,0.11,0.002619,0.02381,0.004719
6,2.1252,2.195535,0.11,0.002619,0.02381,0.004719
7,2.1066,2.158351,0.11,0.002619,0.02381,0.004719
8,2.0739,2.134546,0.11,0.002619,0.02381,0.004719
9,2.0619,2.110932,0.11,0.002619,0.02381,0.004719
10,2.034,2.084008,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 15:43:00,668] Trial 51 finished with value: 0.05768072343914006 and parameters: {'learning_rate': 9.804013238495092e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 52 with params: {'learning_rate': 0.003916632308830288, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2065,1.966943,0.214,0.040055,0.06393,0.04263
2,1.7297,1.57335,0.466,0.063199,0.111742,0.064751
3,1.4738,1.398931,0.494,0.077223,0.121243,0.080358
4,1.2998,1.190682,0.548,0.079949,0.138459,0.096524
5,1.1545,1.163817,0.574,0.117492,0.172737,0.124505
6,1.015,1.135571,0.584,0.18377,0.193431,0.1631
7,0.8741,1.002375,0.632,0.207368,0.245713,0.203473
8,0.7442,1.094967,0.626,0.268326,0.248654,0.222491
9,0.6349,0.974789,0.662,0.296586,0.328352,0.277782
10,0.5022,1.009097,0.65,0.304752,0.350458,0.292523


[I 2025-03-15 15:45:38,440] Trial 52 finished with value: 0.4276814914504484 and parameters: {'learning_rate': 0.003916632308830288, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 53 with params: {'learning_rate': 0.0027028820148778197, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1781,2.021659,0.166,0.026322,0.054497,0.031991
2,1.7722,1.696582,0.444,0.040554,0.110326,0.056174
3,1.5496,1.491756,0.482,0.076941,0.118498,0.07757
4,1.3837,1.315172,0.534,0.08737,0.137838,0.09889
5,1.2401,1.208177,0.56,0.097482,0.157169,0.108404
6,1.1362,1.14618,0.574,0.1239,0.165513,0.122131
7,1.0304,1.149736,0.584,0.162377,0.204758,0.154946
8,0.8763,1.106184,0.612,0.238168,0.245832,0.201724
9,0.7708,1.054155,0.622,0.252866,0.275627,0.216616
10,0.6429,1.037813,0.626,0.221776,0.277371,0.225105


[I 2025-03-15 15:47:59,332] Trial 53 finished with value: 0.374333468982434 and parameters: {'learning_rate': 0.0027028820148778197, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 54 with params: {'learning_rate': 0.0030295094063305735, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1398,1.961943,0.206,0.037733,0.062198,0.041647
2,1.7417,1.642313,0.456,0.044335,0.111966,0.061217
3,1.5123,1.391668,0.498,0.079991,0.131357,0.086737
4,1.3372,1.234194,0.552,0.089444,0.13832,0.096497
5,1.2053,1.191288,0.558,0.103783,0.168605,0.112289
6,1.0761,1.128782,0.574,0.125899,0.172062,0.126694
7,0.9419,1.06676,0.59,0.15709,0.197041,0.158071
8,0.8081,1.06682,0.624,0.248797,0.255606,0.228973
9,0.6985,0.979067,0.646,0.262609,0.300505,0.249921
10,0.5658,1.03595,0.636,0.283566,0.312819,0.266104


[I 2025-03-15 15:50:13,394] Trial 54 finished with value: 0.4112470840853582 and parameters: {'learning_rate': 0.0030295094063305735, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 55 with params: {'learning_rate': 0.004871846700976889, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5131,2.091725,0.138,0.029806,0.032646,0.014743
2,1.8042,1.666054,0.458,0.061461,0.112454,0.067012
3,1.54,1.472406,0.482,0.097288,0.119829,0.077499
4,1.3606,1.238291,0.548,0.084692,0.14523,0.096706
5,1.1964,1.161162,0.564,0.142345,0.177474,0.121462
6,1.0592,1.101261,0.588,0.155431,0.193417,0.150731
7,0.9122,1.062112,0.606,0.179157,0.21969,0.17964
8,0.7895,1.096975,0.622,0.244835,0.265383,0.237247
9,0.6928,1.003841,0.628,0.292756,0.27137,0.254452
10,0.5588,1.033159,0.642,0.286691,0.298025,0.267751


[I 2025-03-15 15:52:33,431] Trial 55 finished with value: 0.38796356403494303 and parameters: {'learning_rate': 0.004871846700976889, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 56 with params: {'learning_rate': 0.004900529295194352, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3807,2.121366,0.12,0.02795,0.034868,0.020277
2,1.8027,1.667337,0.47,0.066804,0.115446,0.07104
3,1.5577,1.468291,0.478,0.074699,0.117338,0.073493
4,1.3806,1.30108,0.52,0.093476,0.140766,0.100543
5,1.2082,1.18634,0.57,0.12606,0.179798,0.121726
6,1.0737,1.123456,0.588,0.168878,0.192161,0.160296
7,0.9529,1.074076,0.604,0.17843,0.218584,0.180051
8,0.8053,1.063533,0.614,0.204152,0.243402,0.205614
9,0.6894,1.007178,0.626,0.277039,0.281257,0.250427
10,0.5497,1.07343,0.606,0.321148,0.296166,0.271227


[I 2025-03-15 15:54:48,220] Trial 56 finished with value: 0.4005873524067207 and parameters: {'learning_rate': 0.004900529295194352, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 57 with params: {'learning_rate': 0.003565303863074173, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3075,2.117943,0.146,0.030112,0.041058,0.025094
2,1.8275,1.700925,0.452,0.059346,0.099884,0.066139
3,1.5694,1.483125,0.478,0.074533,0.117338,0.073974
4,1.3901,1.304977,0.544,0.087424,0.141728,0.099787
5,1.2313,1.201271,0.568,0.109115,0.171413,0.113962
6,1.1083,1.130609,0.578,0.147405,0.168369,0.130017
7,0.9807,1.07242,0.612,0.171785,0.217225,0.167956
8,0.838,1.083979,0.628,0.245711,0.256865,0.229838
9,0.7284,0.975739,0.636,0.267107,0.268648,0.230269
10,0.5998,0.997293,0.646,0.278757,0.290849,0.253054


[I 2025-03-15 15:57:03,022] Trial 57 finished with value: 0.4054383644714752 and parameters: {'learning_rate': 0.003565303863074173, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 58 with params: {'learning_rate': 0.0001202975463488601, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.462,2.419021,0.11,0.002619,0.02381,0.004719
2,2.3977,2.349414,0.11,0.002619,0.02381,0.004719
3,2.2893,2.237077,0.11,0.002619,0.02381,0.004719
4,2.1554,2.185045,0.11,0.002619,0.02381,0.004719
5,2.1051,2.15835,0.11,0.002619,0.02381,0.004719
6,2.0787,2.119381,0.11,0.002619,0.02381,0.004719
7,2.0483,2.078469,0.11,0.002619,0.02381,0.004719
8,1.9995,2.023205,0.35,0.051793,0.052225,0.036936
9,1.9584,1.955649,0.396,0.04336,0.074541,0.051229
10,1.9047,1.907964,0.418,0.03958,0.079779,0.052102


[I 2025-03-15 15:59:15,573] Trial 58 finished with value: 0.058835097562588166 and parameters: {'learning_rate': 0.0001202975463488601, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 59 with params: {'learning_rate': 2.357469248792504e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.477,2.454763,0.006,0.000143,0.02381,0.000284
2,2.4666,2.445451,0.112,0.004362,0.03925,0.007489
3,2.4566,2.436187,0.11,0.002624,0.02381,0.004728
4,2.4461,2.426232,0.11,0.002619,0.02381,0.004719
5,2.4335,2.41532,0.11,0.002619,0.02381,0.004719
6,2.4198,2.402745,0.11,0.002619,0.02381,0.004719
7,2.4021,2.388346,0.11,0.002619,0.02381,0.004719
8,2.3855,2.37216,0.11,0.002619,0.02381,0.004719
9,2.3623,2.35266,0.11,0.002619,0.02381,0.004719
10,2.3398,2.331304,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 16:00:45,737] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.004827642977911999, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2132,1.932923,0.214,0.025806,0.066099,0.034977
2,1.7102,1.586287,0.47,0.070753,0.115385,0.068563
3,1.4577,1.381126,0.506,0.078758,0.127325,0.085808
4,1.273,1.19514,0.57,0.092465,0.155572,0.110451
5,1.1225,1.149104,0.568,0.17908,0.190428,0.139411
6,0.9884,1.065825,0.61,0.18916,0.2229,0.183335
7,0.8538,1.036164,0.634,0.19555,0.266737,0.204955
8,0.7217,1.025101,0.64,0.258752,0.313625,0.259338
9,0.62,0.983071,0.638,0.298712,0.316231,0.270879
10,0.5096,0.956489,0.67,0.308031,0.352999,0.299876


[I 2025-03-15 16:03:18,968] Trial 60 finished with value: 0.39396180573535666 and parameters: {'learning_rate': 0.004827642977911999, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 61 with params: {'learning_rate': 0.002521989013635502, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3287,2.163138,0.142,0.030176,0.033598,0.01532
2,1.9245,1.822355,0.436,0.04122,0.097945,0.056328
3,1.6798,1.566282,0.474,0.063044,0.116316,0.069441
4,1.485,1.421075,0.514,0.085904,0.141592,0.095582
5,1.3228,1.25782,0.55,0.096602,0.149895,0.105239
6,1.2036,1.178132,0.574,0.107985,0.164644,0.118671
7,1.1066,1.173354,0.556,0.145547,0.177565,0.131686
8,0.97,1.150681,0.586,0.199644,0.197676,0.162269
9,0.8704,1.044438,0.622,0.212675,0.24006,0.199141
10,0.7389,1.00282,0.634,0.231646,0.263795,0.219143


[I 2025-03-15 16:05:51,146] Trial 61 finished with value: 0.36688737632962437 and parameters: {'learning_rate': 0.002521989013635502, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 62 with params: {'learning_rate': 0.0049435068338065154, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2217,1.935809,0.216,0.023996,0.068745,0.03352
2,1.7036,1.568617,0.476,0.06801,0.116818,0.070918
3,1.4435,1.335631,0.512,0.080591,0.13094,0.089495
4,1.2487,1.193939,0.566,0.121613,0.154193,0.115061
5,1.1158,1.165817,0.576,0.15819,0.189333,0.144038
6,0.9638,1.071014,0.612,0.192123,0.225442,0.184781
7,0.8198,0.996914,0.638,0.214938,0.275223,0.218514
8,0.6688,1.020307,0.642,0.266625,0.312739,0.260645
9,0.5613,0.957165,0.66,0.327098,0.337635,0.302843
10,0.4505,0.973539,0.648,0.30876,0.348604,0.305873


[I 2025-03-15 16:08:09,428] Trial 62 finished with value: 0.3959074314556888 and parameters: {'learning_rate': 0.0049435068338065154, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 63 with params: {'learning_rate': 0.0034320058256246695, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3802,2.138294,0.152,0.032955,0.046349,0.026669
2,1.8611,1.732822,0.426,0.067906,0.095485,0.058206
3,1.5959,1.480854,0.488,0.071927,0.119723,0.07393
4,1.4048,1.310754,0.526,0.085324,0.130402,0.088862
5,1.2511,1.193226,0.558,0.097991,0.153687,0.108795
6,1.116,1.127352,0.566,0.138172,0.168726,0.122808
7,0.9733,1.073677,0.6,0.15327,0.206416,0.15794
8,0.8454,1.039704,0.628,0.214896,0.261413,0.210126
9,0.7294,0.970875,0.644,0.243793,0.282509,0.233111
10,0.6011,0.975587,0.654,0.302159,0.30279,0.270348


[I 2025-03-15 16:10:20,720] Trial 63 finished with value: 0.39876839614245174 and parameters: {'learning_rate': 0.0034320058256246695, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 6.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 64 with params: {'learning_rate': 0.003992930652607662, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2187,1.974314,0.206,0.037998,0.055517,0.03477
2,1.7562,1.671311,0.458,0.068133,0.112559,0.062493
3,1.5285,1.447039,0.492,0.072774,0.118378,0.077432
4,1.3581,1.266354,0.562,0.088526,0.150633,0.10493
5,1.2128,1.193441,0.558,0.10743,0.162242,0.116693
6,1.085,1.124538,0.574,0.143857,0.177907,0.13073
7,0.9551,1.083441,0.604,0.162061,0.21197,0.160113
8,0.8223,1.055502,0.636,0.280781,0.263445,0.234065
9,0.7173,0.971696,0.644,0.256634,0.285539,0.246603
10,0.5911,0.990764,0.644,0.268839,0.303542,0.256968


[I 2025-03-15 16:12:33,901] Trial 64 finished with value: 0.4259649882765149 and parameters: {'learning_rate': 0.003992930652607662, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 65 with params: {'learning_rate': 0.0034501497133051246, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1611,1.949357,0.214,0.034641,0.06393,0.040236
2,1.7294,1.579765,0.46,0.045712,0.112852,0.063115
3,1.4767,1.40693,0.494,0.077403,0.124168,0.081056
4,1.305,1.20397,0.546,0.084463,0.135452,0.094838
5,1.1721,1.180942,0.564,0.107138,0.175067,0.118115
6,1.0386,1.11963,0.582,0.144528,0.184283,0.137665
7,0.8859,1.034569,0.62,0.17955,0.226775,0.17985
8,0.7649,1.080113,0.628,0.238479,0.247969,0.222209
9,0.6754,1.009946,0.638,0.231159,0.307827,0.239624
10,0.5564,1.070102,0.626,0.286288,0.32041,0.266876


[I 2025-03-15 16:14:45,412] Trial 65 finished with value: 0.38881331881449727 and parameters: {'learning_rate': 0.0034501497133051246, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 66 with params: {'learning_rate': 0.0012370849250114737, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2827,2.127911,0.114,0.003777,0.024762,0.006453
2,2.0142,1.982671,0.356,0.046216,0.093766,0.05514
3,1.81,1.751036,0.406,0.051278,0.097609,0.063584
4,1.6507,1.636216,0.446,0.045227,0.107177,0.060022
5,1.5374,1.499295,0.478,0.073395,0.117381,0.072912
6,1.4444,1.404576,0.518,0.073845,0.127709,0.084962
7,1.3557,1.337332,0.546,0.093791,0.141215,0.099182
8,1.2535,1.272781,0.552,0.103606,0.149826,0.110601
9,1.1988,1.204656,0.562,0.095382,0.150318,0.108316
10,1.106,1.176208,0.568,0.107247,0.163322,0.119334


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-15 16:17:31,169] Trial 66 finished with value: 0.2719690352336607 and parameters: {'learning_rate': 0.0012370849250114737, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 67 with params: {'learning_rate': 0.003841675523415964, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2381,2.032799,0.148,0.024447,0.048042,0.028374
2,1.7757,1.681436,0.464,0.065672,0.114097,0.063728
3,1.5494,1.459327,0.498,0.077437,0.122434,0.083756
4,1.3666,1.301473,0.548,0.096224,0.149796,0.107041
5,1.2061,1.18688,0.562,0.119596,0.169084,0.117866
6,1.0828,1.146975,0.562,0.164763,0.159154,0.131708
7,0.977,1.104813,0.604,0.173018,0.209643,0.169403
8,0.8435,1.042558,0.616,0.223322,0.238836,0.206111
9,0.7289,1.029117,0.62,0.245276,0.256553,0.224467
10,0.5973,0.991915,0.638,0.277727,0.293933,0.259539


[I 2025-03-15 16:19:47,890] Trial 67 finished with value: 0.4083392945281006 and parameters: {'learning_rate': 0.003841675523415964, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 68 with params: {'learning_rate': 0.0037550446718773276, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2302,2.016561,0.156,0.025169,0.049947,0.029495
2,1.7764,1.706211,0.432,0.040008,0.107708,0.055574
3,1.5531,1.45677,0.5,0.075906,0.122837,0.082251
4,1.3737,1.314068,0.524,0.093881,0.143048,0.101926
5,1.2121,1.189834,0.568,0.129995,0.170518,0.120935
6,1.0945,1.151792,0.566,0.1606,0.158037,0.13073
7,0.993,1.128845,0.608,0.189396,0.21893,0.175271
8,0.8557,1.05364,0.614,0.22036,0.249748,0.212732
9,0.7369,1.0489,0.622,0.272697,0.264432,0.239795
10,0.608,1.017872,0.628,0.278206,0.292667,0.250602


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-15 16:22:28,447] Trial 68 finished with value: 0.41318398584947197 and parameters: {'learning_rate': 0.0037550446718773276, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 69 with params: {'learning_rate': 0.0010559343458298738, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3463,2.190619,0.11,0.002619,0.02381,0.004719
2,2.0411,1.988491,0.356,0.054782,0.076776,0.058003
3,1.8285,1.732515,0.392,0.042922,0.083021,0.055463
4,1.6572,1.61311,0.454,0.045891,0.111533,0.06149
5,1.5195,1.466115,0.488,0.067515,0.116999,0.072568
6,1.4134,1.372017,0.526,0.07435,0.129796,0.08904
7,1.3221,1.318732,0.544,0.090125,0.134796,0.096612
8,1.2306,1.281304,0.554,0.122298,0.143934,0.109797
9,1.1826,1.201535,0.564,0.099178,0.158236,0.112326
10,1.0743,1.1661,0.568,0.108644,0.168147,0.121169


[I 2025-03-15 16:24:38,379] Trial 69 finished with value: 0.280569856713581 and parameters: {'learning_rate': 0.0010559343458298738, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 70 with params: {'learning_rate': 0.0008711758430034588, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3458,2.210252,0.11,0.002619,0.02381,0.004719
2,2.0646,2.05309,0.112,0.026434,0.035714,0.020601
3,1.8818,1.807178,0.398,0.057157,0.086532,0.059922
4,1.7251,1.69615,0.434,0.042642,0.104602,0.057982
5,1.5949,1.555935,0.456,0.049702,0.111857,0.066149
6,1.505,1.484523,0.508,0.070263,0.122337,0.077705
7,1.4148,1.401178,0.524,0.077399,0.129229,0.088123
8,1.3266,1.364823,0.534,0.11789,0.131285,0.098989
9,1.2746,1.270673,0.55,0.099864,0.139787,0.10177
10,1.1932,1.238905,0.556,0.104654,0.146688,0.103893


[I 2025-03-15 16:26:09,198] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.001335668591584621, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3249,2.154205,0.136,0.005927,0.03,0.009876
2,2.0152,1.939924,0.412,0.052199,0.099038,0.059304
3,1.7933,1.709626,0.398,0.04678,0.084406,0.0585
4,1.6257,1.582146,0.456,0.047544,0.111727,0.062927
5,1.4995,1.453794,0.488,0.079327,0.121865,0.079535
6,1.394,1.362616,0.544,0.10086,0.137683,0.095584
7,1.2982,1.293794,0.544,0.1023,0.134855,0.095912
8,1.1943,1.239993,0.552,0.106746,0.140885,0.107431
9,1.1235,1.186603,0.562,0.112208,0.157593,0.116096
10,1.0333,1.168742,0.566,0.113233,0.172658,0.121436


[I 2025-03-15 16:28:18,226] Trial 71 finished with value: 0.3384720363781911 and parameters: {'learning_rate': 0.001335668591584621, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 72 with params: {'learning_rate': 0.0034355216773972745, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1591,1.948485,0.214,0.032797,0.06393,0.039273
2,1.7271,1.569966,0.462,0.06951,0.113359,0.064105
3,1.4757,1.402052,0.496,0.078034,0.12757,0.082228
4,1.3065,1.214052,0.546,0.084259,0.136098,0.095631
5,1.1733,1.181916,0.562,0.103274,0.169496,0.114117
6,1.0387,1.124802,0.572,0.163416,0.180541,0.13921
7,0.8893,1.046441,0.608,0.180512,0.218113,0.175123
8,0.7655,1.089373,0.63,0.264654,0.254382,0.236384
9,0.6747,0.989497,0.646,0.24623,0.29044,0.24576
10,0.5453,1.026936,0.648,0.311969,0.336629,0.290726


[I 2025-03-15 16:30:30,090] Trial 72 finished with value: 0.42783503718077165 and parameters: {'learning_rate': 0.0034355216773972745, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 73 with params: {'learning_rate': 0.004150535205825585, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2696,2.05826,0.136,0.023165,0.043016,0.02561
2,1.7866,1.677089,0.476,0.063543,0.11721,0.068945
3,1.5461,1.453633,0.492,0.077007,0.120958,0.080548
4,1.3606,1.291819,0.558,0.095333,0.153067,0.109411
5,1.1964,1.188158,0.568,0.125282,0.179932,0.123839
6,1.075,1.141104,0.58,0.16988,0.169809,0.143878
7,0.9558,1.09709,0.592,0.18434,0.205709,0.168884
8,0.8318,1.041794,0.618,0.212361,0.24824,0.21061
9,0.715,1.012288,0.622,0.250958,0.261902,0.222109
10,0.5793,1.061689,0.602,0.25855,0.289791,0.250592


[I 2025-03-15 16:32:40,618] Trial 73 finished with value: 0.41325129000967864 and parameters: {'learning_rate': 0.004150535205825585, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 74 with params: {'learning_rate': 0.0021838254011812756, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2144,2.086558,0.144,0.029341,0.04709,0.030064
2,1.8454,1.7364,0.422,0.041456,0.103675,0.0572
3,1.6298,1.52867,0.458,0.070386,0.112646,0.063948
4,1.4529,1.421928,0.484,0.075943,0.123557,0.07888
5,1.3134,1.257857,0.552,0.102438,0.140909,0.100302
6,1.2043,1.19338,0.55,0.089414,0.146162,0.100902
7,1.0999,1.168659,0.566,0.146069,0.167075,0.130377
8,0.9627,1.108246,0.592,0.166076,0.188836,0.145915
9,0.8566,1.070003,0.602,0.186082,0.237655,0.185757
10,0.7404,1.042001,0.61,0.188204,0.242096,0.192098


[I 2025-03-15 16:34:58,750] Trial 74 finished with value: 0.3465445777098193 and parameters: {'learning_rate': 0.0021838254011812756, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 75 with params: {'learning_rate': 0.004707508506661088, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3473,2.114434,0.126,0.025836,0.042804,0.02596
2,1.8142,1.675201,0.446,0.067534,0.109506,0.062241
3,1.5782,1.473751,0.494,0.073379,0.121317,0.07713
4,1.4025,1.364324,0.498,0.093753,0.134325,0.094423
5,1.2533,1.235424,0.55,0.116941,0.169007,0.118336
6,1.1233,1.135469,0.582,0.153046,0.19114,0.143161
7,1.0182,1.120888,0.608,0.160775,0.219952,0.166961
8,0.8867,1.09997,0.602,0.211702,0.241326,0.194257
9,0.7777,1.04768,0.614,0.216697,0.265468,0.215521
10,0.6721,1.026879,0.628,0.230338,0.283754,0.233821


[I 2025-03-15 16:37:12,401] Trial 75 finished with value: 0.40969950389767623 and parameters: {'learning_rate': 0.004707508506661088, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 76 with params: {'learning_rate': 0.0036812466761480984, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2234,2.003379,0.18,0.027106,0.055661,0.032405
2,1.7731,1.726952,0.436,0.040503,0.108617,0.056084
3,1.5558,1.453754,0.51,0.07267,0.128325,0.084219
4,1.3815,1.332573,0.518,0.092586,0.140071,0.099226
5,1.2234,1.188204,0.56,0.113696,0.158002,0.117671
6,1.1012,1.147694,0.566,0.139073,0.161697,0.127739
7,0.9952,1.121762,0.6,0.163886,0.205361,0.162673
8,0.8631,1.055098,0.616,0.244295,0.248257,0.208011
9,0.7425,1.023058,0.626,0.260846,0.26598,0.237769
10,0.6185,1.005149,0.644,0.294603,0.287415,0.255465


[I 2025-03-15 16:39:24,221] Trial 76 finished with value: 0.3588466302924434 and parameters: {'learning_rate': 0.0036812466761480984, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 77 with params: {'learning_rate': 0.0016986441571415448, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.99, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3154,2.128709,0.136,0.006138,0.03,0.010188
2,2.0217,1.995061,0.334,0.044158,0.091071,0.051164
3,1.8254,1.786414,0.428,0.050424,0.102934,0.066491
4,1.6772,1.686446,0.432,0.071397,0.104034,0.062993
5,1.5762,1.567872,0.474,0.070157,0.116373,0.071856
6,1.4978,1.468299,0.5,0.072356,0.122837,0.079617
7,1.4223,1.403846,0.516,0.093629,0.137555,0.097592
8,1.3533,1.34807,0.542,0.083479,0.138358,0.097418
9,1.2988,1.297492,0.544,0.086762,0.142789,0.100507
10,1.2318,1.270831,0.556,0.095067,0.143942,0.104931


[I 2025-03-15 16:40:50,974] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 0.004994190775603122, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3956,2.128379,0.122,0.028505,0.037513,0.022643
2,1.79,1.666428,0.486,0.062288,0.119751,0.074033
3,1.5244,1.457525,0.48,0.082232,0.121306,0.079346
4,1.3421,1.282304,0.544,0.099625,0.151524,0.109243
5,1.1563,1.141784,0.588,0.15502,0.189326,0.14157
6,1.0267,1.116609,0.588,0.171304,0.206069,0.169369
7,0.8981,1.079325,0.604,0.197249,0.237431,0.198999
8,0.7401,1.005069,0.634,0.234191,0.273266,0.230802
9,0.6269,1.008189,0.64,0.277438,0.29928,0.26723
10,0.5038,1.025138,0.62,0.266131,0.307588,0.262449


[I 2025-03-15 16:43:06,161] Trial 78 finished with value: 0.37824885405371983 and parameters: {'learning_rate': 0.004994190775603122, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 79 with params: {'learning_rate': 1.2801409085483677e-05, 'weight_decay': 0.0, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.478,2.458142,0.006,0.000143,0.02381,0.000284
2,2.4724,2.452918,0.006,0.000143,0.02381,0.000284
3,2.4675,2.448001,0.068,0.018608,0.037229,0.015847
4,2.4627,2.443366,0.11,0.00281,0.02381,0.005027
5,2.4571,2.438926,0.11,0.002624,0.02381,0.004728
6,2.4514,2.43448,0.11,0.002619,0.02381,0.004719
7,2.4439,2.43005,0.11,0.002619,0.02381,0.004719
8,2.4408,2.42568,0.11,0.002619,0.02381,0.004719
9,2.4337,2.421204,0.11,0.002619,0.02381,0.004719
10,2.4323,2.416751,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 16:45:35,943] Trial 79 finished with value: 0.004719004719004719 and parameters: {'learning_rate': 1.2801409085483677e-05, 'weight_decay': 0.0, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 6.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 80 with params: {'learning_rate': 0.003415453079956214, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1576,1.947731,0.214,0.034641,0.06393,0.040236
2,1.7308,1.582317,0.458,0.068347,0.112407,0.062944
3,1.4828,1.403285,0.498,0.07766,0.125182,0.082279
4,1.3118,1.215275,0.544,0.082318,0.135814,0.09618
5,1.177,1.190306,0.564,0.109082,0.175067,0.119481
6,1.0452,1.110884,0.584,0.142253,0.180472,0.136795
7,0.9111,1.044305,0.622,0.182427,0.226783,0.180711
8,0.7874,1.078938,0.638,0.272696,0.256197,0.234687
9,0.6867,0.975226,0.642,0.246788,0.291049,0.236592
10,0.5623,1.043463,0.634,0.272797,0.315932,0.262987


[I 2025-03-15 16:47:57,440] Trial 80 finished with value: 0.3761779427260183 and parameters: {'learning_rate': 0.003415453079956214, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 6.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 81 with params: {'learning_rate': 0.0019385661340414991, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3221,2.152906,0.176,0.032408,0.054752,0.034637
2,1.9633,1.863102,0.442,0.038931,0.099374,0.053993
3,1.7174,1.646683,0.424,0.045983,0.101982,0.058806
4,1.5293,1.468703,0.466,0.052705,0.115659,0.067248
5,1.3827,1.308267,0.54,0.080299,0.139699,0.096276
6,1.276,1.251791,0.554,0.094897,0.142375,0.101817
7,1.1673,1.173936,0.564,0.106572,0.158281,0.116287
8,1.0447,1.187301,0.566,0.142169,0.175325,0.131891
9,0.953,1.079886,0.6,0.179784,0.19935,0.164134
10,0.8265,1.042953,0.61,0.178382,0.220372,0.173463


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-15 16:50:53,140] Trial 81 finished with value: 0.35378044913474166 and parameters: {'learning_rate': 0.0019385661340414991, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 4.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 82 with params: {'learning_rate': 0.0028074506710445914, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3158,2.125501,0.16,0.031034,0.050943,0.032638
2,1.8988,1.833685,0.426,0.039537,0.095564,0.053092
3,1.6787,1.599787,0.478,0.062461,0.117416,0.070152
4,1.488,1.470417,0.514,0.082292,0.135383,0.092037
5,1.3518,1.299165,0.532,0.082286,0.138074,0.094732
6,1.2396,1.20103,0.56,0.09289,0.14486,0.104674
7,1.1332,1.19485,0.57,0.161559,0.184324,0.14486
8,1.0033,1.167651,0.59,0.20771,0.206417,0.173437
9,0.9268,1.074253,0.604,0.180743,0.230362,0.18424
10,0.7882,0.995973,0.626,0.192834,0.239841,0.198407


[I 2025-03-15 16:53:13,693] Trial 82 finished with value: 0.37446511375524344 and parameters: {'learning_rate': 0.0028074506710445914, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 83 with params: {'learning_rate': 0.0011565885381962524, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2895,2.131982,0.112,0.00404,0.024286,0.005577
2,2.0249,2.002642,0.326,0.046495,0.089166,0.053066
3,1.8209,1.759799,0.414,0.050584,0.099514,0.06479
4,1.6593,1.649175,0.446,0.045629,0.107133,0.060249
5,1.545,1.501299,0.48,0.07287,0.117918,0.073929
6,1.4487,1.416502,0.522,0.072609,0.128796,0.085693
7,1.3588,1.338829,0.548,0.103929,0.138766,0.098061
8,1.2547,1.267648,0.548,0.101424,0.139128,0.103277
9,1.1938,1.205726,0.56,0.104289,0.153306,0.113037
10,1.1017,1.178438,0.57,0.111087,0.158746,0.119526


[I 2025-03-15 16:55:35,365] Trial 83 finished with value: 0.25990210077281534 and parameters: {'learning_rate': 0.0011565885381962524, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 84 with params: {'learning_rate': 0.004002522298712889, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2202,1.974645,0.208,0.038006,0.058163,0.037136
2,1.7588,1.680076,0.458,0.044669,0.112486,0.061831
3,1.5331,1.449001,0.494,0.075347,0.124598,0.080779
4,1.3614,1.276819,0.55,0.083326,0.142881,0.097789
5,1.2167,1.194721,0.556,0.107053,0.162053,0.11662
6,1.0863,1.125127,0.572,0.147534,0.170667,0.130073
7,0.9568,1.097383,0.602,0.162609,0.205356,0.15891
8,0.8262,1.039775,0.634,0.237123,0.256212,0.218998
9,0.7173,0.973461,0.638,0.24823,0.284801,0.242764
10,0.5899,0.978694,0.656,0.279827,0.302559,0.264441


[I 2025-03-15 16:57:55,852] Trial 84 finished with value: 0.42353498513623256 and parameters: {'learning_rate': 0.004002522298712889, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 85 with params: {'learning_rate': 0.0033457778333964247, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.153,1.948422,0.214,0.032797,0.06393,0.039273
2,1.7272,1.578957,0.458,0.068561,0.112407,0.062978
3,1.4825,1.405064,0.488,0.076903,0.119871,0.079243
4,1.3109,1.219059,0.546,0.087457,0.139094,0.097023
5,1.1745,1.191804,0.568,0.130033,0.171427,0.12549
6,1.042,1.123178,0.578,0.146663,0.180283,0.139122
7,0.9098,1.058435,0.616,0.165924,0.218627,0.175667
8,0.7841,1.105924,0.612,0.217543,0.244556,0.216153
9,0.6866,0.97024,0.646,0.228473,0.293305,0.233821
10,0.5614,1.011199,0.652,0.309038,0.333659,0.282572


[I 2025-03-15 17:00:15,406] Trial 85 finished with value: 0.391531552547044 and parameters: {'learning_rate': 0.0033457778333964247, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 86 with params: {'learning_rate': 0.00024696163656226093, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4567,2.387163,0.11,0.002619,0.02381,0.004719
2,2.2968,2.201218,0.11,0.002619,0.02381,0.004719
3,2.127,2.184394,0.11,0.002619,0.02381,0.004719
4,2.0702,2.105539,0.11,0.002619,0.02381,0.004719
5,2.0038,2.006352,0.234,0.051251,0.038546,0.025707
6,1.9274,1.914037,0.424,0.034109,0.083616,0.046757
7,1.8455,1.822641,0.41,0.036641,0.075705,0.048674
8,1.7622,1.750512,0.416,0.037325,0.100034,0.052445
9,1.7161,1.673985,0.434,0.041952,0.104319,0.058532
10,1.6562,1.648688,0.422,0.044632,0.101462,0.059436


[I 2025-03-15 17:02:48,050] Trial 86 finished with value: 0.08839243542781439 and parameters: {'learning_rate': 0.00024696163656226093, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 87 with params: {'learning_rate': 0.00266976234226281, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3221,2.138737,0.172,0.032295,0.0538,0.034418
2,1.9261,1.866278,0.422,0.036805,0.096025,0.050639
3,1.7042,1.620557,0.444,0.060585,0.109095,0.066581
4,1.5104,1.484472,0.49,0.074212,0.121734,0.076897
5,1.3784,1.310145,0.536,0.101349,0.139085,0.096018
6,1.2704,1.220973,0.556,0.094335,0.144883,0.10527
7,1.1602,1.205346,0.562,0.115345,0.171532,0.123132
8,1.0423,1.168463,0.572,0.171628,0.191501,0.1524
9,0.966,1.094553,0.59,0.177656,0.207857,0.16738
10,0.8365,1.03765,0.618,0.176744,0.235977,0.184926


[I 2025-03-15 17:04:19,368] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 0.0017372714220887152, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3393,2.159003,0.194,0.011262,0.043853,0.017429
2,1.9815,1.881394,0.42,0.043963,0.105368,0.056638
3,1.7347,1.645712,0.432,0.071258,0.10396,0.06233
4,1.5464,1.489058,0.482,0.071798,0.11859,0.073244
5,1.4002,1.352238,0.536,0.072322,0.129222,0.087175
6,1.2956,1.296831,0.546,0.090961,0.140354,0.098536
7,1.2063,1.20245,0.558,0.093098,0.148561,0.106054
8,1.091,1.191164,0.56,0.13062,0.159326,0.120764
9,1.0052,1.160184,0.58,0.147499,0.193363,0.151245
10,0.8905,1.097523,0.6,0.159468,0.206646,0.161852


[I 2025-03-15 17:06:44,719] Trial 88 finished with value: 0.3294706180809773 and parameters: {'learning_rate': 0.0017372714220887152, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 89 with params: {'learning_rate': 0.0012653046117939745, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2722,2.130019,0.138,0.029931,0.032646,0.014927
2,1.997,1.935232,0.432,0.046332,0.106343,0.061228
3,1.7795,1.710606,0.396,0.050737,0.095358,0.06241
4,1.6107,1.566181,0.462,0.045978,0.113155,0.063113
5,1.4914,1.44724,0.5,0.07421,0.126225,0.081417
6,1.3853,1.354517,0.544,0.078302,0.13725,0.094624
7,1.2975,1.275262,0.548,0.103528,0.135858,0.09649
8,1.2022,1.237437,0.556,0.114334,0.14495,0.110951
9,1.1381,1.185488,0.56,0.096469,0.151883,0.107784
10,1.0312,1.143673,0.578,0.109643,0.169014,0.124617


[I 2025-03-15 17:08:19,824] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 0.0014741980005190131, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2577,2.127193,0.134,0.029019,0.038201,0.02338
2,1.979,1.91742,0.44,0.051779,0.113804,0.067461
3,1.7579,1.685107,0.4,0.048817,0.096311,0.060922
4,1.5806,1.52983,0.466,0.07013,0.114212,0.065436
5,1.4594,1.406155,0.524,0.077067,0.129299,0.08878
6,1.353,1.322255,0.542,0.09121,0.137707,0.097131
7,1.2678,1.258651,0.554,0.102706,0.14152,0.104637
8,1.1687,1.235174,0.56,0.111371,0.155177,0.115845
9,1.1111,1.175901,0.572,0.110094,0.165672,0.122779
10,1.0027,1.122942,0.59,0.118547,0.185307,0.133109


[I 2025-03-15 17:10:35,302] Trial 90 finished with value: 0.3265982858837523 and parameters: {'learning_rate': 0.0014741980005190131, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 91 with params: {'learning_rate': 0.0023972955023157895, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2309,2.111135,0.15,0.034556,0.051444,0.034736
2,1.8784,1.817297,0.424,0.043786,0.106516,0.059084
3,1.6832,1.608145,0.458,0.065257,0.112637,0.06977
4,1.5037,1.456534,0.488,0.081301,0.125604,0.080819
5,1.3967,1.391786,0.508,0.109828,0.132144,0.095383
6,1.2934,1.241391,0.556,0.095967,0.14571,0.105493
7,1.2077,1.22102,0.556,0.098186,0.156588,0.110481
8,1.0975,1.167823,0.578,0.186944,0.193079,0.151505
9,1.0179,1.124523,0.58,0.156361,0.189718,0.145659
10,0.9126,1.074514,0.614,0.16679,0.214269,0.170298


[I 2025-03-15 17:12:50,796] Trial 91 finished with value: 0.34502401068748445 and parameters: {'learning_rate': 0.0023972955023157895, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 92 with params: {'learning_rate': 0.0016685735742802226, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3075,2.15248,0.184,0.010033,0.041472,0.0159
2,1.9762,1.883208,0.444,0.041269,0.111322,0.058235
3,1.7366,1.663298,0.426,0.07289,0.102445,0.061962
4,1.5484,1.482875,0.466,0.070569,0.114255,0.066236
5,1.4031,1.347562,0.532,0.074922,0.13418,0.089888
6,1.2936,1.275576,0.552,0.10347,0.154524,0.108827
7,1.1986,1.1841,0.56,0.096082,0.153598,0.108296
8,1.0804,1.172083,0.562,0.110489,0.158104,0.118429
9,0.997,1.107989,0.59,0.155581,0.193297,0.150471
10,0.8699,1.068324,0.598,0.188191,0.204142,0.163696


[I 2025-03-15 17:14:25,878] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 0.0002579872450204056, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4517,2.374576,0.11,0.002619,0.02381,0.004719
2,2.2717,2.18774,0.11,0.002619,0.02381,0.004719
3,2.1186,2.16539,0.11,0.002619,0.02381,0.004719
4,2.0607,2.090426,0.11,0.002619,0.02381,0.004719
5,1.9866,1.977333,0.362,0.043271,0.056121,0.03877
6,1.9021,1.887773,0.436,0.035513,0.08632,0.048505
7,1.8293,1.785374,0.408,0.03561,0.075228,0.047791
8,1.7427,1.725309,0.416,0.038092,0.100034,0.053541
9,1.7002,1.653475,0.428,0.041894,0.102891,0.058409
10,1.6338,1.625484,0.42,0.041163,0.100986,0.056338


[I 2025-03-15 17:16:56,888] Trial 93 finished with value: 0.08967914593382759 and parameters: {'learning_rate': 0.0002579872450204056, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 94 with params: {'learning_rate': 0.0035701963719424914, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1707,1.961293,0.212,0.036825,0.063497,0.041089
2,1.7335,1.58276,0.454,0.044599,0.111424,0.062032
3,1.4834,1.407974,0.494,0.076266,0.124168,0.080325
4,1.3157,1.216487,0.548,0.085068,0.135875,0.095562
5,1.178,1.201749,0.566,0.10598,0.17259,0.117833
6,1.0431,1.112158,0.59,0.146137,0.187022,0.14386
7,0.9029,1.041235,0.606,0.176348,0.213753,0.178217
8,0.7882,1.126215,0.608,0.255794,0.235696,0.209589
9,0.6836,0.98256,0.636,0.241539,0.291416,0.235494
10,0.5524,1.005584,0.638,0.287944,0.31615,0.269626


[I 2025-03-15 17:19:21,321] Trial 94 finished with value: 0.3946954747938809 and parameters: {'learning_rate': 0.0035701963719424914, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.8, 'temperature': 7.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 95 with params: {'learning_rate': 0.003970724986061815, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2155,1.969006,0.206,0.037998,0.055517,0.03477
2,1.7518,1.639409,0.464,0.068991,0.113796,0.066555
3,1.5163,1.431932,0.488,0.076109,0.120258,0.079184
4,1.3533,1.253337,0.548,0.093775,0.145522,0.105436
5,1.205,1.194889,0.556,0.115217,0.17184,0.115057
6,1.0779,1.13043,0.564,0.154762,0.174522,0.12871
7,0.9498,1.078471,0.602,0.163262,0.213224,0.162139
8,0.8081,1.053791,0.632,0.256896,0.271558,0.232764
9,0.7064,0.980684,0.632,0.234657,0.275665,0.228611
10,0.579,1.013782,0.642,0.305221,0.331957,0.287492


[I 2025-03-15 17:21:36,815] Trial 95 finished with value: 0.4286527188961751 and parameters: {'learning_rate': 0.003970724986061815, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 2.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 96 with params: {'learning_rate': 0.004166478581882675, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2472,1.980906,0.206,0.036746,0.057643,0.036043
2,1.7696,1.703647,0.452,0.04356,0.111014,0.060201
3,1.5383,1.464272,0.5,0.074518,0.123113,0.080331
4,1.3661,1.285866,0.538,0.07519,0.138407,0.091636
5,1.2199,1.196585,0.55,0.097327,0.151824,0.107287
6,1.099,1.127765,0.564,0.122583,0.161458,0.118382
7,0.9688,1.103434,0.592,0.153163,0.195251,0.14707
8,0.8343,1.045757,0.614,0.198888,0.240851,0.19683
9,0.7282,0.996502,0.634,0.285119,0.284922,0.247074
10,0.5987,0.996449,0.65,0.295882,0.319797,0.280695


[I 2025-03-15 17:24:00,676] Trial 96 finished with value: 0.4370408166389611 and parameters: {'learning_rate': 0.004166478581882675, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 97 with params: {'learning_rate': 0.002640948978738136, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2105,2.076566,0.146,0.024577,0.049735,0.029265
2,1.8356,1.76451,0.424,0.044469,0.104108,0.05674
3,1.6379,1.545078,0.484,0.059275,0.118832,0.07125
4,1.4754,1.474853,0.49,0.085642,0.126559,0.084721
5,1.3641,1.345103,0.522,0.083679,0.132359,0.091719
6,1.2584,1.232782,0.56,0.12571,0.149736,0.116499
7,1.1606,1.210925,0.558,0.106954,0.158444,0.117407
8,1.0568,1.155669,0.588,0.156393,0.191316,0.154192
9,0.9611,1.117432,0.586,0.157599,0.195011,0.149135
10,0.8583,1.062307,0.608,0.165166,0.216503,0.174243


[I 2025-03-15 17:26:21,451] Trial 97 finished with value: 0.36680934881371163 and parameters: {'learning_rate': 0.002640948978738136, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 98 with params: {'learning_rate': 0.0005440430908475466, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.5, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3524,2.192452,0.11,0.002619,0.02381,0.004719
2,2.0956,2.12263,0.11,0.002619,0.02381,0.004719
3,1.9812,1.948716,0.386,0.040901,0.07216,0.04353
4,1.8464,1.818186,0.43,0.042671,0.096234,0.056826
5,1.7259,1.703001,0.434,0.045948,0.104319,0.06196
6,1.6418,1.596613,0.456,0.044926,0.111727,0.061644
7,1.5608,1.527143,0.456,0.067238,0.107596,0.061761
8,1.4835,1.507623,0.49,0.076405,0.116234,0.078952
9,1.447,1.434528,0.498,0.074739,0.12279,0.08226
10,1.367,1.389169,0.53,0.075569,0.133583,0.08902


[I 2025-03-15 17:28:36,894] Trial 98 finished with value: 0.13289439503064465 and parameters: {'learning_rate': 0.0005440430908475466, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.5, 'temperature': 2.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 99 with params: {'learning_rate': 0.004431071707769348, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2917,1.99109,0.198,0.036715,0.047061,0.023749
2,1.7734,1.704774,0.436,0.04435,0.107248,0.05919
3,1.5414,1.46287,0.486,0.075696,0.119664,0.079207
4,1.3658,1.274684,0.544,0.075829,0.139832,0.092469
5,1.2196,1.183666,0.552,0.135766,0.160088,0.112045
6,1.0918,1.130944,0.558,0.123277,0.169381,0.118599
7,0.968,1.079799,0.602,0.163982,0.206482,0.161576
8,0.8341,1.065295,0.618,0.228376,0.249992,0.213981
9,0.7235,0.99277,0.634,0.280434,0.308379,0.265232
10,0.5937,1.072225,0.608,0.275737,0.324709,0.267284


[I 2025-03-15 17:30:55,658] Trial 99 finished with value: 0.41962669482894827 and parameters: {'learning_rate': 0.004431071707769348, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 100 with params: {'learning_rate': 0.002594252915173049, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3157,2.138744,0.166,0.031738,0.052371,0.033629
2,1.9171,1.84207,0.424,0.038716,0.095088,0.052032
3,1.6935,1.613407,0.46,0.060537,0.112905,0.065986
4,1.5013,1.477365,0.496,0.078119,0.127691,0.083192
5,1.36,1.296448,0.538,0.10021,0.139472,0.095396
6,1.2512,1.212367,0.558,0.094759,0.144713,0.104608
7,1.1469,1.188667,0.572,0.161242,0.192021,0.138565
8,1.0156,1.162851,0.576,0.170523,0.192175,0.1532
9,0.9542,1.084428,0.596,0.174835,0.215202,0.171739
10,0.8109,1.028989,0.622,0.19562,0.24093,0.20005


[I 2025-03-15 17:33:23,591] Trial 100 finished with value: 0.35477742904738124 and parameters: {'learning_rate': 0.002594252915173049, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 101 with params: {'learning_rate': 0.003971726101657246, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2535,2.032954,0.148,0.02645,0.053893,0.031386
2,1.7698,1.675683,0.456,0.058596,0.113778,0.066982
3,1.5193,1.428899,0.488,0.074567,0.119797,0.077097
4,1.3333,1.240423,0.56,0.094211,0.150692,0.108991
5,1.1707,1.157835,0.572,0.108777,0.174153,0.121613
6,1.0436,1.130754,0.584,0.185244,0.205145,0.168105
7,0.9148,1.068195,0.61,0.200918,0.22293,0.189695
8,0.7702,1.048347,0.63,0.275892,0.276288,0.241553
9,0.6628,0.981425,0.642,0.254947,0.279086,0.237611
10,0.525,1.032619,0.624,0.335759,0.354861,0.310948


[I 2025-03-15 17:35:42,478] Trial 101 finished with value: 0.4021634748612029 and parameters: {'learning_rate': 0.003971726101657246, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 2.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 102 with params: {'learning_rate': 0.0025765602228398177, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2214,2.09243,0.136,0.023771,0.045185,0.027186
2,1.8378,1.748084,0.422,0.043739,0.103718,0.058914
3,1.6269,1.527002,0.492,0.06958,0.120797,0.07484
4,1.4642,1.454595,0.488,0.079578,0.131541,0.081872
5,1.3339,1.293364,0.542,0.084674,0.137337,0.095205
6,1.2288,1.200898,0.558,0.10198,0.148082,0.110167
7,1.134,1.190492,0.566,0.117167,0.165798,0.125987
8,1.0141,1.166919,0.578,0.170362,0.190726,0.15331
9,0.9254,1.086911,0.594,0.164682,0.216158,0.166223
10,0.7934,1.031434,0.61,0.183019,0.226673,0.18302


[I 2025-03-15 17:38:10,964] Trial 102 finished with value: 0.41402548015244495 and parameters: {'learning_rate': 0.0025765602228398177, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 2.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 103 with params: {'learning_rate': 0.0034518611920210276, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1621,1.955902,0.21,0.035547,0.058639,0.03726
2,1.7532,1.661983,0.45,0.043375,0.110624,0.060731
3,1.5397,1.434315,0.526,0.072432,0.132452,0.087672
4,1.377,1.340665,0.53,0.097606,0.138107,0.097742
5,1.2394,1.226313,0.544,0.10603,0.151606,0.114252
6,1.1255,1.15141,0.564,0.106955,0.163163,0.118729
7,1.0151,1.113816,0.594,0.15537,0.192047,0.150139
8,0.8999,1.081546,0.596,0.171704,0.219783,0.175045
9,0.7987,1.020799,0.61,0.174744,0.246367,0.18427
10,0.6864,1.006125,0.63,0.242253,0.260723,0.221333


[I 2025-03-15 17:40:41,075] Trial 103 finished with value: 0.41550985216403896 and parameters: {'learning_rate': 0.0034518611920210276, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}. Best is trial 35 with value: 0.44430764628951386.


Trial 104 with params: {'learning_rate': 0.0016292905406157508, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2709,2.124012,0.152,0.0301,0.048995,0.031257
2,1.991,1.931595,0.434,0.046005,0.107102,0.059637
3,1.779,1.709875,0.402,0.052537,0.096744,0.064369
4,1.6147,1.574833,0.46,0.068706,0.112796,0.061713
5,1.5021,1.45794,0.48,0.070229,0.115104,0.071353
6,1.4071,1.377316,0.53,0.080783,0.133988,0.09298
7,1.3277,1.315094,0.544,0.104111,0.145998,0.107528
8,1.2357,1.240948,0.55,0.094229,0.138036,0.100776
9,1.1734,1.186357,0.566,0.098823,0.159446,0.114838
10,1.0973,1.17629,0.568,0.112823,0.180942,0.12264


[I 2025-03-15 17:42:54,548] Trial 104 finished with value: 0.2660372246693934 and parameters: {'learning_rate': 0.0016292905406157508, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}. Best is trial 35 with value: 0.44430764628951386.


Trial 105 with params: {'learning_rate': 0.004107822923895355, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2384,1.980417,0.21,0.036801,0.060722,0.038232
2,1.7711,1.699008,0.454,0.043848,0.11149,0.060565
3,1.5405,1.459188,0.484,0.073198,0.116663,0.075972
4,1.3646,1.282808,0.552,0.087789,0.146624,0.102314
5,1.2188,1.193165,0.554,0.109734,0.167221,0.115287
6,1.0956,1.128808,0.562,0.12701,0.168598,0.120836
7,0.9667,1.106735,0.594,0.151087,0.193474,0.146721
8,0.8387,1.040541,0.61,0.212524,0.231074,0.195733
9,0.7284,0.983782,0.632,0.239132,0.275434,0.232578
10,0.5986,1.028067,0.628,0.278018,0.303059,0.251304


[I 2025-03-15 17:45:08,520] Trial 105 finished with value: 0.48810144914827314 and parameters: {'learning_rate': 0.004107822923895355, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 106 with params: {'learning_rate': 0.004965219962256929, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5222,2.101883,0.132,0.005691,0.029048,0.009498
2,1.807,1.656501,0.456,0.069341,0.111991,0.06588
3,1.5406,1.467986,0.48,0.074172,0.117845,0.074391
4,1.3636,1.251369,0.544,0.075024,0.140066,0.09239
5,1.1946,1.161998,0.576,0.131477,0.179735,0.13223
6,1.0543,1.101751,0.582,0.156935,0.193176,0.152516
7,0.9217,1.10029,0.604,0.16143,0.223692,0.173696
8,0.7858,1.066443,0.604,0.217066,0.246785,0.207223
9,0.6984,1.010029,0.622,0.254065,0.248237,0.225904
10,0.5673,1.044636,0.62,0.266117,0.290345,0.257041


[I 2025-03-15 17:46:42,829] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 0.0024544595002184678, 'weight_decay': 0.007, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.296,2.137977,0.164,0.031485,0.051895,0.03328
2,1.9438,1.899537,0.408,0.040197,0.096931,0.053035
3,1.7318,1.678036,0.434,0.067994,0.106788,0.0687
4,1.5515,1.520332,0.48,0.091136,0.121324,0.078966
5,1.4435,1.400358,0.506,0.07747,0.125981,0.085315
6,1.3362,1.312708,0.55,0.078199,0.138679,0.094039
7,1.2513,1.234379,0.56,0.112814,0.164848,0.120265
8,1.1409,1.173951,0.56,0.120874,0.174099,0.119327
9,1.0686,1.143308,0.576,0.154715,0.188897,0.13834
10,0.966,1.118736,0.582,0.155889,0.204725,0.152533


[I 2025-03-15 17:49:02,682] Trial 107 finished with value: 0.3527423288288801 and parameters: {'learning_rate': 0.0024544595002184678, 'weight_decay': 0.007, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 2.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 108 with params: {'learning_rate': 0.004760313817151507, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3577,2.110387,0.124,0.02455,0.040159,0.023941
2,1.7967,1.663671,0.474,0.060369,0.116459,0.071414
3,1.5523,1.477918,0.476,0.074479,0.117144,0.073407
4,1.3754,1.295304,0.54,0.095854,0.146511,0.10649
5,1.2032,1.180641,0.568,0.114821,0.18505,0.120081
6,1.0791,1.115022,0.594,0.174767,0.201789,0.16793
7,0.9586,1.10082,0.61,0.189271,0.228634,0.184106
8,0.813,1.025618,0.616,0.208335,0.249253,0.209558
9,0.6931,0.996826,0.634,0.261434,0.288912,0.247245
10,0.5575,1.007172,0.626,0.275153,0.302608,0.259475


[I 2025-03-15 17:51:21,773] Trial 108 finished with value: 0.4207231441840733 and parameters: {'learning_rate': 0.004760313817151507, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 109 with params: {'learning_rate': 0.004725101476351509, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4887,2.119996,0.13,0.052817,0.042169,0.029348
2,1.8161,1.694712,0.436,0.066733,0.095697,0.059447
3,1.5781,1.511744,0.474,0.07279,0.113691,0.070586
4,1.4109,1.308213,0.532,0.086245,0.141748,0.098618
5,1.2574,1.221217,0.546,0.090036,0.166122,0.106573
6,1.1313,1.13655,0.584,0.122696,0.186322,0.134646
7,1.0258,1.117644,0.584,0.134381,0.198517,0.148286
8,0.884,1.107539,0.588,0.178017,0.217236,0.173236
9,0.7752,1.0804,0.612,0.235619,0.249241,0.222843
10,0.6411,1.025871,0.62,0.258652,0.273104,0.230035


[I 2025-03-15 17:53:52,804] Trial 109 finished with value: 0.40113951348101295 and parameters: {'learning_rate': 0.004725101476351509, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 110 with params: {'learning_rate': 0.000305622575325122, 'weight_decay': 0.007, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4347,2.33273,0.11,0.002619,0.02381,0.004719
2,2.2131,2.200108,0.11,0.002619,0.02381,0.004719
3,2.1187,2.158174,0.11,0.002619,0.02381,0.004719
4,2.0554,2.09432,0.11,0.002624,0.02381,0.004728
5,1.9975,2.014224,0.238,0.070549,0.043837,0.032958
6,1.9276,1.906618,0.396,0.049058,0.072371,0.053036
7,1.8543,1.859099,0.438,0.059548,0.098095,0.06532
8,1.777,1.773627,0.432,0.059161,0.094584,0.064855
9,1.7368,1.71002,0.402,0.041391,0.085272,0.054983
10,1.6742,1.67555,0.44,0.040514,0.106031,0.056076


[I 2025-03-15 17:56:09,992] Trial 110 finished with value: 0.09134652377253676 and parameters: {'learning_rate': 0.000305622575325122, 'weight_decay': 0.007, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 111 with params: {'learning_rate': 0.002519512826198917, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2288,2.10384,0.13,0.023115,0.043757,0.026222
2,1.8545,1.775096,0.42,0.040981,0.103438,0.055559
3,1.6491,1.557244,0.476,0.066329,0.116732,0.06853
4,1.475,1.47201,0.488,0.083448,0.128993,0.084573
5,1.3518,1.309944,0.536,0.083013,0.135852,0.093339
6,1.2431,1.206551,0.554,0.099926,0.147164,0.108589
7,1.1433,1.188347,0.564,0.114015,0.164825,0.124035
8,1.0175,1.181173,0.572,0.168223,0.184397,0.150217
9,0.9384,1.075544,0.598,0.180482,0.213935,0.172087
10,0.799,1.026288,0.614,0.176476,0.220246,0.177912


[I 2025-03-15 17:58:32,689] Trial 111 finished with value: 0.3750835208380376 and parameters: {'learning_rate': 0.002519512826198917, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 3.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 112 with params: {'learning_rate': 1.7024180271514193e-05, 'weight_decay': 0.005, 'adam_beta1': 0.99, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.477,2.456204,0.006,0.000143,0.02381,0.000284
2,2.4694,2.449412,0.042,0.022705,0.031602,0.011878
3,2.4627,2.443223,0.11,0.002728,0.02381,0.004895
4,2.4563,2.437356,0.11,0.002624,0.02381,0.004728
5,2.4491,2.431592,0.11,0.002619,0.02381,0.004719
6,2.4419,2.425796,0.11,0.002619,0.02381,0.004719
7,2.433,2.420063,0.11,0.002619,0.02381,0.004719
8,2.4284,2.414349,0.11,0.002619,0.02381,0.004719
9,2.4196,2.408496,0.11,0.002619,0.02381,0.004719
10,2.4164,2.402649,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 18:01:16,070] Trial 112 finished with value: 0.004719004719004719 and parameters: {'learning_rate': 1.7024180271514193e-05, 'weight_decay': 0.005, 'adam_beta1': 0.99, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 4.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 113 with params: {'learning_rate': 0.0018369813664576157, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2166,2.080679,0.176,0.032246,0.054752,0.034449
2,1.8723,1.814715,0.436,0.039738,0.097902,0.053689
3,1.6775,1.579521,0.454,0.0437,0.111533,0.060283
4,1.4913,1.462771,0.49,0.077687,0.123513,0.080588
5,1.3562,1.294997,0.544,0.087412,0.141052,0.09587
6,1.2535,1.215713,0.556,0.094101,0.144811,0.103972
7,1.1559,1.183457,0.566,0.129036,0.178894,0.126007
8,1.0284,1.159287,0.564,0.145185,0.17503,0.132625
9,0.9315,1.09,0.594,0.169431,0.204053,0.164335
10,0.7994,1.044019,0.612,0.180662,0.233919,0.184173


[I 2025-03-15 18:03:40,590] Trial 113 finished with value: 0.3501751887801353 and parameters: {'learning_rate': 0.0018369813664576157, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 114 with params: {'learning_rate': 0.0019946837994931073, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2033,2.070124,0.146,0.029709,0.045397,0.028863
2,1.8524,1.766031,0.436,0.04227,0.109417,0.059473
3,1.6572,1.558188,0.462,0.068113,0.113468,0.06153
4,1.4822,1.465653,0.48,0.076633,0.123561,0.076023
5,1.3559,1.318031,0.538,0.086371,0.136288,0.093279
6,1.2565,1.221063,0.556,0.091993,0.143115,0.102253
7,1.1616,1.213794,0.558,0.120479,0.169378,0.122364
8,1.0417,1.188204,0.564,0.127248,0.168409,0.134281
9,0.9675,1.110312,0.586,0.16986,0.203458,0.157301
10,0.8256,1.04705,0.606,0.160953,0.214293,0.167126


[I 2025-03-15 18:05:59,917] Trial 114 finished with value: 0.3535319026577318 and parameters: {'learning_rate': 0.0019946837994931073, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 115 with params: {'learning_rate': 0.004530035166590545, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3143,1.988473,0.2,0.037321,0.047537,0.024339
2,1.7695,1.689771,0.46,0.056493,0.112992,0.062118
3,1.5261,1.42746,0.498,0.075495,0.12236,0.081818
4,1.3413,1.222502,0.552,0.08795,0.147433,0.101664
5,1.1862,1.154248,0.55,0.113121,0.162598,0.110793
6,1.0502,1.095856,0.566,0.17596,0.179419,0.141201
7,0.9129,1.045616,0.616,0.219371,0.227095,0.191548
8,0.7676,0.994636,0.644,0.26605,0.28929,0.257824
9,0.651,1.02138,0.646,0.302598,0.321717,0.280167
10,0.5168,0.987869,0.632,0.278565,0.335772,0.275992


[I 2025-03-15 18:08:30,617] Trial 115 finished with value: 0.40924194287981963 and parameters: {'learning_rate': 0.004530035166590545, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 116 with params: {'learning_rate': 0.002478943730441349, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2693,2.116071,0.166,0.032197,0.039355,0.018349
2,1.869,1.773222,0.424,0.039691,0.094805,0.053289
3,1.6558,1.540841,0.484,0.063138,0.118758,0.070816
4,1.4723,1.452594,0.494,0.085256,0.134765,0.088555
5,1.3323,1.289964,0.54,0.08675,0.14013,0.097514
6,1.2269,1.201806,0.558,0.098156,0.148921,0.108892
7,1.1275,1.191063,0.56,0.12995,0.169293,0.129641
8,0.9999,1.180403,0.576,0.174693,0.192406,0.162676
9,0.9139,1.073227,0.604,0.180571,0.223494,0.181807
10,0.7696,1.012169,0.618,0.217064,0.240308,0.202266


[I 2025-03-15 18:10:02,218] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 0.003943507563474878, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2468,2.04214,0.144,0.025823,0.06096,0.031011
2,1.7836,1.679092,0.468,0.067454,0.115123,0.065696
3,1.5536,1.464986,0.504,0.075602,0.124163,0.08336
4,1.3767,1.331564,0.514,0.096277,0.141598,0.101491
5,1.2228,1.208882,0.562,0.126931,0.165754,0.120659
6,1.1075,1.145737,0.57,0.162486,0.160577,0.132905
7,1.0061,1.137507,0.596,0.165035,0.20078,0.156941
8,0.8869,1.073137,0.608,0.199233,0.240612,0.198092
9,0.7669,1.057238,0.614,0.226618,0.23825,0.203227
10,0.6375,0.99972,0.636,0.253143,0.287567,0.243933


[I 2025-03-15 18:12:21,032] Trial 117 finished with value: 0.39757610889749667 and parameters: {'learning_rate': 0.003943507563474878, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 3.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 118 with params: {'learning_rate': 2.62272221224915e-05, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4743,2.451516,0.006,0.000143,0.02381,0.000284
2,2.4621,2.441354,0.11,0.002635,0.02381,0.004745
3,2.4511,2.43118,0.11,0.002619,0.02381,0.004719
4,2.4395,2.420267,0.11,0.002619,0.02381,0.004719
5,2.4257,2.40824,0.11,0.002619,0.02381,0.004719
6,2.4105,2.394267,0.11,0.002619,0.02381,0.004719
7,2.3911,2.378288,0.11,0.002619,0.02381,0.004719
8,2.3723,2.360241,0.11,0.002619,0.02381,0.004719
9,2.3472,2.339004,0.11,0.002619,0.02381,0.004719
10,2.322,2.315853,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 18:13:59,073] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.004685443686623622, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3414,2.112648,0.126,0.025344,0.042804,0.025821
2,1.8199,1.666172,0.428,0.042864,0.10519,0.059888
3,1.584,1.46681,0.498,0.071043,0.12236,0.07793
4,1.4156,1.401282,0.496,0.093364,0.137333,0.092325
5,1.2779,1.264675,0.546,0.109339,0.164234,0.111963
6,1.1519,1.153722,0.572,0.152961,0.18446,0.134546
7,1.0458,1.132067,0.586,0.155923,0.210244,0.152428
8,0.919,1.152337,0.586,0.176885,0.230108,0.179167
9,0.828,1.030581,0.614,0.214321,0.256116,0.206697
10,0.72,1.041115,0.614,0.223606,0.262036,0.212022


[I 2025-03-15 18:15:32,781] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 0.00496096448850179, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5203,2.103842,0.132,0.005683,0.029048,0.009485
2,1.8121,1.675804,0.45,0.063024,0.110563,0.065559
3,1.555,1.480092,0.478,0.077379,0.118876,0.077462
4,1.381,1.260935,0.544,0.077933,0.140557,0.093644
5,1.2153,1.191906,0.554,0.093548,0.166749,0.109365
6,1.0795,1.10396,0.586,0.14308,0.181708,0.137575
7,0.9573,1.088051,0.586,0.149564,0.202512,0.151514
8,0.8193,1.10405,0.614,0.222895,0.249251,0.216416
9,0.7108,1.009034,0.618,0.250611,0.258827,0.228279
10,0.5778,1.011143,0.636,0.276475,0.295728,0.264545


[I 2025-03-15 18:17:13,617] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 0.004835078759852181, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7971,2.24177,0.132,0.046068,0.080736,0.046156
2,1.9615,1.864709,0.42,0.037424,0.082707,0.04821
3,1.6742,1.567568,0.478,0.061624,0.117416,0.072037
4,1.4717,1.406536,0.496,0.088383,0.132941,0.090588
5,1.3184,1.269926,0.542,0.098234,0.163927,0.112954
6,1.1784,1.170307,0.576,0.139465,0.164393,0.128783
7,1.0561,1.119218,0.594,0.171135,0.209114,0.15807
8,0.9227,1.177638,0.6,0.193954,0.224373,0.181008
9,0.8042,1.08647,0.612,0.216816,0.249976,0.215232
10,0.6805,1.013091,0.638,0.274647,0.286684,0.239171


[I 2025-03-15 18:19:29,698] Trial 121 finished with value: 0.3719443019064375 and parameters: {'learning_rate': 0.004835078759852181, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 3.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 122 with params: {'learning_rate': 0.004532988355318385, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2012,1.939092,0.216,0.020288,0.068745,0.029765
2,1.7004,1.554083,0.498,0.068241,0.122477,0.078114
3,1.4504,1.3592,0.51,0.085977,0.134725,0.093437
4,1.2513,1.162409,0.566,0.145432,0.155973,0.123026
5,1.1075,1.112144,0.58,0.159429,0.193786,0.146102
6,0.9669,1.053421,0.606,0.206642,0.230202,0.1943
7,0.826,1.014664,0.63,0.228259,0.268993,0.215948
8,0.6866,0.992378,0.65,0.298673,0.307152,0.277407
9,0.5865,0.920663,0.672,0.333794,0.351147,0.303627
10,0.4671,0.972359,0.66,0.311897,0.367023,0.31831


[I 2025-03-15 18:21:52,306] Trial 122 finished with value: 0.3994312864099461 and parameters: {'learning_rate': 0.004532988355318385, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 4.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 123 with params: {'learning_rate': 0.002206105853400066, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2708,2.130512,0.172,0.032637,0.040784,0.018995
2,1.9223,1.8552,0.426,0.037634,0.096412,0.052119
3,1.7125,1.634804,0.434,0.066668,0.106697,0.065876
4,1.5269,1.483826,0.482,0.077906,0.121693,0.076836
5,1.4076,1.349984,0.534,0.084903,0.139062,0.097368
6,1.3034,1.254264,0.548,0.094497,0.141779,0.103975
7,1.2046,1.229721,0.56,0.117855,0.162642,0.114077
8,1.0952,1.172822,0.568,0.154588,0.187511,0.138171
9,1.0123,1.140172,0.574,0.169084,0.18318,0.142657
10,0.899,1.068101,0.618,0.192321,0.232382,0.187332


[I 2025-03-15 18:24:16,778] Trial 123 finished with value: 0.35503575232309187 and parameters: {'learning_rate': 0.002206105853400066, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 2.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 124 with params: {'learning_rate': 0.0033679597191996697, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1543,1.953174,0.214,0.034641,0.06393,0.040236
2,1.7459,1.660221,0.454,0.04363,0.111294,0.060627
3,1.5356,1.425405,0.532,0.086717,0.14324,0.098934
4,1.3722,1.331406,0.536,0.104319,0.140304,0.099758
5,1.239,1.249167,0.548,0.107081,0.162364,0.117036
6,1.1163,1.159946,0.56,0.125203,0.175549,0.12853
7,0.998,1.114473,0.578,0.154907,0.185686,0.144241
8,0.8813,1.103528,0.602,0.186894,0.227192,0.188783
9,0.7828,1.019838,0.618,0.189371,0.256469,0.199435
10,0.6709,1.040375,0.626,0.239375,0.272854,0.218723


[I 2025-03-15 18:25:53,969] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 0.0011335804507821492, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3476,2.216522,0.11,0.002619,0.02381,0.004719
2,2.059,2.050546,0.116,0.030461,0.041005,0.019434
3,1.8814,1.834848,0.4,0.050745,0.098437,0.061755
4,1.7516,1.712946,0.404,0.043658,0.085835,0.057504
5,1.6493,1.654066,0.45,0.07238,0.108711,0.068223
6,1.5711,1.54648,0.482,0.073149,0.118308,0.074206
7,1.504,1.476788,0.508,0.07312,0.122002,0.081042
8,1.4256,1.420824,0.514,0.078272,0.129712,0.088597
9,1.3812,1.386937,0.538,0.076707,0.132586,0.090666
10,1.3091,1.336011,0.546,0.089725,0.141934,0.098257


[I 2025-03-15 18:28:17,593] Trial 125 finished with value: 0.15658449971141872 and parameters: {'learning_rate': 0.0011335804507821492, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 126 with params: {'learning_rate': 0.0042766591943723, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2628,1.973903,0.208,0.038622,0.058163,0.037635
2,1.7378,1.576802,0.472,0.063856,0.115953,0.070819
3,1.4817,1.427178,0.486,0.076928,0.119677,0.078608
4,1.3054,1.208459,0.544,0.082178,0.140415,0.098013
5,1.1466,1.158002,0.574,0.161443,0.174041,0.131969
6,1.014,1.07528,0.584,0.199375,0.195053,0.160021
7,0.8619,1.035129,0.616,0.200294,0.234219,0.198619
8,0.7294,1.094126,0.614,0.278732,0.250257,0.232334
9,0.6226,0.973799,0.654,0.304887,0.306489,0.273715
10,0.4901,1.022271,0.636,0.309427,0.321499,0.284876


[I 2025-03-15 18:30:37,594] Trial 126 finished with value: 0.4255484754234142 and parameters: {'learning_rate': 0.0042766591943723, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 127 with params: {'learning_rate': 0.0031554159406855828, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1427,1.946206,0.212,0.031444,0.063497,0.038386
2,1.7222,1.579516,0.456,0.043968,0.111857,0.061077
3,1.486,1.407169,0.5,0.079423,0.125805,0.084162
4,1.3123,1.230564,0.544,0.087432,0.139498,0.097901
5,1.1767,1.188064,0.552,0.097035,0.159249,0.105165
6,1.0454,1.12708,0.572,0.154662,0.169622,0.129103
7,0.8945,1.054936,0.612,0.197769,0.219658,0.182453
8,0.7649,1.116537,0.616,0.234253,0.246336,0.2192
9,0.6653,0.992489,0.632,0.239524,0.28346,0.239614
10,0.5433,1.031804,0.654,0.319948,0.326026,0.290966


[I 2025-03-15 18:33:13,506] Trial 127 finished with value: 0.39274525789450715 and parameters: {'learning_rate': 0.0031554159406855828, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 128 with params: {'learning_rate': 4.050179936036862e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4743,2.448252,0.062,0.019201,0.035931,0.01512
2,2.4558,2.432515,0.11,0.002619,0.02381,0.004719
3,2.4371,2.414812,0.11,0.002619,0.02381,0.004719
4,2.4148,2.392838,0.11,0.002619,0.02381,0.004719
5,2.3843,2.363978,0.11,0.002619,0.02381,0.004719
6,2.346,2.326279,0.11,0.002619,0.02381,0.004719
7,2.2941,2.281605,0.11,0.002619,0.02381,0.004719
8,2.2343,2.23842,0.11,0.002619,0.02381,0.004719
9,2.1775,2.204133,0.11,0.002619,0.02381,0.004719
10,2.135,2.189534,0.11,0.002619,0.02381,0.004719


[I 2025-03-15 18:34:47,524] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.0007749878024659708, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3159,2.177228,0.11,0.002619,0.02381,0.004719
2,2.0539,2.03664,0.134,0.069811,0.042747,0.032597
3,1.8723,1.777314,0.408,0.057641,0.08887,0.061595
4,1.7095,1.664725,0.44,0.041168,0.1082,0.056644
5,1.5768,1.528167,0.46,0.069198,0.112783,0.063482
6,1.49,1.442893,0.504,0.07039,0.124089,0.078636
7,1.3963,1.379807,0.534,0.076268,0.128769,0.088486
8,1.3168,1.350249,0.542,0.102491,0.136889,0.097655
9,1.2742,1.262421,0.544,0.089859,0.137535,0.099014
10,1.1782,1.215733,0.558,0.094572,0.149388,0.105955


[I 2025-03-15 18:37:06,597] Trial 129 finished with value: 0.22037660067443934 and parameters: {'learning_rate': 0.0007749878024659708, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.5, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 130 with params: {'learning_rate': 0.004176004552098378, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2746,2.048074,0.142,0.023997,0.046614,0.027661
2,1.7723,1.689293,0.458,0.060305,0.115506,0.068992
3,1.5209,1.439571,0.486,0.074464,0.119364,0.076747
4,1.3322,1.242181,0.546,0.092161,0.148059,0.105154
5,1.1654,1.156397,0.578,0.158714,0.182478,0.139805
6,1.0393,1.140602,0.576,0.181619,0.204656,0.16757
7,0.9153,1.051804,0.622,0.251358,0.243734,0.213154
8,0.761,1.041039,0.644,0.305521,0.291387,0.261841
9,0.6578,0.967294,0.642,0.275905,0.281384,0.245197
10,0.5208,1.044873,0.616,0.325522,0.34056,0.29958


[I 2025-03-15 18:39:29,658] Trial 130 finished with value: 0.4439432339788636 and parameters: {'learning_rate': 0.004176004552098378, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 131 with params: {'learning_rate': 0.0025429332650646025, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.221,2.086055,0.134,0.023202,0.044709,0.026407
2,1.8219,1.709259,0.426,0.041905,0.104757,0.057549
3,1.5927,1.493359,0.466,0.072773,0.114415,0.067903
4,1.4289,1.406121,0.49,0.083087,0.128611,0.083687
5,1.2945,1.247457,0.558,0.10273,0.142989,0.103038
6,1.1833,1.17629,0.562,0.122108,0.157631,0.117898
7,1.0706,1.140383,0.578,0.154304,0.190572,0.139671
8,0.9243,1.111136,0.584,0.180784,0.199748,0.159165
9,0.8287,1.033965,0.612,0.202384,0.243514,0.193827
10,0.6991,1.033587,0.608,0.207432,0.249299,0.203374


[I 2025-03-15 18:41:57,078] Trial 131 finished with value: 0.36762690970948186 and parameters: {'learning_rate': 0.0025429332650646025, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 132 with params: {'learning_rate': 0.0025053041557867558, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3072,2.138623,0.156,0.030751,0.04999,0.032201
2,1.9028,1.80696,0.42,0.039871,0.094135,0.053421
3,1.6712,1.577221,0.478,0.069567,0.117355,0.070382
4,1.4799,1.428133,0.514,0.076647,0.127786,0.085357
5,1.3196,1.260931,0.55,0.089769,0.139599,0.096913
6,1.1927,1.18413,0.556,0.102031,0.154047,0.11014
7,1.0752,1.132,0.572,0.175174,0.175787,0.139961
8,0.9449,1.115115,0.586,0.176962,0.195134,0.156591
9,0.853,1.014438,0.624,0.200422,0.256225,0.205783
10,0.7227,1.016625,0.622,0.210115,0.240214,0.200565


[I 2025-03-15 18:44:23,366] Trial 132 finished with value: 0.3901938345721296 and parameters: {'learning_rate': 0.0025053041557867558, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 133 with params: {'learning_rate': 0.004875087373958608, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5096,2.103526,0.132,0.029273,0.031217,0.013897
2,1.8175,1.679669,0.438,0.06991,0.107705,0.065656
3,1.5674,1.47713,0.474,0.100372,0.122876,0.08167
4,1.3999,1.300523,0.538,0.084477,0.142458,0.097073
5,1.2418,1.207499,0.556,0.10208,0.161116,0.112318
6,1.1233,1.135375,0.584,0.122243,0.16995,0.127129
7,1.0176,1.134277,0.568,0.129467,0.197237,0.137656
8,0.8693,1.089014,0.598,0.197402,0.228967,0.191602
9,0.7755,1.090071,0.62,0.26133,0.261229,0.233023
10,0.634,1.050899,0.618,0.305367,0.269924,0.245896


[I 2025-03-15 18:46:06,368] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 0.002702606363510611, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3214,2.132851,0.15,0.030036,0.048519,0.031143
2,1.8989,1.824201,0.428,0.040456,0.095758,0.05403
3,1.6708,1.590947,0.482,0.067543,0.118338,0.070755
4,1.4771,1.446743,0.512,0.081548,0.130702,0.089703
5,1.3258,1.252855,0.55,0.091372,0.137167,0.09705
6,1.1996,1.194408,0.558,0.092167,0.145498,0.104658
7,1.0964,1.16694,0.568,0.136311,0.178117,0.132641
8,0.9668,1.13265,0.592,0.196778,0.206929,0.17071
9,0.8695,1.051046,0.622,0.186344,0.244536,0.196437
10,0.7431,1.021597,0.62,0.213177,0.250802,0.200977


[I 2025-03-15 18:48:36,750] Trial 134 finished with value: 0.37728202642848113 and parameters: {'learning_rate': 0.002702606363510611, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 135 with params: {'learning_rate': 0.00042391934005069233, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4235,2.279583,0.11,0.002619,0.02381,0.004719
2,2.1649,2.218128,0.11,0.002619,0.02381,0.004719
3,2.0789,2.106572,0.11,0.002624,0.02381,0.004728
4,2.0036,2.01029,0.262,0.062461,0.051063,0.036912
5,1.8879,1.881655,0.42,0.041898,0.080255,0.053524
6,1.8144,1.790083,0.444,0.0604,0.097441,0.066215
7,1.7282,1.708847,0.432,0.046434,0.103843,0.063448
8,1.6406,1.6482,0.438,0.041109,0.105554,0.056233
9,1.5985,1.560066,0.456,0.048379,0.109557,0.064123
10,1.5275,1.53044,0.462,0.072397,0.11372,0.067699


[I 2025-03-15 18:51:02,013] Trial 135 finished with value: 0.11061487187711876 and parameters: {'learning_rate': 0.00042391934005069233, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 6.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 136 with params: {'learning_rate': 0.004532221972457291, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3153,1.998914,0.194,0.035346,0.046022,0.022417
2,1.7786,1.70465,0.446,0.044515,0.109628,0.0602
3,1.5398,1.466835,0.484,0.100595,0.122398,0.087178
4,1.363,1.254659,0.544,0.075742,0.139526,0.092505
5,1.2115,1.174197,0.556,0.11255,0.167712,0.11034
6,1.0808,1.115906,0.56,0.121163,0.158551,0.117806
7,0.9612,1.083565,0.594,0.180338,0.203695,0.162151
8,0.8168,1.040386,0.61,0.20414,0.237022,0.204227
9,0.7026,1.022161,0.632,0.262893,0.292947,0.249664
10,0.5792,1.044777,0.608,0.286195,0.326495,0.271981


[I 2025-03-15 18:53:27,210] Trial 136 finished with value: 0.3915336898324196 and parameters: {'learning_rate': 0.004532221972457291, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 137 with params: {'learning_rate': 0.003388892839318184, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2072,1.966952,0.194,0.028901,0.059038,0.034725
2,1.7722,1.686899,0.434,0.041725,0.107858,0.056991
3,1.5551,1.450212,0.51,0.076232,0.131549,0.087111
4,1.3901,1.397261,0.498,0.099354,0.13772,0.096263
5,1.2606,1.262616,0.546,0.101592,0.151837,0.112809
6,1.1409,1.172151,0.556,0.154331,0.164056,0.120223
7,1.037,1.113096,0.592,0.167493,0.195799,0.159897
8,0.9091,1.117225,0.616,0.22059,0.241239,0.208998
9,0.8053,1.029916,0.63,0.239968,0.276588,0.225082
10,0.6899,1.018887,0.64,0.282173,0.297146,0.2437


[I 2025-03-15 18:55:47,225] Trial 137 finished with value: 0.368964265360413 and parameters: {'learning_rate': 0.003388892839318184, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 4.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 138 with params: {'learning_rate': 0.00030866305877428067, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.98, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4412,2.347044,0.11,0.002619,0.02381,0.004719
2,2.2326,2.189593,0.11,0.002619,0.02381,0.004719
3,2.1313,2.192334,0.11,0.002619,0.02381,0.004719
4,2.075,2.115585,0.11,0.002624,0.02381,0.004728
5,2.0265,2.065458,0.124,0.056249,0.044232,0.032784
6,1.9776,1.989777,0.332,0.079154,0.064646,0.055154
7,1.9068,1.900497,0.412,0.049118,0.076181,0.056022
8,1.839,1.852772,0.434,0.064769,0.095103,0.070641
9,1.8041,1.808143,0.434,0.046674,0.097229,0.05952
10,1.7476,1.739722,0.436,0.060013,0.095536,0.06562


[I 2025-03-15 18:58:08,232] Trial 138 finished with value: 0.08209009260945177 and parameters: {'learning_rate': 0.00030866305877428067, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.98, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 2.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 139 with params: {'learning_rate': 0.0022164020204279602, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2721,2.13307,0.15,0.03117,0.035546,0.01677
2,1.9076,1.822292,0.422,0.040298,0.094612,0.053616
3,1.6855,1.581233,0.462,0.059742,0.11332,0.0636
4,1.4915,1.457718,0.486,0.075927,0.120833,0.078006
5,1.3489,1.294762,0.544,0.100063,0.138913,0.097833
6,1.2434,1.208849,0.554,0.087936,0.141781,0.100368
7,1.1409,1.191575,0.562,0.136501,0.167269,0.128736
8,1.0094,1.165105,0.57,0.175644,0.174068,0.142602
9,0.9156,1.085549,0.588,0.169044,0.203737,0.15956
10,0.79,1.015629,0.614,0.190416,0.239725,0.194884


[I 2025-03-15 19:00:43,917] Trial 139 finished with value: 0.36210969753696703 and parameters: {'learning_rate': 0.0022164020204279602, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 140 with params: {'learning_rate': 0.004369420290254315, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1993,1.939055,0.314,0.039296,0.077056,0.041801
2,1.7004,1.551507,0.486,0.066084,0.116749,0.073876
3,1.4496,1.355238,0.51,0.090825,0.134773,0.096221
4,1.2507,1.173711,0.558,0.110483,0.151568,0.1113
5,1.0974,1.103307,0.586,0.153982,0.183646,0.147578
6,0.9517,1.080665,0.606,0.206792,0.21495,0.189382
7,0.799,1.00223,0.628,0.239166,0.253502,0.222116
8,0.6394,0.966682,0.662,0.325103,0.338045,0.296189
9,0.5313,0.956668,0.67,0.311764,0.326808,0.2954
10,0.4178,1.011524,0.65,0.337505,0.344006,0.309772


[I 2025-03-15 19:03:10,136] Trial 140 finished with value: 0.4110262662884006 and parameters: {'learning_rate': 0.004369420290254315, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 141 with params: {'learning_rate': 0.004341147285777764, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2765,1.99659,0.2,0.035526,0.053959,0.032497
2,1.7796,1.715007,0.448,0.044364,0.110105,0.060781
3,1.5498,1.482951,0.482,0.073833,0.118369,0.076332
4,1.3806,1.3026,0.544,0.08893,0.144524,0.099677
5,1.234,1.200649,0.556,0.107669,0.160188,0.113156
6,1.1068,1.139758,0.564,0.121343,0.156795,0.117321
7,0.9867,1.11096,0.594,0.153487,0.195939,0.149015
8,0.8555,1.07439,0.606,0.209777,0.243874,0.200439
9,0.7459,1.035622,0.62,0.284133,0.278526,0.245579
10,0.6223,0.973502,0.65,0.290969,0.289454,0.270755


[I 2025-03-15 19:05:34,330] Trial 141 finished with value: 0.3897829450827757 and parameters: {'learning_rate': 0.004341147285777764, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 142 with params: {'learning_rate': 0.0027922195694078597, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1651,2.000854,0.194,0.028578,0.059038,0.034382
2,1.7806,1.728968,0.42,0.041547,0.106977,0.055473
3,1.5739,1.481029,0.502,0.068074,0.120564,0.077984
4,1.4205,1.396184,0.512,0.08502,0.136434,0.093596
5,1.2896,1.234925,0.546,0.090647,0.137656,0.100011
6,1.1831,1.182878,0.564,0.094345,0.154439,0.109798
7,1.0843,1.162659,0.564,0.124972,0.174004,0.131278
8,0.9584,1.134281,0.58,0.156859,0.201421,0.153633
9,0.8655,1.07676,0.592,0.165974,0.215711,0.168846
10,0.7432,0.996214,0.632,0.215102,0.261247,0.214467


[I 2025-03-15 19:08:19,149] Trial 142 finished with value: 0.35993047914421505 and parameters: {'learning_rate': 0.0027922195694078597, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 3.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 143 with params: {'learning_rate': 0.0004921914731583679, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4098,2.227547,0.11,0.002619,0.02381,0.004719
2,2.1252,2.137907,0.11,0.002619,0.02381,0.004719
3,2.0206,2.008467,0.248,0.039633,0.046974,0.033856
4,1.8835,1.854271,0.428,0.034377,0.084286,0.046304
5,1.7506,1.699483,0.438,0.041704,0.105272,0.058579
6,1.658,1.618865,0.46,0.068235,0.112992,0.06171
7,1.5768,1.540407,0.452,0.067814,0.108679,0.060764
8,1.4958,1.53402,0.472,0.075367,0.114127,0.074123
9,1.4623,1.440102,0.49,0.077155,0.120617,0.081608
10,1.3745,1.391753,0.522,0.074144,0.125801,0.084371


[I 2025-03-15 19:09:50,984] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 0.004873468483144, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5126,2.096846,0.132,0.029346,0.031217,0.014013
2,1.806,1.660202,0.44,0.060477,0.108151,0.063683
3,1.5429,1.470734,0.48,0.104727,0.122784,0.08206
4,1.3644,1.257363,0.546,0.078196,0.143594,0.095614
5,1.1974,1.160984,0.568,0.113094,0.176505,0.12242
6,1.0633,1.120394,0.588,0.178768,0.203278,0.160588
7,0.9202,1.068595,0.596,0.184363,0.213492,0.170466
8,0.7914,1.061429,0.616,0.233673,0.27151,0.230834
9,0.6951,0.996002,0.618,0.264155,0.281425,0.246221
10,0.565,1.02393,0.634,0.272517,0.295571,0.255555


[I 2025-03-15 19:12:12,915] Trial 144 finished with value: 0.3579566504455861 and parameters: {'learning_rate': 0.004873468483144, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 145 with params: {'learning_rate': 0.002222710730315421, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.218,2.0968,0.146,0.029539,0.047566,0.030379
2,1.8699,1.802536,0.422,0.042343,0.105997,0.057392
3,1.6759,1.592951,0.452,0.043483,0.1111,0.060056
4,1.4903,1.4702,0.482,0.087506,0.130501,0.084616
5,1.3749,1.359233,0.526,0.084316,0.127375,0.09111
6,1.2682,1.22671,0.56,0.102565,0.159326,0.113556
7,1.1633,1.197245,0.562,0.11185,0.174666,0.119829
8,1.0494,1.161426,0.578,0.165719,0.194179,0.147915
9,0.9708,1.080065,0.598,0.163152,0.209624,0.155513
10,0.8345,1.040877,0.604,0.156926,0.208782,0.161934


[I 2025-03-15 19:13:47,239] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.003924231996584426, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2098,1.974117,0.204,0.037991,0.052872,0.032011
2,1.7592,1.668952,0.456,0.067942,0.111757,0.062525
3,1.537,1.456571,0.508,0.073921,0.122337,0.082301
4,1.3696,1.297781,0.544,0.086419,0.141426,0.098105
5,1.2313,1.206574,0.554,0.110111,0.153263,0.117313
6,1.1104,1.153023,0.576,0.151685,0.166854,0.126712
7,1.0026,1.097895,0.594,0.156858,0.204964,0.151452
8,0.8632,1.060943,0.612,0.208446,0.253941,0.201386
9,0.7513,1.00776,0.634,0.234824,0.273257,0.229308
10,0.6323,0.980388,0.646,0.320595,0.300523,0.274057


[I 2025-03-15 19:16:09,207] Trial 146 finished with value: 0.4245076665039607 and parameters: {'learning_rate': 0.003924231996584426, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}. Best is trial 105 with value: 0.48810144914827314.


Trial 147 with params: {'learning_rate': 0.0013506151458172716, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2627,2.123855,0.132,0.029267,0.031217,0.013886
2,1.9932,1.940294,0.43,0.046874,0.108037,0.061409
3,1.79,1.731363,0.398,0.051225,0.095791,0.062897
4,1.6301,1.608374,0.456,0.047468,0.111727,0.062706
5,1.522,1.48014,0.488,0.071175,0.117155,0.07504
6,1.4279,1.390546,0.522,0.072857,0.128722,0.085645
7,1.345,1.330661,0.538,0.095002,0.146845,0.104745
8,1.2491,1.257556,0.552,0.101654,0.140239,0.105309
9,1.1966,1.20203,0.558,0.092032,0.146907,0.104184
10,1.1158,1.192294,0.558,0.113238,0.175671,0.118577


[I 2025-03-15 19:18:48,995] Trial 147 finished with value: 0.2683578976462126 and parameters: {'learning_rate': 0.0013506151458172716, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 3.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 148 with params: {'learning_rate': 0.0037447250623325926, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1888,1.974345,0.208,0.037911,0.058163,0.037058
2,1.7674,1.715767,0.434,0.042853,0.106771,0.058196
3,1.5562,1.455606,0.504,0.068144,0.121118,0.0803
4,1.3928,1.368202,0.526,0.083926,0.138454,0.094978
5,1.2619,1.216632,0.546,0.094375,0.138558,0.10361
6,1.1503,1.166842,0.56,0.098183,0.14624,0.109485
7,1.0398,1.115329,0.588,0.14755,0.194015,0.141694
8,0.9064,1.129668,0.596,0.177858,0.218673,0.174834
9,0.806,1.031921,0.616,0.212281,0.243341,0.198596
10,0.6919,1.024493,0.64,0.266612,0.282239,0.253783


[I 2025-03-15 19:21:17,358] Trial 148 finished with value: 0.4029932102759337 and parameters: {'learning_rate': 0.0037447250623325926, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 105 with value: 0.48810144914827314.


Trial 149 with params: {'learning_rate': 0.0012829186602505733, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3103,2.141548,0.112,0.003442,0.024286,0.005615
2,2.0101,1.937922,0.432,0.055282,0.105969,0.071569
3,1.7928,1.713169,0.398,0.048612,0.084406,0.059685
4,1.6267,1.579298,0.456,0.044522,0.111727,0.061059
5,1.4987,1.458768,0.484,0.074858,0.122202,0.076148
6,1.3928,1.368903,0.542,0.10915,0.143916,0.102423
7,1.3032,1.292126,0.548,0.089531,0.136583,0.096958
8,1.2083,1.262442,0.55,0.132789,0.143772,0.110813
9,1.1427,1.189052,0.558,0.106134,0.154078,0.113351
10,1.0445,1.155826,0.572,0.126878,0.173084,0.128191


[I 2025-03-15 19:23:39,330] Trial 149 finished with value: 0.28782323812439825 and parameters: {'learning_rate': 0.0012829186602505733, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 7.0}. Best is trial 105 with value: 0.48810144914827314.


In [56]:
print(best_trial2)

BestRun(run_id='105', objective=0.48810144914827314, hyperparameters={'learning_rate': 0.004107822923895355, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}, run_summary=None)


In [57]:
base.reset_seed()

In [58]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base_fine_aug_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-base_fine_aug_hp-search", epochs=num_epochs, batch_size=batch_size)

In [59]:
#Nápočet epoch na steps
data_length = len(all_train_data)
min_r = math.ceil(data_length/batch_size)*5
max_r = math.ceil(data_length/batch_size)*num_epochs
warm_up = math.ceil(data_length/batch_size/10)

In [60]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "adam_beta1" : trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [61]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [62]:
trainer = Trainer(
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [63]:
best_trial3 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Test-base-aug",
    n_trials=150
)

[I 2025-03-15 19:23:39,682] A new study created in memory with name: Test-base-aug


Trial 0 with params: {'learning_rate': 0.0001025350969016849, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4992,1.884102,0.506874,0.125877,0.140456,0.11763
2,1.4844,1.439945,0.635197,0.289269,0.251349,0.242625
3,1.0609,1.232606,0.681027,0.314723,0.328485,0.314586
4,0.7976,1.142544,0.700275,0.395861,0.369578,0.3714
5,0.6196,1.083841,0.71494,0.440765,0.426609,0.425328


[I 2025-03-15 19:25:16,160] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 2.6368755339723032e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 46}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0481,2.638261,0.355637,0.034507,0.072683,0.045307
2,2.3592,2.292239,0.422548,0.063726,0.09541,0.067939
3,2.0691,2.072689,0.471127,0.111152,0.120472,0.095796
4,1.8658,1.911924,0.516957,0.140635,0.145014,0.122725
5,1.707,1.789347,0.541705,0.159589,0.166541,0.148413


[I 2025-03-15 19:26:53,086] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 0.00041917115166952007, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 52}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5786,1.101706,0.71494,0.425292,0.371589,0.379576
2,0.4612,0.988831,0.769019,0.631578,0.55305,0.571192
3,0.1804,1.14708,0.76352,0.686243,0.624921,0.641111
4,0.0848,1.161893,0.778185,0.680135,0.634536,0.63948
5,0.0481,1.34866,0.775435,0.684973,0.641515,0.651309
6,0.0305,1.410541,0.780935,0.70406,0.657365,0.662937
7,0.0205,1.519354,0.782768,0.678554,0.645503,0.647684
8,0.0157,1.525653,0.791017,0.715259,0.645863,0.663755
9,0.0102,1.501509,0.791934,0.699259,0.669874,0.671839
10,0.0082,1.550093,0.789184,0.691955,0.661268,0.661946


[I 2025-03-15 19:30:05,110] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8568,0.986994,0.790101,0.69218,0.615523,0.637568
2,0.0868,1.246358,0.788268,0.723372,0.646146,0.663349
3,0.0299,1.331934,0.813016,0.767106,0.691096,0.715826
4,0.019,1.312798,0.823098,0.774475,0.686327,0.71251
5,0.0116,1.507873,0.816682,0.805369,0.698977,0.731719
6,0.0114,1.475876,0.805683,0.764386,0.697791,0.714649
7,0.0108,1.533732,0.800183,0.759397,0.693,0.702873
8,0.0097,1.504759,0.815765,0.746422,0.69693,0.703495
9,0.0042,1.665742,0.816682,0.754454,0.69573,0.713157
10,0.0062,1.651262,0.811182,0.76898,0.706493,0.722645


[I 2025-03-15 19:33:11,783] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 6.62431060594998e-05, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.629,2.124491,0.461962,0.102411,0.11208,0.087604
2,1.7722,1.717896,0.563703,0.206043,0.185575,0.171175
3,1.4141,1.493856,0.622365,0.242062,0.242474,0.22725
4,1.1621,1.343816,0.651696,0.289858,0.281511,0.273125
5,0.9724,1.235602,0.678277,0.396224,0.33765,0.344293
6,0.8219,1.170734,0.690192,0.43942,0.375167,0.381762
7,0.7013,1.150532,0.704858,0.438992,0.414845,0.415028
8,0.6066,1.117547,0.709441,0.471158,0.416585,0.425044
9,0.5318,1.108037,0.718607,0.50986,0.457022,0.470434
10,0.4643,1.109867,0.716774,0.507685,0.449542,0.464042


[I 2025-03-15 19:39:10,017] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.0004480975918214954, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5197,1.076513,0.72044,0.441602,0.399247,0.407107
2,0.4338,1.03045,0.768103,0.634486,0.571543,0.579196
3,0.1651,1.134288,0.769936,0.661347,0.626542,0.628833
4,0.0773,1.233686,0.769936,0.630678,0.624605,0.615264
5,0.0427,1.438241,0.781852,0.674346,0.643689,0.647306
6,0.0275,1.426953,0.772686,0.677533,0.619508,0.629721
7,0.0183,1.47417,0.786434,0.677213,0.663408,0.658211
8,0.0134,1.508163,0.790101,0.721455,0.645841,0.66388
9,0.0096,1.598658,0.784601,0.663359,0.661038,0.654312
10,0.0079,1.622137,0.782768,0.705896,0.662939,0.66945


[I 2025-03-15 19:45:17,251] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1146,1.55114,0.595784,0.203364,0.208265,0.191474
2,1.0758,1.168438,0.703025,0.3993,0.383352,0.381546
3,0.6686,1.051921,0.718607,0.472804,0.426128,0.437351
4,0.4463,1.042467,0.736022,0.552579,0.479277,0.502231
5,0.304,1.071211,0.752521,0.637658,0.557609,0.580482
6,0.2043,1.122528,0.752521,0.625643,0.568229,0.577756
7,0.1389,1.211239,0.754354,0.666879,0.599162,0.614189
8,0.1024,1.215625,0.756187,0.632468,0.592124,0.592909
9,0.0746,1.305439,0.762603,0.671629,0.606185,0.62274
10,0.0549,1.380391,0.757104,0.619191,0.589712,0.590742


[I 2025-03-15 19:48:24,077] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 0.00039710847107924746, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6688,1.151784,0.700275,0.412015,0.358869,0.367435
2,0.5409,1.044564,0.750687,0.613053,0.550692,0.565146
3,0.2249,1.162249,0.769936,0.694636,0.612098,0.635063
4,0.1052,1.23476,0.773602,0.699125,0.606215,0.633745
5,0.0574,1.319118,0.777269,0.646434,0.613615,0.614815
6,0.0355,1.428792,0.788268,0.690475,0.644279,0.656081
7,0.022,1.478387,0.781852,0.654033,0.652182,0.643364
8,0.0145,1.691224,0.778185,0.66736,0.634005,0.636541
9,0.0109,1.525586,0.791934,0.704279,0.655111,0.665667
10,0.0092,1.561903,0.791934,0.681002,0.658998,0.661808


[I 2025-03-15 19:51:36,626] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.498208643215546e-05, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4221,3.087399,0.179652,0.018551,0.020822,0.00759
2,2.8017,2.663558,0.334555,0.033626,0.066149,0.040857
3,2.4471,2.421661,0.396884,0.060104,0.08543,0.05518
4,2.2371,2.256413,0.419798,0.064762,0.093885,0.066908
5,2.0825,2.13533,0.456462,0.099134,0.109431,0.084094


[I 2025-03-15 19:53:14,412] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 6.639623079859457e-05, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6716,2.129006,0.460128,0.09151,0.111237,0.086336
2,1.7699,1.706332,0.565536,0.17476,0.181035,0.164876
3,1.407,1.476548,0.622365,0.246618,0.240883,0.227831
4,1.1563,1.332251,0.659028,0.290515,0.280458,0.27529
5,0.9627,1.220727,0.671861,0.370436,0.322447,0.32297


[I 2025-03-15 19:54:49,754] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.00012001988398838816, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2855,1.755229,0.551787,0.158485,0.168325,0.14784
2,1.3467,1.341579,0.656279,0.29689,0.282175,0.277177
3,0.943,1.159753,0.698442,0.416334,0.367174,0.371231
4,0.6918,1.088432,0.713107,0.477462,0.402311,0.422856
5,0.5224,1.039587,0.732356,0.547346,0.464423,0.486082
6,0.3982,1.035225,0.735105,0.566904,0.48705,0.511564
7,0.3035,1.066374,0.737855,0.583186,0.515349,0.529866
8,0.2341,1.066881,0.747938,0.617547,0.535072,0.557843
9,0.1807,1.116706,0.750687,0.632912,0.559866,0.57835
10,0.144,1.204198,0.743355,0.64894,0.555617,0.575371


[I 2025-03-15 19:57:54,465] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.0001577858185676611, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1802,1.596778,0.595784,0.210435,0.202743,0.18779
2,1.1593,1.204811,0.689276,0.342652,0.344585,0.3348
3,0.7453,1.069643,0.713107,0.471705,0.421608,0.435668
4,0.5114,1.050808,0.730522,0.559429,0.479058,0.499328
5,0.3584,1.056882,0.737855,0.581313,0.510628,0.532208
6,0.25,1.101886,0.743355,0.632731,0.54534,0.57083
7,0.1757,1.125367,0.759853,0.71994,0.60604,0.632019
8,0.1271,1.195197,0.748854,0.634825,0.580808,0.592296
9,0.0957,1.208141,0.757104,0.655902,0.615574,0.614459
10,0.0712,1.335063,0.761687,0.657324,0.618996,0.621486


[I 2025-03-15 20:00:55,163] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 5.6354797084228695e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 6}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6689,2.214614,0.43813,0.083512,0.101907,0.076222
2,1.8789,1.824503,0.529789,0.146371,0.157909,0.138816
3,1.5429,1.596002,0.588451,0.235376,0.209644,0.197652
4,1.3011,1.433142,0.63428,0.265378,0.255564,0.244868
5,1.1151,1.331347,0.653529,0.320846,0.287655,0.286233


[I 2025-03-15 20:02:22,940] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.0005277845444967641, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4939,1.062658,0.731439,0.463735,0.441405,0.444542
2,0.3687,1.036531,0.761687,0.650057,0.592834,0.593822
3,0.1316,1.203778,0.770852,0.656392,0.612888,0.613988
4,0.0622,1.254592,0.784601,0.670294,0.645674,0.64283
5,0.0342,1.377296,0.782768,0.652858,0.643755,0.631296
6,0.0228,1.423211,0.783685,0.697338,0.649854,0.658259
7,0.0147,1.504442,0.781852,0.641777,0.616031,0.61524
8,0.0093,1.564032,0.796517,0.657737,0.653448,0.647735
9,0.008,1.614101,0.797434,0.666891,0.659763,0.650428
10,0.0084,1.627864,0.791017,0.67835,0.657982,0.658032


[I 2025-03-15 20:05:29,802] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.0019052703466447157, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9132,1.012673,0.790101,0.68078,0.608498,0.622634
2,0.0807,1.110809,0.810266,0.804952,0.719584,0.740769
3,0.0274,1.246899,0.821265,0.753983,0.714391,0.722031
4,0.0193,1.283373,0.824931,0.770559,0.739357,0.743957
5,0.0124,1.305595,0.820348,0.772087,0.721852,0.73266
6,0.0112,1.426205,0.831347,0.803869,0.709946,0.736156
7,0.0093,1.346563,0.825848,0.806101,0.70231,0.734224
8,0.0081,1.59196,0.810266,0.789469,0.699,0.727901
9,0.0108,1.527594,0.813016,0.744825,0.693257,0.704853
10,0.0074,1.55255,0.821265,0.787784,0.721066,0.735902


[I 2025-03-15 20:11:39,762] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.002282444566080665, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8327,1.057617,0.799267,0.780303,0.678468,0.708825
2,0.0695,1.093814,0.823098,0.829381,0.735477,0.761053
3,0.0267,1.228717,0.812099,0.745827,0.69845,0.707712
4,0.0192,1.366989,0.816682,0.808914,0.697511,0.723714
5,0.0137,1.383078,0.831347,0.803801,0.721837,0.742318
6,0.0133,1.460906,0.817599,0.754017,0.698342,0.710456
7,0.0112,1.620257,0.814849,0.760905,0.71136,0.719097
8,0.0054,1.574133,0.823098,0.779097,0.71933,0.734623
9,0.0091,1.553874,0.821265,0.789108,0.723881,0.737116
10,0.0077,1.826478,0.817599,0.765554,0.674763,0.699886


[I 2025-03-15 20:17:55,723] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.0026517326544565163, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 48}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.791,1.007378,0.809349,0.75414,0.672891,0.699244
2,0.0631,1.18653,0.810266,0.780978,0.696918,0.715307
3,0.0252,1.264154,0.820348,0.757269,0.715894,0.722745
4,0.0199,1.373747,0.811182,0.796786,0.702152,0.732394
5,0.0152,1.323756,0.828598,0.784117,0.743532,0.748688
6,0.0116,1.441609,0.814849,0.775559,0.745573,0.742794
7,0.0098,1.515197,0.823098,0.772567,0.708274,0.719797
8,0.0104,1.709252,0.813016,0.750725,0.689414,0.704098
9,0.0103,1.558639,0.821265,0.764509,0.727443,0.729216
10,0.0075,1.661653,0.816682,0.722017,0.697125,0.699172


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-15 20:30:38,338] Trial 16 finished with value: 0.7640432955491052 and parameters: {'learning_rate': 0.0026517326544565163, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 48}. Best is trial 16 with value: 0.7640432955491052.


Trial 17 with params: {'learning_rate': 0.0009835686234471472, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 52}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.272,0.999355,0.759853,0.585154,0.51414,0.529349
2,0.1796,1.144698,0.791017,0.683892,0.667826,0.656014
3,0.0576,1.272156,0.806599,0.708214,0.67279,0.671042
4,0.0259,1.275195,0.822181,0.755774,0.692453,0.702852
5,0.0143,1.291393,0.814849,0.724677,0.688896,0.690883
6,0.0104,1.452839,0.805683,0.761597,0.697001,0.711395
7,0.0098,1.452394,0.808433,0.737165,0.683818,0.694505
8,0.007,1.485364,0.826764,0.741322,0.710384,0.713627
9,0.009,1.446637,0.802933,0.712982,0.713533,0.699078
10,0.0043,1.609268,0.818515,0.760211,0.717146,0.72357


[I 2025-03-15 20:37:03,831] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.004148055707053298, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6842,1.061624,0.805683,0.784956,0.695757,0.718757
2,0.0535,1.202277,0.813016,0.759122,0.727784,0.728882
3,0.0309,1.363923,0.823098,0.754411,0.717654,0.72132
4,0.0237,1.344071,0.816682,0.802184,0.740349,0.754364
5,0.022,1.523276,0.819432,0.776991,0.737159,0.74401
6,0.0185,1.563051,0.824015,0.769891,0.740369,0.738631
7,0.017,1.62452,0.822181,0.804483,0.743962,0.762223
8,0.0169,1.764948,0.828598,0.789932,0.734238,0.751908
9,0.0135,1.859242,0.822181,0.757463,0.723578,0.729401
10,0.0118,1.92778,0.829514,0.794916,0.726727,0.746182


[I 2025-03-15 20:46:05,166] Trial 18 finished with value: 0.7570139588304845 and parameters: {'learning_rate': 0.004148055707053298, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 39}. Best is trial 16 with value: 0.7640432955491052.


Trial 19 with params: {'learning_rate': 0.0027574086347731926, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7832,1.04654,0.804766,0.747704,0.682769,0.702195
2,0.0636,1.192867,0.818515,0.738199,0.681735,0.694527
3,0.0259,1.313827,0.814849,0.732366,0.70688,0.701437
4,0.0165,1.3505,0.823098,0.755536,0.738184,0.72888
5,0.0164,1.45572,0.816682,0.760515,0.694145,0.711311
6,0.0138,1.537363,0.822181,0.773838,0.721079,0.72738
7,0.0127,1.588421,0.827681,0.771545,0.712905,0.726134
8,0.0082,1.858979,0.817599,0.765643,0.707898,0.720238
9,0.0097,1.709217,0.824931,0.770362,0.724142,0.732219
10,0.0092,1.728205,0.813016,0.708318,0.728151,0.704537


[I 2025-03-15 20:55:24,143] Trial 19 finished with value: 0.7553984482350572 and parameters: {'learning_rate': 0.0027574086347731926, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 40}. Best is trial 16 with value: 0.7640432955491052.


Trial 20 with params: {'learning_rate': 0.0004681365777131873, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 45}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5029,1.045107,0.71769,0.456634,0.399164,0.411804
2,0.3999,0.999851,0.774519,0.635545,0.58474,0.595609
3,0.1488,1.157432,0.777269,0.718328,0.657719,0.67144
4,0.0719,1.180425,0.789184,0.733086,0.652192,0.671881
5,0.0417,1.440425,0.775435,0.732028,0.644073,0.671872
6,0.0234,1.471969,0.780018,0.677144,0.637059,0.641064
7,0.0168,1.396283,0.790101,0.707311,0.698902,0.689345
8,0.0118,1.565649,0.7956,0.739282,0.668598,0.684608
9,0.0102,1.617985,0.790101,0.698159,0.675692,0.675555
10,0.0083,1.644079,0.800183,0.708069,0.656712,0.669689


[I 2025-03-15 21:01:43,557] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.004072875160933785, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 45}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7082,1.045223,0.819432,0.769793,0.697114,0.717021
2,0.0547,1.173891,0.812099,0.786587,0.73395,0.739266
3,0.0291,1.36447,0.812099,0.749216,0.67982,0.696724
4,0.0274,1.433126,0.805683,0.787661,0.691153,0.720658
5,0.0206,1.5234,0.820348,0.787843,0.718508,0.732199
6,0.0188,1.728856,0.813932,0.808681,0.718407,0.739731
7,0.0163,1.754771,0.809349,0.76533,0.699955,0.715978
8,0.0118,1.786551,0.814849,0.772921,0.715174,0.734012
9,0.0152,1.915755,0.819432,0.781454,0.748063,0.75143
10,0.0112,1.843311,0.824931,0.769753,0.746176,0.744274


[I 2025-03-15 21:10:32,685] Trial 21 finished with value: 0.7401382531541099 and parameters: {'learning_rate': 0.004072875160933785, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 45}. Best is trial 16 with value: 0.7640432955491052.


Trial 22 with params: {'learning_rate': 0.0026826241523527678, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7698,1.034248,0.802016,0.741358,0.69312,0.70604
2,0.0639,1.094041,0.806599,0.721622,0.690288,0.691622
3,0.027,1.346802,0.802933,0.684025,0.645604,0.653235
4,0.0173,1.339012,0.811182,0.754872,0.698792,0.715138
5,0.0137,1.4191,0.813016,0.777377,0.706254,0.718856
6,0.0169,1.446039,0.820348,0.769922,0.729916,0.733638
7,0.0133,1.566231,0.819432,0.790885,0.723778,0.740113
8,0.0084,1.574579,0.826764,0.761156,0.7369,0.740145
9,0.0067,1.680099,0.813016,0.778266,0.707498,0.72931
10,0.008,1.655134,0.824931,0.794757,0.743213,0.754983


[I 2025-03-15 21:19:18,799] Trial 22 finished with value: 0.777016449838325 and parameters: {'learning_rate': 0.0026826241523527678, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 27}. Best is trial 22 with value: 0.777016449838325.


Trial 23 with params: {'learning_rate': 0.004929110668195983, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.628,1.110354,0.804766,0.763381,0.697134,0.70651
2,0.059,1.115047,0.825848,0.76939,0.721851,0.731122
3,0.0316,1.353551,0.812099,0.754777,0.690734,0.704426
4,0.0291,1.424911,0.819432,0.797625,0.734132,0.749212
5,0.0301,1.524509,0.805683,0.775746,0.699537,0.715006
6,0.0253,1.902663,0.821265,0.78944,0.688746,0.719368
7,0.019,1.967885,0.815765,0.795054,0.696126,0.721636
8,0.0141,1.788918,0.816682,0.801289,0.713496,0.736659
9,0.0144,1.86837,0.813932,0.768495,0.712756,0.721973
10,0.015,2.345291,0.807516,0.779397,0.67926,0.709704


[I 2025-03-15 21:28:12,737] Trial 23 finished with value: 0.7354118091690418 and parameters: {'learning_rate': 0.004929110668195983, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 28}. Best is trial 22 with value: 0.777016449838325.


Trial 24 with params: {'learning_rate': 0.0014543824808659505, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0061,1.03908,0.771769,0.644998,0.576167,0.591052
2,0.1096,1.133924,0.805683,0.763458,0.697449,0.715707
3,0.0346,1.279101,0.802016,0.764735,0.67738,0.69694
4,0.0182,1.335911,0.818515,0.729005,0.700769,0.695888
5,0.0129,1.525097,0.810266,0.744691,0.688439,0.70433
6,0.0114,1.482923,0.807516,0.72502,0.680112,0.688726
7,0.0067,1.622035,0.814849,0.793153,0.710586,0.728801
8,0.0094,1.529851,0.811182,0.786379,0.704641,0.7297
9,0.0061,1.508115,0.819432,0.769967,0.725587,0.734615
10,0.0078,1.631811,0.825848,0.799477,0.721936,0.745904


[I 2025-03-15 21:37:31,279] Trial 24 finished with value: 0.7295166384175511 and parameters: {'learning_rate': 0.0014543824808659505, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 26}. Best is trial 22 with value: 0.777016449838325.


Trial 25 with params: {'learning_rate': 0.0033803188220711904, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.704,1.027008,0.806599,0.783106,0.688816,0.716402
2,0.058,1.15612,0.807516,0.762471,0.701634,0.717427
3,0.0282,1.323453,0.802933,0.774115,0.68769,0.71283
4,0.0179,1.292073,0.805683,0.716664,0.703094,0.695278
5,0.0174,1.474596,0.813016,0.711004,0.692018,0.692722
6,0.0164,1.59138,0.8011,0.733868,0.688492,0.690511
7,0.0132,1.696412,0.814849,0.755137,0.702203,0.716074
8,0.0123,1.630168,0.818515,0.737688,0.711664,0.713128
9,0.0104,1.721334,0.816682,0.76033,0.713263,0.723495
10,0.0065,1.868067,0.818515,0.783186,0.704093,0.729003


[I 2025-03-15 21:43:46,588] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.0009704495086858971, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0599,0.955335,0.765353,0.664219,0.584471,0.604702
2,0.1653,1.0794,0.791017,0.733419,0.688211,0.689713
3,0.055,1.196531,0.802933,0.727834,0.675463,0.687806
4,0.0277,1.310085,0.799267,0.755071,0.676986,0.698576
5,0.0147,1.433214,0.802016,0.727436,0.685217,0.694849
6,0.0118,1.450889,0.804766,0.722264,0.694569,0.694976
7,0.0099,1.490694,0.80385,0.735583,0.687285,0.694726
8,0.0076,1.569428,0.807516,0.697457,0.674092,0.67508
9,0.0068,1.579306,0.8011,0.748502,0.682634,0.700785
10,0.0034,1.622703,0.800183,0.729492,0.687167,0.691602


[I 2025-03-15 21:46:39,563] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.004086354013516855, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.705,1.012133,0.797434,0.708985,0.70084,0.696967
2,0.0579,1.160624,0.808433,0.785317,0.697015,0.717815
3,0.0302,1.260864,0.825848,0.736073,0.698121,0.704171
4,0.024,1.35569,0.820348,0.767975,0.737049,0.738343
5,0.0211,1.584573,0.808433,0.748374,0.682685,0.69513
6,0.0219,1.775226,0.812099,0.737039,0.716539,0.712226
7,0.0148,1.760841,0.819432,0.739088,0.714999,0.710618
8,0.0109,1.664047,0.823098,0.76447,0.720223,0.730951
9,0.0121,2.003721,0.807516,0.726892,0.700225,0.704221
10,0.0142,1.981755,0.816682,0.758042,0.685441,0.707801


[I 2025-03-15 21:49:47,783] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.0038981520240048643, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7827,1.031787,0.796517,0.755681,0.676784,0.693791
2,0.0659,1.325357,0.796517,0.750544,0.688063,0.701751
3,0.0351,1.381038,0.815765,0.771884,0.710418,0.728786
4,0.0276,1.450921,0.811182,0.768529,0.70472,0.71978
5,0.0165,1.565288,0.808433,0.750075,0.720743,0.71804
6,0.0233,1.627483,0.814849,0.786012,0.704647,0.726271
7,0.0185,1.644099,0.810266,0.772458,0.707959,0.721199
8,0.0099,1.77767,0.814849,0.774962,0.722265,0.734875
9,0.0134,1.83315,0.800183,0.776273,0.695325,0.715066
10,0.012,1.6695,0.822181,0.731635,0.711376,0.70739


[I 2025-03-15 21:59:08,243] Trial 28 finished with value: 0.7532782228036554 and parameters: {'learning_rate': 0.0038981520240048643, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 10}. Best is trial 22 with value: 0.777016449838325.


Trial 29 with params: {'learning_rate': 0.002877681786853018, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.738,1.041138,0.8011,0.762454,0.685541,0.70878
2,0.0599,1.09587,0.812099,0.751896,0.709881,0.712185
3,0.0254,1.261451,0.811182,0.734077,0.679944,0.690077
4,0.0194,1.287097,0.814849,0.740914,0.716873,0.71245
5,0.0145,1.540108,0.815765,0.775564,0.697421,0.712431
6,0.0152,1.529481,0.823098,0.789627,0.708267,0.731253
7,0.0091,1.61286,0.814849,0.779688,0.710524,0.727558
8,0.0091,1.726722,0.809349,0.794165,0.691998,0.726033
9,0.0135,1.814621,0.810266,0.783507,0.694359,0.725058
10,0.0082,1.797676,0.827681,0.765563,0.703428,0.717885


[I 2025-03-15 22:05:17,449] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.0015969434634642223, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 44}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0803,0.989045,0.775435,0.635435,0.60893,0.607867
2,0.1096,1.107425,0.797434,0.751646,0.687444,0.698619
3,0.0359,1.188071,0.811182,0.753861,0.705442,0.719154
4,0.0178,1.298908,0.826764,0.75968,0.721998,0.720349
5,0.0156,1.374475,0.819432,0.754907,0.714348,0.718329
6,0.0095,1.449152,0.823098,0.79033,0.715913,0.738719
7,0.0114,1.372467,0.833181,0.759817,0.704063,0.716277
8,0.0069,1.472411,0.824931,0.793977,0.746848,0.754959
9,0.0052,1.61585,0.826764,0.753484,0.724464,0.723255
10,0.0074,1.688054,0.816682,0.779224,0.719826,0.732806


[I 2025-03-15 22:14:26,819] Trial 30 finished with value: 0.7589201720863066 and parameters: {'learning_rate': 0.0015969434634642223, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 44}. Best is trial 22 with value: 0.777016449838325.


Trial 31 with params: {'learning_rate': 0.0021727419005441854, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.942,0.964956,0.79835,0.657963,0.615413,0.622056
2,0.0775,1.163865,0.809349,0.758266,0.676866,0.702048
3,0.0276,1.260172,0.820348,0.762116,0.699252,0.720234
4,0.0184,1.373013,0.813932,0.788223,0.722635,0.741061
5,0.0129,1.262024,0.823098,0.764221,0.722305,0.732444
6,0.0097,1.526276,0.824931,0.807733,0.717939,0.748725
7,0.0129,1.402542,0.832264,0.788638,0.735834,0.751324
8,0.0113,1.497615,0.825848,0.776457,0.725045,0.735701
9,0.0055,1.590332,0.821265,0.800898,0.726673,0.741563
10,0.0065,1.573973,0.823098,0.783903,0.703635,0.71969


[I 2025-03-15 22:23:54,704] Trial 31 finished with value: 0.7458976733409318 and parameters: {'learning_rate': 0.0021727419005441854, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 47}. Best is trial 22 with value: 0.777016449838325.


Trial 32 with params: {'learning_rate': 0.0016538120545415622, 'weight_decay': 0.007, 'adam_beta1': 0.97, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1179,1.012526,0.775435,0.677637,0.60769,0.62336
2,0.1168,1.210696,0.802933,0.731797,0.688227,0.700825
3,0.0325,1.25872,0.807516,0.75304,0.688597,0.703404
4,0.0216,1.343737,0.809349,0.750013,0.703038,0.71186
5,0.0141,1.462454,0.804766,0.777693,0.679344,0.714932
6,0.01,1.470298,0.814849,0.75675,0.710267,0.720143
7,0.0065,1.547954,0.807516,0.763725,0.692473,0.707652
8,0.0082,1.598598,0.80385,0.754878,0.697427,0.712749
9,0.011,1.469747,0.817599,0.763155,0.705048,0.720699
10,0.0075,1.565643,0.813016,0.760179,0.708364,0.717171


[I 2025-03-15 22:30:17,655] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 0.00014279024285011015, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2874,1.653325,0.580202,0.197464,0.192786,0.174801
2,1.2087,1.242528,0.673694,0.307286,0.320466,0.308048
3,0.7912,1.085279,0.709441,0.456691,0.413446,0.424134
4,0.5532,1.053565,0.72044,0.485652,0.415771,0.43304
5,0.3975,1.041344,0.744271,0.596257,0.511548,0.533896


[I 2025-03-15 22:31:44,970] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.001678009303299174, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 46}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2368,1.022447,0.769019,0.611889,0.541882,0.556904
2,0.134,1.203115,0.796517,0.746268,0.684788,0.698602
3,0.0373,1.404729,0.797434,0.761985,0.689081,0.707438
4,0.0188,1.470599,0.802933,0.770516,0.686631,0.71298
5,0.0123,1.550392,0.806599,0.763677,0.70205,0.713034
6,0.0108,1.470354,0.812099,0.707181,0.702959,0.688593
7,0.0113,1.576957,0.812099,0.791146,0.70338,0.730534
8,0.0077,1.573048,0.815765,0.751333,0.701084,0.713237
9,0.0087,1.74214,0.822181,0.753996,0.747014,0.732736
10,0.0047,1.733501,0.813932,0.7283,0.719925,0.71284


[I 2025-03-15 22:40:28,653] Trial 34 finished with value: 0.7447663166466311 and parameters: {'learning_rate': 0.001678009303299174, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 46}. Best is trial 22 with value: 0.777016449838325.


Trial 35 with params: {'learning_rate': 0.0010465229492901214, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1395,0.998161,0.769019,0.632833,0.570655,0.580302
2,0.1571,1.15326,0.779102,0.710704,0.666194,0.664755
3,0.0491,1.211513,0.799267,0.722334,0.684376,0.688173
4,0.0252,1.37141,0.810266,0.726609,0.675115,0.685453
5,0.0151,1.37135,0.805683,0.712003,0.669074,0.675918
6,0.0108,1.464017,0.813932,0.759249,0.711006,0.720411
7,0.0092,1.476752,0.812099,0.726438,0.691735,0.694848
8,0.0076,1.625801,0.80385,0.716082,0.648911,0.660259
9,0.009,1.496797,0.811182,0.776217,0.696661,0.714292
10,0.0026,1.577933,0.821265,0.747885,0.706925,0.709382


[I 2025-03-15 22:43:27,502] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 2.067868899631534e-05, 'weight_decay': 0.001, 'adam_beta1': 0.99, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3258,2.977796,0.303391,0.022477,0.055727,0.030719
2,2.6251,2.488482,0.381302,0.037618,0.080368,0.050992
3,2.2493,2.231516,0.434464,0.068909,0.09901,0.072431
4,2.0311,2.055166,0.472961,0.092824,0.117792,0.092226
5,1.8724,1.932249,0.504125,0.124166,0.137531,0.11606
6,1.7537,1.836919,0.527039,0.145821,0.150313,0.129604
7,1.6512,1.754701,0.549954,0.148171,0.166627,0.148971
8,1.56,1.689926,0.560953,0.19001,0.179481,0.166504
9,1.4803,1.625807,0.582951,0.204671,0.206531,0.193111
10,1.4048,1.567985,0.601283,0.247307,0.22105,0.211442


[I 2025-03-15 22:46:19,662] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.0006376443982681265, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4738,1.05321,0.732356,0.471389,0.458881,0.45477
2,0.3243,1.087747,0.771769,0.669283,0.618806,0.623498
3,0.1044,1.210477,0.776352,0.655015,0.616831,0.621427
4,0.0494,1.324213,0.780935,0.71065,0.634166,0.651668
5,0.0257,1.347894,0.789184,0.700416,0.679463,0.674373
6,0.0168,1.520909,0.785518,0.683039,0.633747,0.646329
7,0.0118,1.501591,0.789184,0.723872,0.658124,0.673035
8,0.0088,1.664708,0.794684,0.72283,0.669885,0.682473
9,0.0082,1.550629,0.7956,0.68135,0.66169,0.664367
10,0.0056,1.553203,0.8011,0.735772,0.672259,0.680833


[I 2025-03-15 22:49:13,030] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.004929751930199651, 'weight_decay': 0.004, 'adam_beta1': 0.96, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7541,1.066642,0.800183,0.708387,0.651918,0.664375
2,0.0637,1.176419,0.812099,0.753599,0.698817,0.711087
3,0.0321,1.290712,0.826764,0.79637,0.720151,0.743698
4,0.0297,1.334252,0.821265,0.776709,0.687241,0.715591
5,0.0269,1.481379,0.815765,0.754505,0.704915,0.713684
6,0.029,1.711219,0.811182,0.758373,0.676199,0.69551
7,0.0234,1.756052,0.796517,0.740743,0.696847,0.703767
8,0.0145,1.850131,0.809349,0.706124,0.665965,0.671741
9,0.0177,1.899131,0.814849,0.750296,0.701911,0.710709
10,0.0151,2.080078,0.814849,0.744715,0.671953,0.694038


[I 2025-03-15 22:55:30,433] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.0016930955152743361, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9038,0.951786,0.788268,0.719492,0.642672,0.66294
2,0.0888,1.181581,0.802933,0.773883,0.69897,0.711718
3,0.0287,1.255182,0.813016,0.773908,0.693961,0.715277
4,0.0181,1.343742,0.810266,0.727228,0.704259,0.701422
5,0.0129,1.439812,0.808433,0.725481,0.714339,0.70436
6,0.0107,1.454207,0.826764,0.809786,0.724797,0.748257
7,0.0111,1.471312,0.828598,0.816616,0.721445,0.74795
8,0.0089,1.456246,0.815765,0.749305,0.733212,0.729237
9,0.0072,1.398283,0.821265,0.767729,0.724713,0.732629
10,0.0043,1.472715,0.821265,0.772166,0.711811,0.726171


[I 2025-03-15 23:04:31,896] Trial 39 finished with value: 0.7447989835068576 and parameters: {'learning_rate': 0.0016930955152743361, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 42}. Best is trial 22 with value: 0.777016449838325.


Trial 40 with params: {'learning_rate': 1.3924590329248805e-05, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2882,2.930979,0.293309,0.037594,0.05498,0.02715
2,2.6837,2.617074,0.348304,0.034838,0.06982,0.043988
3,2.4387,2.434631,0.387718,0.038997,0.082928,0.052782
4,2.2686,2.294781,0.414299,0.071324,0.0921,0.065224
5,2.1292,2.183861,0.447296,0.096529,0.105124,0.079372
6,2.0186,2.086876,0.464711,0.107019,0.114227,0.090042
7,1.9233,2.005413,0.491292,0.11452,0.129079,0.105427
8,1.8432,1.944287,0.502291,0.116587,0.136059,0.112835
9,1.7763,1.88285,0.517874,0.142266,0.146701,0.127165
10,1.7148,1.831303,0.527956,0.138388,0.154917,0.135908


[I 2025-03-15 23:07:40,351] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.0013022588038983387, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0459,0.955358,0.770852,0.643799,0.574147,0.590218
2,0.1203,1.133556,0.792851,0.780831,0.67813,0.703258
3,0.0384,1.259441,0.812099,0.725646,0.671827,0.682751
4,0.0198,1.299617,0.805683,0.707802,0.66891,0.673732
5,0.0125,1.361675,0.802933,0.714333,0.660207,0.671555
6,0.0102,1.389501,0.815765,0.744588,0.688853,0.700426
7,0.0101,1.487225,0.808433,0.730093,0.664571,0.685797
8,0.009,1.527187,0.807516,0.774093,0.707582,0.724865
9,0.0082,1.523515,0.806599,0.718426,0.681602,0.685598
10,0.0052,1.562286,0.809349,0.715054,0.677412,0.686899


[I 2025-03-15 23:13:57,780] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.002073148768139123, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 46}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8477,1.072927,0.794684,0.752413,0.637714,0.668833
2,0.0751,1.218626,0.7956,0.741577,0.683132,0.696459
3,0.0266,1.287602,0.804766,0.761226,0.699945,0.7127
4,0.0166,1.317056,0.802933,0.770774,0.687562,0.713375
5,0.0134,1.448069,0.814849,0.791358,0.689746,0.719609
6,0.0112,1.506456,0.812099,0.789422,0.725543,0.743671
7,0.0096,1.526426,0.808433,0.822441,0.747478,0.769012
8,0.0116,1.633611,0.813016,0.789676,0.711997,0.730263
9,0.0062,1.641472,0.820348,0.794659,0.726397,0.745296
10,0.0064,1.54753,0.826764,0.777973,0.717892,0.732641


[I 2025-03-15 23:23:32,172] Trial 42 finished with value: 0.7328499033844605 and parameters: {'learning_rate': 0.002073148768139123, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 46}. Best is trial 22 with value: 0.777016449838325.


Trial 43 with params: {'learning_rate': 0.0019945348575854706, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8688,1.012068,0.799267,0.702329,0.633061,0.651157
2,0.0761,1.073062,0.829514,0.786751,0.733724,0.745146
3,0.0251,1.326554,0.815765,0.75485,0.692037,0.707231
4,0.0212,1.327819,0.814849,0.774476,0.727169,0.738864
5,0.0132,1.395169,0.824015,0.793496,0.731715,0.750323
6,0.0112,1.376239,0.826764,0.798132,0.737121,0.752207
7,0.0091,1.502386,0.816682,0.76823,0.695056,0.714154
8,0.0069,1.58346,0.821265,0.770072,0.726415,0.733667
9,0.0076,1.595131,0.824015,0.754088,0.739904,0.72615
10,0.007,1.584627,0.818515,0.791279,0.724521,0.737418


[I 2025-03-15 23:32:39,613] Trial 43 finished with value: 0.7594693957871711 and parameters: {'learning_rate': 0.0019945348575854706, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 37}. Best is trial 22 with value: 0.777016449838325.


Trial 44 with params: {'learning_rate': 0.003534663024333818, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7006,1.084629,0.802016,0.742982,0.672769,0.692623
2,0.0613,1.098104,0.832264,0.789581,0.760488,0.758402
3,0.0249,1.357409,0.811182,0.780485,0.704019,0.726091
4,0.0205,1.431388,0.813016,0.757759,0.695005,0.709444
5,0.0228,1.498719,0.824931,0.748034,0.689984,0.700813
6,0.0141,1.409935,0.836847,0.807871,0.717579,0.745191
7,0.0127,1.554868,0.824931,0.733141,0.69739,0.701793
8,0.0114,1.702259,0.811182,0.743678,0.679754,0.698235
9,0.0124,1.667281,0.824015,0.802577,0.715936,0.742288
10,0.0082,1.996027,0.813016,0.767771,0.712827,0.718633


[I 2025-03-15 23:41:51,773] Trial 44 finished with value: 0.7472429853379253 and parameters: {'learning_rate': 0.003534663024333818, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 15}. Best is trial 22 with value: 0.777016449838325.


Trial 45 with params: {'learning_rate': 1.5109064595787985e-05, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3083,2.925189,0.296059,0.037443,0.055045,0.026674
2,2.6617,2.587351,0.36022,0.03475,0.073944,0.04628
3,2.4006,2.394789,0.395967,0.060849,0.085623,0.055751
4,2.2257,2.252294,0.427131,0.07023,0.097146,0.070355
5,2.0794,2.133364,0.454629,0.102536,0.10875,0.084173


[I 2025-03-15 23:43:17,009] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 0.00012141763564544629, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.321,1.750823,0.555454,0.152975,0.171172,0.150823
2,1.3373,1.325712,0.658112,0.303132,0.285071,0.281549
3,0.9277,1.165368,0.690192,0.410739,0.360592,0.363404
4,0.6767,1.103922,0.707608,0.459956,0.40465,0.416779
5,0.5107,1.070098,0.726856,0.527941,0.475985,0.48675
6,0.3856,1.071657,0.731439,0.58228,0.483415,0.51127
7,0.2922,1.107039,0.741522,0.605244,0.532563,0.555189
8,0.2231,1.163636,0.735105,0.591285,0.525793,0.541504
9,0.1753,1.175855,0.740605,0.597421,0.552201,0.563382
10,0.1363,1.264022,0.743355,0.632822,0.578027,0.588014


[I 2025-03-15 23:49:00,650] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.0033881847384723447, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7382,1.049168,0.791934,0.716215,0.654301,0.665935
2,0.0581,1.225854,0.8011,0.710649,0.683161,0.679016
3,0.028,1.298056,0.818515,0.771544,0.703174,0.714355
4,0.0191,1.408175,0.802016,0.737332,0.673161,0.685613
5,0.0195,1.610059,0.814849,0.759128,0.69657,0.712948
6,0.016,1.580575,0.815765,0.760573,0.716106,0.722558
7,0.0136,1.602791,0.813932,0.761378,0.706227,0.715124
8,0.0108,1.686671,0.83593,0.777745,0.711168,0.728854
9,0.0109,1.699641,0.826764,0.808963,0.757826,0.768512
10,0.0085,1.62385,0.835014,0.787638,0.753877,0.758842


[I 2025-03-15 23:57:47,454] Trial 47 finished with value: 0.7578453284362761 and parameters: {'learning_rate': 0.0033881847384723447, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 31}. Best is trial 22 with value: 0.777016449838325.


Trial 48 with params: {'learning_rate': 8.153679865827409e-05, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7124,2.172927,0.450962,0.092934,0.10699,0.079097
2,1.7533,1.647901,0.593951,0.21263,0.203418,0.190043
3,1.3289,1.383945,0.655362,0.323947,0.275064,0.273349
4,1.0451,1.247798,0.67736,0.34253,0.319857,0.310227
5,0.8289,1.145542,0.700275,0.431385,0.372731,0.379929


[I 2025-03-15 23:59:19,115] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.004812290828879115, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.918,1.160984,0.796517,0.737216,0.642266,0.6733
2,0.0952,1.339761,0.79835,0.786739,0.693745,0.716324
3,0.0384,1.399409,0.817599,0.757604,0.691639,0.710947
4,0.0284,1.560918,0.813932,0.775462,0.724581,0.735781
5,0.0283,1.599523,0.794684,0.763651,0.71462,0.720531
6,0.0244,1.673118,0.802016,0.735211,0.685115,0.691042
7,0.0203,1.871697,0.80385,0.776936,0.692236,0.71523
8,0.017,1.825125,0.808433,0.780865,0.715025,0.733241
9,0.0166,1.955663,0.814849,0.749695,0.729337,0.723417
10,0.0177,1.96149,0.815765,0.762305,0.728593,0.727392


[I 2025-03-16 00:08:07,805] Trial 49 finished with value: 0.7607900695856966 and parameters: {'learning_rate': 0.004812290828879115, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 5}. Best is trial 22 with value: 0.777016449838325.


Trial 50 with params: {'learning_rate': 0.0030545606907110114, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0451,1.08733,0.800183,0.652188,0.60644,0.614544
2,0.1028,1.229575,0.813932,0.746737,0.666053,0.685869
3,0.0325,1.365764,0.810266,0.725858,0.667371,0.682911
4,0.0195,1.410864,0.810266,0.747142,0.714461,0.718123
5,0.017,1.62615,0.814849,0.754895,0.685769,0.701958
6,0.0144,1.676739,0.809349,0.795117,0.671549,0.710589
7,0.0143,1.635032,0.825848,0.773126,0.715249,0.729028
8,0.0141,1.715303,0.810266,0.747331,0.695875,0.70494
9,0.0082,1.868898,0.818515,0.763379,0.682506,0.704218
10,0.006,1.892858,0.810266,0.749671,0.68786,0.702689


[I 2025-03-16 00:11:02,571] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.004844621492895092, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8966,1.113693,0.789184,0.707233,0.588634,0.616382
2,0.0977,1.130739,0.824015,0.754552,0.700473,0.717224
3,0.0359,1.329198,0.824931,0.747388,0.731726,0.725113
4,0.0321,1.562818,0.813016,0.750067,0.680286,0.694518
5,0.0299,1.763183,0.807516,0.76836,0.687083,0.710403
6,0.028,1.662844,0.818515,0.794523,0.68939,0.720371
7,0.0177,1.731472,0.819432,0.766814,0.69032,0.71065
8,0.0204,1.874562,0.808433,0.737889,0.700615,0.706861
9,0.018,1.903221,0.813932,0.75148,0.703699,0.71058
10,0.0145,1.898612,0.815765,0.7767,0.712913,0.727755


[I 2025-03-16 00:17:00,059] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 0.0015566373554609526, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9656,1.041913,0.773602,0.6573,0.598368,0.608889
2,0.1019,1.296584,0.791017,0.713656,0.638156,0.647781
3,0.0333,1.277537,0.813932,0.767976,0.693025,0.714381
4,0.0187,1.341069,0.817599,0.791274,0.706313,0.730285
5,0.0155,1.443217,0.811182,0.762692,0.696313,0.713397
6,0.0113,1.433702,0.812099,0.768791,0.697301,0.714234
7,0.0062,1.678478,0.802016,0.771652,0.711369,0.725605
8,0.0071,1.626043,0.813016,0.77642,0.707374,0.725856
9,0.0112,1.599206,0.802016,0.776071,0.699383,0.720188
10,0.0046,1.667895,0.812099,0.768556,0.721958,0.729485


[I 2025-03-16 00:25:48,733] Trial 52 finished with value: 0.7260994955871988 and parameters: {'learning_rate': 0.0015566373554609526, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 25}. Best is trial 22 with value: 0.777016449838325.


Trial 53 with params: {'learning_rate': 0.00400906163728517, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.731,1.013794,0.811182,0.7242,0.6678,0.679211
2,0.0622,1.160751,0.808433,0.772261,0.682034,0.703218
3,0.0297,1.335122,0.809349,0.725987,0.657452,0.672173
4,0.0255,1.392886,0.809349,0.782786,0.699643,0.726092
5,0.0221,1.453565,0.824931,0.781243,0.714479,0.732796
6,0.0204,1.665995,0.823098,0.7774,0.703057,0.728128
7,0.0163,1.585365,0.809349,0.761483,0.712487,0.721789
8,0.0128,1.551166,0.825848,0.791104,0.737302,0.75052
9,0.0075,2.004698,0.814849,0.770985,0.721268,0.73215
10,0.0114,1.931103,0.817599,0.756206,0.717704,0.726459


[I 2025-03-16 00:34:55,552] Trial 53 finished with value: 0.7404643730128398 and parameters: {'learning_rate': 0.00400906163728517, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 35}. Best is trial 22 with value: 0.777016449838325.


Trial 54 with params: {'learning_rate': 0.003532615375378192, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 8}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7846,1.090174,0.799267,0.738302,0.656011,0.67591
2,0.0695,1.156518,0.815765,0.774264,0.711857,0.72549
3,0.0313,1.308391,0.810266,0.750094,0.709533,0.720148
4,0.0215,1.472572,0.811182,0.7932,0.695931,0.72147
5,0.021,1.343757,0.828598,0.791829,0.737634,0.752503
6,0.0146,1.661881,0.812099,0.794421,0.709603,0.735532
7,0.0148,1.719404,0.813016,0.735714,0.69117,0.697969
8,0.0153,1.722408,0.811182,0.724048,0.6933,0.701549
9,0.0088,1.742696,0.819432,0.764782,0.738096,0.737291
10,0.0088,1.753513,0.824015,0.788483,0.729646,0.741473


[I 2025-03-16 00:43:54,206] Trial 54 finished with value: 0.7425348521324338 and parameters: {'learning_rate': 0.003532615375378192, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 8}. Best is trial 22 with value: 0.777016449838325.


Trial 55 with params: {'learning_rate': 0.004533240859711196, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6847,1.050653,0.807516,0.723131,0.687084,0.693646
2,0.0582,1.103412,0.815765,0.765267,0.707708,0.722709
3,0.0293,1.511972,0.797434,0.713165,0.701914,0.686215
4,0.0304,1.510769,0.825848,0.754078,0.690604,0.704913
5,0.0231,1.538987,0.813016,0.782051,0.711783,0.732849
6,0.017,1.782026,0.809349,0.747829,0.702861,0.711605
7,0.0193,1.926512,0.810266,0.767226,0.723296,0.731233
8,0.0206,1.801463,0.822181,0.775821,0.726827,0.740103
9,0.0163,1.961318,0.809349,0.776507,0.700815,0.722653
10,0.0131,1.97961,0.807516,0.751331,0.710756,0.717763


[I 2025-03-16 00:52:42,771] Trial 55 finished with value: 0.7656531088432571 and parameters: {'learning_rate': 0.004533240859711196, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 32}. Best is trial 22 with value: 0.777016449838325.


Trial 56 with params: {'learning_rate': 0.0017682779310178565, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9049,0.974188,0.792851,0.685941,0.647061,0.651015
2,0.0849,1.243519,0.793767,0.74323,0.678466,0.693368
3,0.0277,1.367662,0.79835,0.767694,0.683145,0.70791
4,0.0198,1.361967,0.80385,0.786463,0.674559,0.701171
5,0.0135,1.45267,0.808433,0.774015,0.6975,0.71905
6,0.0108,1.534499,0.814849,0.793401,0.718438,0.734542
7,0.0131,1.522444,0.816682,0.784294,0.730275,0.731649
8,0.0065,1.72689,0.806599,0.79847,0.720912,0.737051
9,0.0052,1.660854,0.814849,0.776896,0.719777,0.732348
10,0.0063,1.845628,0.805683,0.7376,0.720952,0.717419


[I 2025-03-16 00:55:39,797] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 0.004893885512008118, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7623,1.093298,0.806599,0.715468,0.671933,0.682167
2,0.0651,1.175471,0.810266,0.745054,0.721021,0.717901
3,0.0362,1.438504,0.812099,0.807436,0.740786,0.757453
4,0.0272,1.410457,0.814849,0.744686,0.714606,0.71092
5,0.0261,1.582632,0.809349,0.768326,0.711074,0.722544
6,0.0235,1.857493,0.812099,0.761636,0.684179,0.705001
7,0.0251,1.721852,0.814849,0.760364,0.705727,0.718582
8,0.0188,1.957332,0.806599,0.76656,0.699518,0.715201
9,0.0119,1.92885,0.813016,0.743897,0.693635,0.700192
10,0.0138,1.938053,0.810266,0.769321,0.69839,0.718188


[I 2025-03-16 01:01:30,992] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.004849876344808655, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6985,1.106744,0.809349,0.731586,0.671593,0.68851
2,0.0616,1.122528,0.817599,0.728437,0.704337,0.70501
3,0.0331,1.444632,0.814849,0.761856,0.697198,0.713446
4,0.0306,1.38921,0.809349,0.752491,0.688712,0.699588
5,0.0314,1.686934,0.80385,0.760878,0.706524,0.710316
6,0.0211,1.807667,0.810266,0.745932,0.688447,0.70369
7,0.0179,1.891861,0.813016,0.770557,0.714539,0.729792
8,0.0122,1.89042,0.806599,0.743772,0.71426,0.714841
9,0.0172,1.911104,0.814849,0.799285,0.722905,0.738822
10,0.0165,2.02495,0.805683,0.780197,0.710778,0.729849


[I 2025-03-16 01:07:56,390] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.0009061274164568013, 'weight_decay': 0.0, 'adam_beta1': 0.99, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6582,1.144194,0.71494,0.390548,0.396238,0.385095
2,0.3812,1.090665,0.789184,0.70131,0.643762,0.659085
3,0.0933,1.341007,0.781852,0.72181,0.664238,0.673438
4,0.0368,1.441322,0.790101,0.7494,0.647978,0.680452
5,0.0179,1.579425,0.802016,0.761431,0.658109,0.687652
6,0.0129,1.624844,0.807516,0.799111,0.683308,0.721065
7,0.0109,1.71617,0.791017,0.758894,0.673553,0.698491
8,0.0064,1.681403,0.8011,0.720535,0.683863,0.690419
9,0.0053,1.817501,0.802016,0.781512,0.695528,0.724432
10,0.0053,1.719835,0.811182,0.77668,0.723744,0.739214


[I 2025-03-16 01:14:02,668] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.00469084090174424, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7246,1.075515,0.807516,0.712028,0.644395,0.662729
2,0.0672,1.264105,0.813932,0.726616,0.657037,0.675826
3,0.0347,1.509206,0.819432,0.705807,0.652861,0.665869
4,0.0262,1.547367,0.813932,0.755565,0.69174,0.701724
5,0.0268,1.655992,0.819432,0.750698,0.68914,0.705776
6,0.0225,1.728751,0.818515,0.753825,0.682612,0.699805
7,0.0178,1.923713,0.824015,0.768579,0.696402,0.708544
8,0.0175,1.924699,0.809349,0.741692,0.669674,0.689351
9,0.0169,2.095219,0.800183,0.725454,0.695969,0.694834
10,0.0148,2.156637,0.805683,0.771081,0.681319,0.704905


[I 2025-03-16 01:17:14,294] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.0007838959300346599, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 52}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2448,0.98656,0.756187,0.590376,0.504519,0.527688
2,0.2172,1.092259,0.790101,0.721733,0.666724,0.67215
3,0.0694,1.20428,0.80385,0.735951,0.671117,0.687691
4,0.0372,1.372822,0.794684,0.731583,0.664833,0.673872
5,0.0203,1.348789,0.805683,0.75432,0.674364,0.695689
6,0.0147,1.534076,0.806599,0.755139,0.686731,0.70874
7,0.01,1.583162,0.800183,0.748935,0.674606,0.697337
8,0.0078,1.667365,0.7956,0.705802,0.665388,0.676066
9,0.008,1.619162,0.80385,0.72482,0.688669,0.695826
10,0.0056,1.816536,0.7956,0.712942,0.658879,0.672517


[I 2025-03-16 01:20:33,947] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.0015412575944477767, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9965,0.97062,0.786434,0.667906,0.588544,0.602776
2,0.104,1.156158,0.802933,0.756117,0.716776,0.715223
3,0.0316,1.207233,0.821265,0.741423,0.695902,0.708323
4,0.0201,1.292252,0.807516,0.724765,0.667514,0.679686
5,0.0133,1.417319,0.813932,0.739784,0.713752,0.711264
6,0.0096,1.389327,0.822181,0.776352,0.719825,0.735243
7,0.0088,1.472653,0.816682,0.744552,0.721073,0.720537
8,0.0115,1.534714,0.808433,0.728776,0.698775,0.697566
9,0.0062,1.509959,0.819432,0.754599,0.723155,0.720147
10,0.0066,1.423573,0.827681,0.757316,0.724872,0.725655


[I 2025-03-16 01:23:35,397] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 0.0004605303827047067, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5297,1.063569,0.726856,0.434677,0.393392,0.397373
2,0.4214,1.009377,0.780935,0.65209,0.591893,0.603344
3,0.1575,1.139446,0.771769,0.695765,0.604635,0.626559
4,0.0754,1.192217,0.780018,0.686429,0.629195,0.637049
5,0.0406,1.408476,0.790101,0.690019,0.672941,0.66936
6,0.0239,1.430251,0.791934,0.740898,0.658761,0.676347
7,0.0165,1.423093,0.790101,0.693767,0.646492,0.655778
8,0.0146,1.545274,0.788268,0.649001,0.619811,0.617313
9,0.0098,1.519204,0.778185,0.63179,0.637525,0.62455
10,0.0079,1.628368,0.792851,0.70195,0.64958,0.659885


[I 2025-03-16 01:29:40,235] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 1.6488779238415115e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2267,2.846303,0.306141,0.04296,0.057745,0.031024
2,2.6097,2.542906,0.367553,0.035956,0.076125,0.047997
3,2.3511,2.34402,0.4033,0.064466,0.087567,0.058052
4,2.1703,2.195674,0.439047,0.075679,0.101125,0.075522
5,2.0239,2.076756,0.466544,0.101665,0.115201,0.089621
6,1.9082,1.982374,0.493126,0.105957,0.130674,0.105172
7,1.8125,1.903794,0.514207,0.120856,0.144216,0.121674
8,1.7303,1.843926,0.527039,0.141168,0.15304,0.133386
9,1.6602,1.78621,0.540788,0.168396,0.16315,0.145742
10,1.5939,1.730363,0.558203,0.177704,0.177845,0.162096


[I 2025-03-16 01:35:48,263] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.0041831950146033236, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7333,0.999537,0.799267,0.735115,0.70266,0.701193
2,0.0598,1.191913,0.807516,0.761199,0.699584,0.709361
3,0.0297,1.181458,0.822181,0.765397,0.718746,0.728563
4,0.0221,1.416148,0.805683,0.765413,0.685233,0.707743
5,0.0245,1.382988,0.828598,0.756091,0.739803,0.726865
6,0.0209,1.547972,0.805683,0.771193,0.709768,0.722689
7,0.0174,1.669418,0.815765,0.774619,0.724034,0.736111
8,0.0126,1.881007,0.816682,0.803588,0.722361,0.744188
9,0.016,1.68851,0.825848,0.785936,0.724995,0.743168
10,0.0119,1.940307,0.822181,0.775034,0.707621,0.722303


[I 2025-03-16 01:41:41,606] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 0.004274375294704235, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7087,1.034981,0.811182,0.766649,0.682748,0.702095
2,0.0629,1.064541,0.827681,0.780263,0.706928,0.720091
3,0.0262,1.415862,0.813016,0.790266,0.69829,0.728162
4,0.0311,1.437073,0.802933,0.763737,0.717055,0.717373
5,0.024,1.596665,0.810266,0.749281,0.681525,0.691618
6,0.0187,1.596425,0.819432,0.756076,0.710358,0.719627
7,0.0202,1.731121,0.823098,0.762321,0.722725,0.730191
8,0.0134,1.673312,0.831347,0.757011,0.703812,0.719198
9,0.0083,1.833349,0.819432,0.709103,0.696183,0.687626
10,0.0124,2.058572,0.819432,0.783412,0.702135,0.720469


[I 2025-03-16 01:47:55,501] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.004057307328681772, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6771,1.0468,0.811182,0.760166,0.701163,0.708558
2,0.0579,1.213304,0.805683,0.762475,0.702043,0.711188
3,0.0284,1.350805,0.813932,0.765053,0.709147,0.724066
4,0.0267,1.305256,0.804766,0.706428,0.681839,0.684483
5,0.0186,1.557567,0.831347,0.800253,0.741468,0.753055
6,0.0232,1.476933,0.821265,0.753739,0.729078,0.727607
7,0.0133,1.82815,0.824931,0.787513,0.719642,0.738744
8,0.0112,1.839143,0.813016,0.783258,0.693118,0.715785
9,0.0123,1.939337,0.807516,0.727843,0.704986,0.704838
10,0.0139,2.094596,0.817599,0.760268,0.69813,0.713658


[I 2025-03-16 01:51:01,983] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.0039093367886792255, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8728,1.126392,0.802933,0.772365,0.667498,0.696725
2,0.0748,1.208984,0.810266,0.758406,0.695703,0.710282
3,0.0338,1.30738,0.806599,0.763261,0.705849,0.717858
4,0.0205,1.507359,0.818515,0.801983,0.708754,0.729664
5,0.0201,1.453221,0.814849,0.750869,0.691386,0.701606
6,0.02,1.824378,0.805683,0.721637,0.687597,0.688189
7,0.0191,1.513142,0.822181,0.77312,0.735101,0.741532
8,0.0111,1.668384,0.824931,0.779199,0.710417,0.728719
9,0.0127,1.52043,0.826764,0.775954,0.749489,0.747852
10,0.0128,1.837716,0.826764,0.787488,0.737098,0.749077


[I 2025-03-16 01:59:50,932] Trial 68 finished with value: 0.7703628244209768 and parameters: {'learning_rate': 0.0039093367886792255, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 15}. Best is trial 22 with value: 0.777016449838325.


Trial 69 with params: {'learning_rate': 0.0010013248990299681, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 7}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2898,1.044018,0.740605,0.551425,0.478881,0.491929
2,0.2141,1.140759,0.786434,0.677526,0.655626,0.654093
3,0.0621,1.259338,0.799267,0.705154,0.654473,0.664983
4,0.029,1.285871,0.80385,0.704738,0.661468,0.660247
5,0.018,1.339145,0.815765,0.72759,0.672535,0.679534


[I 2025-03-16 02:01:24,716] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.003909262868720604, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0296,1.098553,0.780935,0.646197,0.579311,0.597753
2,0.1043,1.136752,0.818515,0.746229,0.695001,0.707042
3,0.0341,1.427595,0.816682,0.782089,0.694959,0.721989
4,0.0218,1.623877,0.809349,0.760356,0.703734,0.706785
5,0.0155,1.73171,0.812099,0.768004,0.6973,0.718376
6,0.018,1.751003,0.811182,0.731903,0.692048,0.698497
7,0.02,1.625334,0.809349,0.735383,0.676091,0.691326
8,0.0142,1.800539,0.807516,0.775901,0.687019,0.712238
9,0.0105,1.793845,0.819432,0.765796,0.738096,0.740009
10,0.0089,2.07866,0.824015,0.752922,0.692309,0.700942


[I 2025-03-16 02:04:28,961] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.000632613833813617, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 49}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5037,1.03788,0.734189,0.500781,0.464324,0.466803
2,0.3336,1.072451,0.774519,0.658791,0.616824,0.623912
3,0.1056,1.171276,0.786434,0.692568,0.627424,0.637062
4,0.0476,1.267721,0.792851,0.732995,0.681262,0.689217
5,0.0284,1.341059,0.791934,0.683643,0.665935,0.658545
6,0.0145,1.481949,0.792851,0.736561,0.668923,0.686346
7,0.0114,1.582491,0.792851,0.763775,0.685668,0.706472
8,0.0096,1.602144,0.787351,0.697213,0.623449,0.642051
9,0.0073,1.689451,0.797434,0.767957,0.664449,0.694695
10,0.0067,1.61934,0.802016,0.738211,0.671617,0.688361


[I 2025-03-16 02:10:49,623] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.0009852672252848175, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4923,1.074156,0.731439,0.456909,0.436137,0.436854
2,0.3076,1.139376,0.786434,0.702806,0.619119,0.644062
3,0.0762,1.264084,0.797434,0.693763,0.643501,0.653139
4,0.0307,1.371358,0.80385,0.732683,0.684247,0.692028
5,0.0169,1.484271,0.800183,0.721674,0.657382,0.665467


[I 2025-03-16 02:12:20,161] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 0.0013313615772000725, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9906,0.949712,0.787351,0.652768,0.610894,0.614833
2,0.1154,1.154634,0.805683,0.768276,0.69923,0.713819
3,0.0364,1.212683,0.8011,0.733779,0.680784,0.695465
4,0.0222,1.384411,0.808433,0.801713,0.707534,0.737487
5,0.0117,1.39206,0.824015,0.797258,0.724415,0.742295
6,0.009,1.382466,0.813016,0.759796,0.706237,0.712916
7,0.012,1.423579,0.809349,0.76491,0.698688,0.712314
8,0.008,1.631853,0.807516,0.7969,0.687392,0.715889
9,0.0052,1.650964,0.813932,0.754624,0.698505,0.709033
10,0.0067,1.612018,0.814849,0.792629,0.729606,0.742845


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-16 02:21:22,207] Trial 73 finished with value: 0.7386252474835205 and parameters: {'learning_rate': 0.0013313615772000725, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 31}. Best is trial 22 with value: 0.777016449838325.


Trial 74 with params: {'learning_rate': 0.0007154226295777224, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2602,0.991803,0.746104,0.593926,0.505421,0.528858
2,0.2419,1.094695,0.777269,0.67643,0.644177,0.639924
3,0.0817,1.298117,0.782768,0.76489,0.660344,0.68816
4,0.0401,1.321032,0.791017,0.748905,0.680505,0.691959
5,0.0223,1.419866,0.780935,0.698037,0.655598,0.657692


[I 2025-03-16 02:22:54,193] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 0.003274787212572728, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7502,1.012866,0.812099,0.773963,0.686108,0.708395
2,0.0619,1.057064,0.823098,0.716586,0.67836,0.681948
3,0.028,1.319476,0.814849,0.772584,0.709234,0.727942
4,0.0208,1.326714,0.826764,0.777862,0.718396,0.736455
5,0.0185,1.49222,0.827681,0.786636,0.724861,0.741885
6,0.0146,1.609582,0.804766,0.787912,0.710892,0.728954
7,0.0151,1.528014,0.816682,0.80739,0.731484,0.753223
8,0.01,1.701882,0.810266,0.795492,0.721488,0.737864
9,0.0111,1.847219,0.817599,0.777033,0.71777,0.733455
10,0.0076,1.865312,0.813932,0.721562,0.693161,0.695487


[I 2025-03-16 02:25:54,526] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 0.0024334016897220394, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1781,1.048142,0.774519,0.589082,0.542235,0.553012
2,0.1407,1.268773,0.8011,0.721798,0.67831,0.689858
3,0.0361,1.266012,0.818515,0.808069,0.719097,0.745564
4,0.0202,1.426533,0.808433,0.734807,0.688453,0.700153
5,0.0164,1.552584,0.815765,0.750089,0.697667,0.707597
6,0.0148,1.588407,0.819432,0.807536,0.709256,0.73811
7,0.015,1.690104,0.811182,0.730991,0.680142,0.693298
8,0.0083,1.863428,0.816682,0.770708,0.697171,0.716483
9,0.0081,1.789451,0.821265,0.761017,0.705764,0.715655
10,0.0066,1.830951,0.811182,0.726302,0.696267,0.700204


[I 2025-03-16 02:29:00,412] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.00022309840089248744, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3269,1.665477,0.571036,0.201499,0.187733,0.174369
2,1.1318,1.152686,0.707608,0.376873,0.377277,0.371137
3,0.6357,1.049446,0.731439,0.478636,0.445973,0.454122
4,0.3735,1.052909,0.738772,0.572721,0.5176,0.532805
5,0.2133,1.137053,0.742438,0.572302,0.571398,0.562998


[I 2025-03-16 02:30:30,493] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 0.003390959709901615, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8426,1.094252,0.793767,0.766343,0.648968,0.678133
2,0.0723,1.194225,0.805683,0.747467,0.703241,0.709183
3,0.029,1.403433,0.814849,0.775686,0.710123,0.728224
4,0.0192,1.446831,0.814849,0.768362,0.696177,0.717381
5,0.0187,1.547932,0.823098,0.769349,0.725715,0.732696
6,0.017,1.655791,0.813932,0.767641,0.730135,0.733774
7,0.016,1.799402,0.813016,0.765163,0.711572,0.720957
8,0.0135,1.704859,0.820348,0.781801,0.71226,0.729189
9,0.0108,1.830984,0.821265,0.787382,0.722376,0.734217
10,0.0099,1.806909,0.811182,0.788397,0.726001,0.736547


[I 2025-03-16 02:36:29,693] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.004007367043202988, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6885,1.032461,0.805683,0.788287,0.700016,0.724626
2,0.0556,1.031861,0.824931,0.751508,0.736355,0.733556
3,0.0269,1.359378,0.816682,0.765021,0.684823,0.703323
4,0.0284,1.406812,0.809349,0.730087,0.691349,0.692963
5,0.017,1.567254,0.812099,0.735254,0.711024,0.709249
6,0.0161,1.794661,0.813932,0.736393,0.713641,0.712865
7,0.0151,1.792697,0.799267,0.714454,0.670555,0.675018
8,0.0176,1.882473,0.811182,0.713921,0.710124,0.70197
9,0.0085,2.081944,0.802933,0.736473,0.701253,0.70123
10,0.0138,1.960129,0.804766,0.719434,0.694415,0.693454


[I 2025-03-16 02:42:39,250] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 8.073149975828258e-05, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 50}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5519,2.011141,0.493126,0.09872,0.12976,0.10349
2,1.6396,1.59299,0.598533,0.213089,0.211063,0.196152
3,1.2608,1.375269,0.651696,0.321605,0.27876,0.277033
4,1.0007,1.229224,0.683776,0.35067,0.318501,0.320639
5,0.8107,1.150381,0.701192,0.477136,0.39732,0.413107
6,0.6679,1.105431,0.712191,0.458402,0.415207,0.41882
7,0.5563,1.110118,0.713107,0.479846,0.44761,0.448242
8,0.467,1.085396,0.72044,0.522187,0.456018,0.474277
9,0.3954,1.101257,0.716774,0.525085,0.480911,0.491485
10,0.3376,1.129705,0.733272,0.555568,0.49019,0.510354


[I 2025-03-16 02:45:34,466] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.0035499455467792127, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7221,1.084416,0.792851,0.720556,0.678101,0.683919
2,0.0611,1.073898,0.821265,0.773828,0.704101,0.724445
3,0.024,1.309267,0.822181,0.769483,0.721323,0.732702
4,0.022,1.28632,0.822181,0.785439,0.716708,0.731629
5,0.0162,1.644149,0.811182,0.77238,0.703785,0.718969
6,0.0197,1.470755,0.826764,0.780738,0.712637,0.730338
7,0.015,1.541326,0.818515,0.781929,0.725898,0.737417
8,0.0135,1.646662,0.812099,0.792427,0.723811,0.740107
9,0.0114,1.726206,0.817599,0.763439,0.757161,0.74594
10,0.0095,1.90784,0.815765,0.753934,0.706783,0.716829


[I 2025-03-16 02:54:08,927] Trial 81 finished with value: 0.7523433176385893 and parameters: {'learning_rate': 0.0035499455467792127, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 35}. Best is trial 22 with value: 0.777016449838325.


Trial 82 with params: {'learning_rate': 0.0008949101338834617, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.14,1.014638,0.759853,0.613064,0.549025,0.559354
2,0.175,1.140335,0.783685,0.700003,0.669353,0.672711
3,0.0584,1.277093,0.791017,0.724958,0.668337,0.685717
4,0.0279,1.354646,0.802933,0.744908,0.690396,0.700215
5,0.0183,1.387273,0.802016,0.741543,0.678993,0.689147
6,0.0112,1.488592,0.800183,0.749678,0.666201,0.688316
7,0.0094,1.530783,0.796517,0.736213,0.690152,0.696974
8,0.0082,1.586833,0.800183,0.73512,0.661575,0.677752
9,0.0062,1.629997,0.80385,0.718902,0.683449,0.6864
10,0.0068,1.510398,0.804766,0.717525,0.684257,0.686105


[I 2025-03-16 02:59:58,202] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 0.004043101133640759, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6785,1.037264,0.818515,0.775173,0.704368,0.723702
2,0.0551,1.120147,0.813932,0.773633,0.709457,0.718934
3,0.0287,1.304947,0.810266,0.756464,0.717436,0.723498
4,0.0212,1.445593,0.819432,0.762895,0.732041,0.736121
5,0.0291,1.448664,0.820348,0.823582,0.715459,0.751384
6,0.0151,1.562109,0.829514,0.737347,0.706718,0.710255
7,0.0167,1.785781,0.813932,0.740395,0.704941,0.713422
8,0.0145,1.815226,0.826764,0.789953,0.72993,0.744092
9,0.0115,1.823446,0.821265,0.730273,0.715185,0.710618
10,0.0116,2.0264,0.814849,0.756327,0.719708,0.726778


[I 2025-03-16 03:02:53,298] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 0.0032983887302725423, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0233,1.081302,0.791017,0.711136,0.641192,0.661256
2,0.1042,1.275531,0.804766,0.719211,0.677586,0.685847
3,0.0338,1.367119,0.806599,0.723068,0.689855,0.690876
4,0.0253,1.541194,0.809349,0.738535,0.688976,0.702443
5,0.0164,1.653817,0.807516,0.714671,0.694161,0.695366
6,0.017,1.751916,0.810266,0.706279,0.670492,0.67042
7,0.0157,1.595644,0.814849,0.764519,0.704994,0.717396
8,0.0129,1.715748,0.810266,0.755464,0.715256,0.721783
9,0.0103,1.750267,0.817599,0.729357,0.718817,0.700209
10,0.0093,1.877203,0.820348,0.747958,0.685676,0.701128


[I 2025-03-16 03:12:03,534] Trial 84 finished with value: 0.7365706362036554 and parameters: {'learning_rate': 0.0032983887302725423, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 15}. Best is trial 22 with value: 0.777016449838325.


Trial 85 with params: {'learning_rate': 0.0009190840436269094, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 48}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3352,0.984316,0.753437,0.543484,0.527289,0.523687
2,0.2072,1.190323,0.777269,0.684884,0.637305,0.637075
3,0.0658,1.201825,0.794684,0.713179,0.675603,0.67765
4,0.027,1.290719,0.814849,0.751454,0.693538,0.704762
5,0.0175,1.447736,0.810266,0.741466,0.691725,0.698821
6,0.0107,1.569274,0.807516,0.765438,0.679107,0.70305
7,0.0118,1.519874,0.810266,0.764116,0.695764,0.710754
8,0.0068,1.58795,0.805683,0.740845,0.700424,0.705744
9,0.0067,1.716538,0.799267,0.740338,0.66342,0.677651
10,0.0076,1.606397,0.808433,0.771637,0.703489,0.715383


[I 2025-03-16 03:18:20,478] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 1.6562808358868146e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 51}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2738,2.868586,0.301558,0.028491,0.056113,0.028166
2,2.6124,2.542116,0.368469,0.03639,0.076281,0.048478
3,2.3533,2.352105,0.4033,0.063143,0.087732,0.058068
4,2.1735,2.201748,0.441797,0.073042,0.101876,0.074842
5,2.0258,2.083269,0.465628,0.105853,0.113886,0.088403


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-16 03:19:56,265] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.004431460824091724, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6873,1.040074,0.80385,0.717257,0.662883,0.670869
2,0.0573,1.064439,0.833181,0.802077,0.736471,0.753878
3,0.0295,1.317631,0.813016,0.748322,0.684604,0.701854
4,0.0262,1.385487,0.812099,0.756542,0.688644,0.702084
5,0.025,1.463961,0.817599,0.761148,0.699796,0.71166
6,0.0229,1.527851,0.821265,0.811874,0.706815,0.738338
7,0.0176,1.758508,0.821265,0.733941,0.682177,0.688143
8,0.0159,1.88767,0.817599,0.775228,0.729005,0.737526
9,0.0123,1.818401,0.827681,0.777028,0.741822,0.744201
10,0.0149,1.969149,0.819432,0.766939,0.732842,0.737004


[I 2025-03-16 03:25:52,053] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 0.0029440095539669865, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7515,1.044364,0.810266,0.740051,0.674439,0.692483
2,0.0631,1.112793,0.799267,0.787498,0.714563,0.733025
3,0.0245,1.353996,0.808433,0.748646,0.7127,0.721446
4,0.0229,1.28365,0.802933,0.766319,0.682916,0.70882
5,0.0156,1.517973,0.808433,0.750358,0.702132,0.704933
6,0.0101,1.507657,0.815765,0.751568,0.715717,0.720757
7,0.0136,1.597969,0.813016,0.809918,0.728659,0.750504
8,0.0083,1.743404,0.812099,0.723694,0.676928,0.688258
9,0.0108,1.64872,0.813932,0.739665,0.709909,0.711294
10,0.0098,1.744829,0.811182,0.702341,0.702162,0.687838


[I 2025-03-16 03:28:46,198] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 0.0018740884748954057, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.898,1.000431,0.808433,0.719515,0.6326,0.65953
2,0.0816,1.19717,0.810266,0.777406,0.727768,0.738256
3,0.0282,1.320829,0.818515,0.748265,0.691861,0.701846
4,0.0192,1.423661,0.824931,0.795639,0.711447,0.732614
5,0.0111,1.367889,0.828598,0.749611,0.746098,0.731013
6,0.0101,1.430495,0.817599,0.766438,0.7032,0.716742
7,0.009,1.53148,0.809349,0.741368,0.703384,0.706706
8,0.0068,1.664237,0.816682,0.794776,0.705776,0.729333
9,0.0094,1.56813,0.816682,0.74786,0.713831,0.716218
10,0.0069,1.670683,0.823098,0.735538,0.71649,0.716346


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-16 03:38:16,110] Trial 89 finished with value: 0.7489816574976095 and parameters: {'learning_rate': 0.0018740884748954057, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 35}. Best is trial 22 with value: 0.777016449838325.


Trial 90 with params: {'learning_rate': 0.0008794243847821058, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2196,1.001806,0.752521,0.648642,0.549171,0.575453
2,0.1923,1.147062,0.783685,0.737967,0.672732,0.68505
3,0.0646,1.230097,0.797434,0.701501,0.647423,0.662023
4,0.0305,1.295081,0.8011,0.736165,0.684673,0.696979
5,0.0169,1.474518,0.793767,0.73606,0.688809,0.697082
6,0.0118,1.512162,0.8011,0.737534,0.674418,0.691568
7,0.0102,1.547023,0.800183,0.765394,0.682901,0.706548
8,0.0082,1.612372,0.807516,0.752716,0.696087,0.711542
9,0.0097,1.572202,0.813932,0.756321,0.695948,0.714278
10,0.0037,1.676594,0.808433,0.719509,0.6775,0.688069


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-16 03:41:31,963] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.002161349669270908, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 45}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8431,1.056107,0.799267,0.730529,0.656964,0.67626
2,0.0716,1.10706,0.810266,0.787027,0.709703,0.735265
3,0.0265,1.214316,0.814849,0.768125,0.719765,0.731078
4,0.0199,1.377069,0.813016,0.79187,0.719793,0.73903
5,0.0117,1.4067,0.815765,0.75497,0.711229,0.716788
6,0.012,1.476546,0.825848,0.792312,0.737968,0.753801
7,0.0088,1.466912,0.828598,0.792667,0.744383,0.756085
8,0.0082,1.484373,0.815765,0.781122,0.731583,0.741021
9,0.0075,1.598369,0.822181,0.783274,0.713789,0.735395
10,0.0077,1.676172,0.813016,0.781386,0.701869,0.726654


[I 2025-03-16 03:50:37,564] Trial 91 finished with value: 0.724959277326398 and parameters: {'learning_rate': 0.002161349669270908, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 45}. Best is trial 22 with value: 0.777016449838325.


Trial 92 with params: {'learning_rate': 1.3206453021903648e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3031,2.942386,0.296975,0.038908,0.05469,0.028757
2,2.7115,2.651075,0.338222,0.033792,0.067158,0.041657
3,2.4742,2.466373,0.382218,0.038264,0.080969,0.051697
4,2.3044,2.326166,0.406049,0.064096,0.088431,0.059286
5,2.1651,2.212105,0.436297,0.087302,0.099941,0.073269
6,2.0545,2.118305,0.460128,0.091173,0.111973,0.086959
7,1.9603,2.036409,0.483043,0.103667,0.124899,0.10009
8,1.8813,1.975824,0.492209,0.110766,0.130181,0.105618
9,1.815,1.918007,0.508708,0.139013,0.139694,0.117571
10,1.7538,1.863344,0.51604,0.133687,0.146197,0.126093


[I 2025-03-16 03:57:06,199] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 0.0036589493386690298, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7069,1.070321,0.807516,0.679127,0.66827,0.663262
2,0.0588,1.143445,0.815765,0.729089,0.698286,0.701879
3,0.0234,1.387722,0.812099,0.755458,0.736174,0.729122
4,0.0254,1.429372,0.813016,0.782246,0.723017,0.738617
5,0.017,1.477518,0.819432,0.736127,0.689696,0.702747
6,0.0131,1.685574,0.816682,0.789839,0.713977,0.739492
7,0.0142,1.725161,0.808433,0.765309,0.709383,0.71958
8,0.0145,1.90955,0.815765,0.780881,0.694724,0.717566
9,0.0163,1.777781,0.819432,0.763903,0.715454,0.725898
10,0.0086,1.862003,0.821265,0.755818,0.732291,0.733317


[I 2025-03-16 04:06:23,050] Trial 93 finished with value: 0.7539832581966835 and parameters: {'learning_rate': 0.0036589493386690298, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 40}. Best is trial 22 with value: 0.777016449838325.


Trial 94 with params: {'learning_rate': 0.003022724378501476, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7428,1.002303,0.813016,0.762745,0.662836,0.684943
2,0.0622,1.104141,0.813016,0.787185,0.708343,0.727964
3,0.0293,1.22579,0.815765,0.786839,0.72373,0.736853
4,0.0191,1.334638,0.822181,0.739281,0.685622,0.694907
5,0.013,1.429313,0.814849,0.741441,0.712164,0.712581
6,0.0172,1.436907,0.826764,0.720019,0.684247,0.684558
7,0.0112,1.397442,0.83593,0.789264,0.73204,0.745854
8,0.0121,1.672946,0.813932,0.742786,0.698308,0.702991
9,0.0117,1.485284,0.822181,0.756714,0.748087,0.737634
10,0.0069,1.702665,0.813016,0.782616,0.728373,0.737875


[I 2025-03-16 04:15:15,823] Trial 94 finished with value: 0.7552464768995477 and parameters: {'learning_rate': 0.003022724378501476, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 18}. Best is trial 22 with value: 0.777016449838325.


Trial 95 with params: {'learning_rate': 0.003235419113614414, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7312,1.094143,0.791934,0.749469,0.671504,0.695684
2,0.0622,1.130425,0.820348,0.769003,0.714485,0.728485
3,0.0265,1.264396,0.808433,0.763936,0.735951,0.730669
4,0.0189,1.492587,0.814849,0.74785,0.702206,0.715784
5,0.0206,1.507303,0.821265,0.787453,0.725308,0.741203
6,0.0153,1.597375,0.824931,0.792518,0.72833,0.740849
7,0.0112,1.704168,0.814849,0.75422,0.690901,0.707457
8,0.0109,1.810845,0.822181,0.773418,0.722114,0.735253
9,0.0097,1.795023,0.827681,0.777407,0.716075,0.729885
10,0.0088,1.863218,0.814849,0.748679,0.695244,0.708691


[I 2025-03-16 04:18:10,579] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.004200584320161594, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6939,1.092536,0.800183,0.766882,0.683623,0.708003
2,0.0594,1.149842,0.823098,0.738385,0.710464,0.714027
3,0.0289,1.182599,0.822181,0.80973,0.732784,0.751378
4,0.0294,1.433766,0.808433,0.770607,0.703632,0.725069
5,0.0222,1.487404,0.817599,0.774882,0.743275,0.74198
6,0.0172,1.665607,0.815765,0.79607,0.708818,0.737369
7,0.0171,1.783563,0.811182,0.775119,0.723553,0.736877
8,0.0134,1.815451,0.818515,0.757298,0.694894,0.715385
9,0.0133,1.948497,0.814849,0.760405,0.725353,0.732449
10,0.012,2.21624,0.806599,0.792732,0.715014,0.734612


[I 2025-03-16 04:23:58,041] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 1.2647062779972101e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3389,2.985312,0.295142,0.019518,0.054149,0.02803
2,2.7378,2.668165,0.335472,0.034956,0.066364,0.04138
3,2.495,2.488479,0.384968,0.038678,0.081763,0.052228
4,2.3284,2.352315,0.401467,0.063,0.086728,0.056586
5,2.1924,2.243876,0.426214,0.082814,0.096299,0.06922


[I 2025-03-16 04:25:32,349] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.002879141452584742, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8479,1.04889,0.794684,0.690822,0.664092,0.661354
2,0.0693,1.196214,0.800183,0.709985,0.667215,0.677726
3,0.026,1.361763,0.818515,0.790998,0.71603,0.731542
4,0.0217,1.467542,0.816682,0.765175,0.706135,0.720949
5,0.0151,1.409277,0.822181,0.784194,0.737893,0.747108
6,0.0191,1.546894,0.813016,0.785896,0.719764,0.73379
7,0.0107,1.71926,0.807516,0.724545,0.648282,0.672626
8,0.0106,1.739594,0.810266,0.717565,0.687192,0.689197
9,0.007,1.757178,0.808433,0.739167,0.680679,0.692401
10,0.0083,1.969923,0.821265,0.768304,0.689568,0.707724


[I 2025-03-16 04:31:39,414] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.0009961165962936351, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1091,0.97885,0.765353,0.640149,0.574844,0.588682
2,0.1558,1.122052,0.779102,0.736889,0.670572,0.680889
3,0.052,1.199669,0.805683,0.765227,0.700792,0.715968
4,0.0256,1.266914,0.807516,0.798688,0.6891,0.717958
5,0.0168,1.314385,0.805683,0.779624,0.697305,0.72182
6,0.0105,1.478223,0.8011,0.761221,0.680666,0.699431
7,0.0084,1.650587,0.797434,0.727038,0.665764,0.679351
8,0.0098,1.558964,0.799267,0.738389,0.671323,0.689067
9,0.0077,1.410024,0.813932,0.770207,0.724803,0.733111
10,0.0048,1.556514,0.800183,0.735625,0.66312,0.681054


[I 2025-03-16 04:34:45,482] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.001092630898738911, 'weight_decay': 0.006, 'adam_beta1': 0.99, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3247,1.103576,0.739688,0.539861,0.462094,0.478153
2,0.2487,1.25555,0.776352,0.691511,0.617756,0.638778
3,0.0646,1.245913,0.807516,0.727141,0.679419,0.685753
4,0.025,1.41009,0.812099,0.760325,0.677545,0.700119
5,0.0151,1.517952,0.814849,0.72948,0.699298,0.700682
6,0.0107,1.549944,0.817599,0.772757,0.688309,0.715341
7,0.0124,1.733733,0.792851,0.728592,0.65315,0.673355
8,0.0093,1.723308,0.815765,0.751247,0.702933,0.712416
9,0.0063,1.726955,0.806599,0.716721,0.687211,0.686586
10,0.0059,1.729242,0.815765,0.767417,0.70295,0.718562


[I 2025-03-16 04:43:57,565] Trial 100 finished with value: 0.7323657355512438 and parameters: {'learning_rate': 0.001092630898738911, 'weight_decay': 0.006, 'adam_beta1': 0.99, 'warmup_steps': 1}. Best is trial 22 with value: 0.777016449838325.


Trial 101 with params: {'learning_rate': 0.003174940186884278, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6709,0.976653,0.812099,0.798016,0.69595,0.727283
2,0.0586,1.065081,0.820348,0.79782,0.717574,0.738368
3,0.0228,1.272934,0.811182,0.756159,0.730557,0.731351
4,0.0208,1.294879,0.818515,0.751831,0.703327,0.710701
5,0.0153,1.48007,0.819432,0.794582,0.729132,0.738231
6,0.0135,1.433139,0.809349,0.76424,0.735401,0.728358
7,0.014,1.506306,0.821265,0.772655,0.721725,0.733752
8,0.006,1.677594,0.820348,0.782006,0.72723,0.740282
9,0.0135,1.536389,0.824015,0.776651,0.723783,0.735571
10,0.0082,1.631353,0.824931,0.739288,0.711914,0.712216


[I 2025-03-16 04:49:47,504] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.002717665049910737, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7954,1.034956,0.809349,0.75108,0.698422,0.711362
2,0.0685,1.069111,0.810266,0.762307,0.713112,0.720015
3,0.026,1.218969,0.826764,0.787376,0.726629,0.744745
4,0.0197,1.292353,0.822181,0.776706,0.709066,0.724324
5,0.0147,1.46831,0.810266,0.708062,0.680861,0.679963


[I 2025-03-16 04:51:17,163] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 1.546855136785054e-05, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3384,2.979682,0.293309,0.02082,0.052687,0.028928
2,2.6898,2.598147,0.35747,0.034638,0.073277,0.045886
3,2.3963,2.383657,0.399633,0.060713,0.08648,0.05735
4,2.2054,2.232248,0.428048,0.063414,0.096581,0.069342
5,2.0576,2.111813,0.461045,0.100091,0.111692,0.086532


[I 2025-03-16 04:52:41,016] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.0040769140086474405, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6666,1.072376,0.814849,0.743906,0.694009,0.697741
2,0.06,1.222478,0.808433,0.750242,0.703054,0.7093
3,0.029,1.353734,0.810266,0.758242,0.711432,0.714171
4,0.0263,1.447755,0.816682,0.746411,0.687143,0.699078
5,0.0208,1.450105,0.823098,0.7723,0.727707,0.739542
6,0.0165,1.596013,0.824015,0.724918,0.722728,0.704004
7,0.0185,1.536391,0.831347,0.797791,0.740684,0.755023
8,0.0155,1.613087,0.824931,0.75052,0.728171,0.72239
9,0.0129,1.762516,0.814849,0.758385,0.693321,0.70823
10,0.0128,1.950178,0.825848,0.794468,0.742817,0.754021


[I 2025-03-16 05:01:47,647] Trial 104 finished with value: 0.7369590936198733 and parameters: {'learning_rate': 0.0040769140086474405, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 9}. Best is trial 22 with value: 0.777016449838325.


Trial 105 with params: {'learning_rate': 0.004855660294332716, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6517,0.989455,0.822181,0.75031,0.680201,0.701868
2,0.0566,1.184307,0.811182,0.723225,0.701176,0.700801
3,0.0322,1.309018,0.832264,0.741498,0.716864,0.717205
4,0.0294,1.45811,0.821265,0.803831,0.718464,0.739267
5,0.023,1.733714,0.818515,0.769044,0.690676,0.71226
6,0.0257,1.523685,0.812099,0.762595,0.702792,0.717078
7,0.015,1.851405,0.825848,0.743702,0.711185,0.717162
8,0.0153,1.856748,0.816682,0.764896,0.697934,0.714137
9,0.0247,1.991254,0.806599,0.738527,0.683053,0.699113
10,0.0135,1.978518,0.816682,0.794906,0.715133,0.740189


[I 2025-03-16 05:10:52,216] Trial 105 finished with value: 0.7681301594039637 and parameters: {'learning_rate': 0.004855660294332716, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 36}. Best is trial 22 with value: 0.777016449838325.


Trial 106 with params: {'learning_rate': 0.003220715492685992, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7095,1.015906,0.813932,0.752502,0.680803,0.70278
2,0.0581,1.168118,0.814849,0.791637,0.700635,0.721555
3,0.0273,1.233184,0.822181,0.788243,0.725655,0.733377
4,0.0177,1.409058,0.814849,0.781921,0.709262,0.720864
5,0.0147,1.525667,0.813016,0.751649,0.716164,0.716143
6,0.0178,1.632656,0.802933,0.802366,0.722277,0.745935
7,0.0136,1.4693,0.836847,0.808533,0.734677,0.755366
8,0.0122,1.691276,0.823098,0.785297,0.72832,0.741236
9,0.0101,1.656224,0.828598,0.772644,0.712467,0.725555
10,0.0109,1.786328,0.835014,0.796013,0.734734,0.745404


[I 2025-03-16 05:20:17,292] Trial 106 finished with value: 0.7703947179667631 and parameters: {'learning_rate': 0.003220715492685992, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 21}. Best is trial 22 with value: 0.777016449838325.


Trial 107 with params: {'learning_rate': 0.0008434893094761935, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1556,0.988088,0.75802,0.625144,0.555834,0.571183
2,0.1952,1.142272,0.771769,0.704127,0.650592,0.657772
3,0.0633,1.245793,0.791934,0.735383,0.652455,0.676746
4,0.0331,1.315137,0.797434,0.711994,0.6666,0.67571
5,0.0187,1.39934,0.796517,0.729801,0.673932,0.686605
6,0.0122,1.415907,0.80385,0.738673,0.687186,0.694732
7,0.0088,1.460284,0.80385,0.743432,0.682536,0.695047
8,0.0086,1.689006,0.790101,0.759857,0.652756,0.678193
9,0.0082,1.689471,0.794684,0.7171,0.667685,0.674987
10,0.0059,1.604617,0.80385,0.740227,0.686991,0.699043


[I 2025-03-16 05:29:02,192] Trial 107 finished with value: 0.7375505907451537 and parameters: {'learning_rate': 0.0008434893094761935, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 14}. Best is trial 22 with value: 0.777016449838325.


Trial 108 with params: {'learning_rate': 0.001641753874243815, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8659,1.021407,0.788268,0.711144,0.62972,0.655401
2,0.0905,1.301159,0.788268,0.730711,0.664018,0.680747
3,0.0305,1.26913,0.817599,0.726896,0.704502,0.709687
4,0.0199,1.394006,0.810266,0.765399,0.681688,0.708393
5,0.0142,1.445937,0.816682,0.751365,0.68915,0.709054
6,0.0107,1.568368,0.816682,0.73999,0.690837,0.699643
7,0.0101,1.541591,0.815765,0.757805,0.688917,0.709898
8,0.0097,1.610857,0.826764,0.733543,0.685509,0.695893
9,0.0057,1.60738,0.812099,0.779711,0.720283,0.732183
10,0.0032,1.708868,0.823098,0.759449,0.704385,0.715575


[I 2025-03-16 05:31:53,367] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.0037425751153574442, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6882,0.995419,0.817599,0.751065,0.691491,0.706018
2,0.0579,1.213452,0.812099,0.771662,0.688819,0.715041
3,0.026,1.368798,0.813016,0.735137,0.706989,0.712655
4,0.0235,1.444345,0.816682,0.781486,0.708057,0.727913
5,0.0201,1.635019,0.809349,0.824644,0.72875,0.755901
6,0.0152,1.626347,0.814849,0.746503,0.708334,0.713997
7,0.0154,1.85025,0.813016,0.796871,0.718141,0.739167
8,0.0143,1.822883,0.823098,0.726828,0.707724,0.70109
9,0.0126,1.852927,0.822181,0.768696,0.719279,0.724842
10,0.0094,1.932319,0.828598,0.753162,0.728669,0.727179


[I 2025-03-16 05:40:49,799] Trial 109 finished with value: 0.7447314216929327 and parameters: {'learning_rate': 0.0037425751153574442, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 26}. Best is trial 22 with value: 0.777016449838325.


Trial 110 with params: {'learning_rate': 0.0046032445747569515, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6056,1.049272,0.79835,0.760914,0.662916,0.688253
2,0.0568,1.201903,0.822181,0.775575,0.708075,0.726717
3,0.0294,1.35051,0.818515,0.767613,0.701293,0.716285
4,0.0338,1.445034,0.812099,0.768374,0.719459,0.727758
5,0.0244,1.593768,0.812099,0.73408,0.710114,0.708058
6,0.0202,1.756124,0.818515,0.7271,0.709206,0.702553
7,0.0233,1.71642,0.813932,0.769266,0.715777,0.730548
8,0.0161,1.943137,0.815765,0.771157,0.707854,0.723869
9,0.0146,1.949203,0.822181,0.772542,0.738282,0.735513
10,0.0104,1.744444,0.829514,0.77919,0.743625,0.747118


[I 2025-03-16 05:49:59,658] Trial 110 finished with value: 0.7530769772157083 and parameters: {'learning_rate': 0.0046032445747569515, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 10}. Best is trial 22 with value: 0.777016449838325.


Trial 111 with params: {'learning_rate': 0.003548328615397719, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8592,1.164524,0.792851,0.716876,0.627805,0.654333
2,0.0825,1.313598,0.810266,0.710958,0.658176,0.667015
3,0.0284,1.309027,0.818515,0.744729,0.716776,0.722871
4,0.0208,1.518721,0.80385,0.762833,0.682516,0.700791
5,0.0201,1.781291,0.824015,0.820522,0.703369,0.731391
6,0.0194,1.798339,0.810266,0.713924,0.684463,0.684333
7,0.0159,1.801641,0.810266,0.789053,0.694286,0.723344
8,0.0124,1.82869,0.813016,0.771629,0.708911,0.72014
9,0.0114,1.669809,0.814849,0.745265,0.703303,0.70605
10,0.0085,1.948849,0.806599,0.738779,0.692129,0.702061


[I 2025-03-16 05:53:06,069] Trial 111 pruned. 


Trial 112 with params: {'learning_rate': 0.0022288771519664084, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8166,1.006223,0.796517,0.76527,0.666625,0.696024
2,0.0692,1.257073,0.800183,0.763108,0.714284,0.717896
3,0.0286,1.261803,0.813016,0.756544,0.72183,0.726448
4,0.0183,1.229711,0.825848,0.788408,0.737991,0.750891
5,0.0124,1.359151,0.819432,0.750644,0.722488,0.724571
6,0.0103,1.477777,0.816682,0.763276,0.710298,0.719287
7,0.0107,1.522413,0.816682,0.767467,0.718991,0.729384
8,0.0092,1.71748,0.813932,0.775084,0.713101,0.72943
9,0.0059,1.659591,0.820348,0.810155,0.74465,0.759707
10,0.0076,1.68893,0.822181,0.783618,0.732203,0.741207


[I 2025-03-16 06:02:33,268] Trial 112 finished with value: 0.7572231295742498 and parameters: {'learning_rate': 0.0022288771519664084, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 38}. Best is trial 22 with value: 0.777016449838325.


Trial 113 with params: {'learning_rate': 0.004755080636231557, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6598,1.016166,0.808433,0.770234,0.697814,0.721064
2,0.0576,1.127866,0.817599,0.802301,0.718883,0.740963
3,0.029,1.468755,0.8011,0.729071,0.683973,0.687725
4,0.0332,1.502585,0.814849,0.74796,0.676937,0.693357
5,0.0254,1.556451,0.818515,0.795982,0.728903,0.740878
6,0.0252,1.529567,0.814849,0.751028,0.713834,0.715636
7,0.0238,1.869564,0.812099,0.769957,0.698841,0.714876
8,0.0152,1.739863,0.825848,0.775309,0.719011,0.730959
9,0.012,1.969014,0.810266,0.760025,0.700358,0.714222
10,0.0108,1.942862,0.816682,0.769057,0.700459,0.714924


[I 2025-03-16 06:05:42,693] Trial 113 pruned. 


Trial 114 with params: {'learning_rate': 0.0008457539411883055, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1915,0.961776,0.757104,0.610091,0.555119,0.568165
2,0.1964,1.040425,0.790101,0.707743,0.663895,0.667976
3,0.0642,1.176276,0.80385,0.746446,0.691821,0.704794
4,0.0342,1.331053,0.796517,0.728635,0.693007,0.690703
5,0.0184,1.48564,0.796517,0.715517,0.664794,0.670064
6,0.0135,1.463565,0.800183,0.753489,0.695703,0.7074
7,0.0122,1.378541,0.812099,0.720306,0.676808,0.687137
8,0.0064,1.54862,0.816682,0.738492,0.688724,0.698099
9,0.0053,1.529205,0.799267,0.732728,0.694841,0.696613
10,0.0076,1.631162,0.805683,0.723566,0.669095,0.679315


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-16 06:08:52,450] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.0010008849497856638, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0951,1.00117,0.767186,0.645979,0.563047,0.58206
2,0.1565,1.114949,0.787351,0.709935,0.66957,0.675761
3,0.0511,1.202687,0.80385,0.736459,0.678141,0.689649
4,0.0264,1.375909,0.8011,0.746321,0.676508,0.692387
5,0.0156,1.42956,0.805683,0.723357,0.674773,0.681986
6,0.0103,1.416659,0.808433,0.711743,0.675137,0.675926
7,0.0107,1.577114,0.790101,0.708468,0.638574,0.648585
8,0.0061,1.649561,0.800183,0.718894,0.669704,0.678646
9,0.007,1.640244,0.80385,0.733855,0.687445,0.69603
10,0.0081,1.583273,0.816682,0.74094,0.688724,0.704524


[I 2025-03-16 06:11:46,827] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.002698228507458894, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7669,1.015988,0.8011,0.746356,0.687123,0.702256
2,0.0624,1.168344,0.824931,0.773119,0.738393,0.74511
3,0.0239,1.301963,0.811182,0.779921,0.716575,0.730881
4,0.0164,1.379955,0.833181,0.798645,0.737164,0.749396
5,0.0172,1.286056,0.829514,0.759889,0.723793,0.730045
6,0.0104,1.437836,0.827681,0.783401,0.751284,0.750337
7,0.0118,1.627381,0.824931,0.771992,0.727973,0.732445
8,0.0134,1.719615,0.821265,0.765855,0.725561,0.730981
9,0.0087,1.644232,0.826764,0.789145,0.715307,0.728544
10,0.0069,1.831489,0.815765,0.788641,0.71839,0.735959


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--

Trial 117 with params: {'learning_rate': 0.004432653262014441, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.697,1.152506,0.796517,0.732072,0.668747,0.681135
2,0.058,1.251627,0.809349,0.7263,0.712364,0.708842
3,0.0294,1.419015,0.823098,0.782375,0.721914,0.740082
4,0.0332,1.465072,0.824015,0.775836,0.712867,0.729268
5,0.0181,1.542342,0.817599,0.752602,0.742781,0.733267
6,0.0207,1.818386,0.813932,0.744943,0.684977,0.698485
7,0.0181,1.788602,0.824931,0.775562,0.708092,0.725447
8,0.0169,1.945074,0.813016,0.753815,0.723536,0.719639
9,0.0134,1.960699,0.808433,0.748677,0.698126,0.707957
10,0.0106,2.148803,0.808433,0.799404,0.723529,0.741814


[I 2025-03-16 06:26:33,434] Trial 117 finished with value: 0.7523858544930248 and parameters: {'learning_rate': 0.004432653262014441, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 53}. Best is trial 22 with value: 0.777016449838325.


Trial 118 with params: {'learning_rate': 0.002561536082215949, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7689,1.036001,0.796517,0.67059,0.669071,0.656955
2,0.0661,1.137145,0.811182,0.725271,0.702481,0.698978
3,0.0254,1.392814,0.823098,0.727738,0.717901,0.707774
4,0.0187,1.335269,0.824015,0.791912,0.740324,0.74673
5,0.0132,1.518577,0.816682,0.750177,0.695056,0.708604
6,0.0138,1.531238,0.822181,0.7798,0.716216,0.733017
7,0.0108,1.606303,0.824931,0.781234,0.74174,0.747752
8,0.0115,1.687537,0.815765,0.780196,0.71903,0.733983
9,0.009,1.661516,0.824015,0.803215,0.732169,0.749568
10,0.0063,1.650333,0.825848,0.751062,0.729537,0.72804


[I 2025-03-16 06:29:47,326] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.004648719836846262, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6492,1.086464,0.8011,0.789312,0.692076,0.719957
2,0.0566,1.22446,0.80385,0.729263,0.684173,0.691669
3,0.0295,1.36001,0.809349,0.758195,0.71636,0.71764
4,0.0277,1.532467,0.808433,0.761232,0.712558,0.720488
5,0.0272,1.687414,0.812099,0.744726,0.701016,0.709352
6,0.0267,1.804104,0.811182,0.731326,0.684067,0.69343
7,0.0187,1.815126,0.815765,0.759722,0.709427,0.717206
8,0.0143,1.90915,0.804766,0.782449,0.718237,0.732352
9,0.0139,2.008954,0.812099,0.752701,0.700429,0.710967
10,0.0137,2.212797,0.802933,0.72862,0.678574,0.687281


[I 2025-03-16 06:35:51,651] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 0.002602592973871715, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8927,1.004483,0.792851,0.706202,0.660292,0.667593
2,0.0792,1.176715,0.813016,0.755699,0.686961,0.705711
3,0.0289,1.325904,0.809349,0.713804,0.66692,0.677956
4,0.0178,1.370663,0.819432,0.795076,0.68445,0.71688
5,0.0171,1.351982,0.828598,0.786751,0.732298,0.745379
6,0.0115,1.528109,0.823098,0.790003,0.680211,0.717146
7,0.0092,1.566973,0.822181,0.780017,0.719838,0.734584
8,0.0132,1.780656,0.807516,0.784443,0.706599,0.728977
9,0.0103,1.73515,0.810266,0.749782,0.686743,0.700732
10,0.0104,1.731896,0.821265,0.770603,0.689768,0.713391


[I 2025-03-16 06:44:42,868] Trial 120 finished with value: 0.7358110531047592 and parameters: {'learning_rate': 0.002602592973871715, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 9}. Best is trial 22 with value: 0.777016449838325.


Trial 121 with params: {'learning_rate': 0.0024688808604935277, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9914,1.042896,0.802933,0.712057,0.646313,0.662132
2,0.0773,1.18734,0.811182,0.780416,0.72643,0.73884
3,0.031,1.23657,0.817599,0.803783,0.711856,0.742194
4,0.0199,1.307694,0.815765,0.786594,0.694831,0.722346
5,0.0156,1.432,0.816682,0.761436,0.684663,0.708009
6,0.0106,1.446942,0.826764,0.802273,0.722771,0.744248
7,0.0125,1.533889,0.808433,0.797751,0.698158,0.727328
8,0.011,1.694765,0.827681,0.786728,0.710258,0.73141
9,0.0073,1.63833,0.824931,0.778408,0.717417,0.732893
10,0.0066,1.705967,0.817599,0.791585,0.71983,0.738278


[I 2025-03-16 06:50:33,556] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 0.00442274715683165, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.628,1.060601,0.814849,0.719118,0.670458,0.67894
2,0.0549,1.230415,0.813932,0.725524,0.683041,0.692291
3,0.0324,1.37807,0.810266,0.733359,0.665467,0.681
4,0.0274,1.366062,0.833181,0.783707,0.744631,0.752282
5,0.0226,1.606194,0.799267,0.7987,0.687473,0.722589
6,0.0232,1.502376,0.829514,0.74103,0.691428,0.701783
7,0.0139,1.729114,0.817599,0.767338,0.693571,0.709568
8,0.0155,1.793746,0.822181,0.789062,0.732782,0.749871
9,0.0157,1.853728,0.824015,0.777729,0.71676,0.731681
10,0.0101,1.875511,0.820348,0.720102,0.705425,0.699645


[I 2025-03-16 06:53:38,884] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.004988609184868615, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6487,1.096205,0.80385,0.724599,0.639907,0.665812
2,0.0569,1.218797,0.824931,0.759868,0.711508,0.717272
3,0.0335,1.382543,0.812099,0.770099,0.722115,0.733079
4,0.0323,1.438164,0.808433,0.729268,0.675643,0.688299
5,0.0305,1.580824,0.810266,0.74487,0.700574,0.706147
6,0.0227,1.70231,0.813016,0.745548,0.689383,0.704705
7,0.0181,1.854552,0.824931,0.785885,0.71689,0.735293
8,0.018,1.986037,0.807516,0.742408,0.691591,0.701945
9,0.0173,2.053723,0.79835,0.755444,0.702437,0.713865
10,0.017,1.927908,0.796517,0.730559,0.675687,0.691474


[I 2025-03-16 06:56:44,788] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.0038585271781310534, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7096,1.044104,0.804766,0.794262,0.678847,0.717127
2,0.0571,1.219378,0.812099,0.757717,0.674305,0.699736
3,0.0276,1.359851,0.8011,0.751714,0.705191,0.716961
4,0.0214,1.410977,0.811182,0.799757,0.745966,0.756373
5,0.0221,1.51815,0.820348,0.818373,0.753434,0.769409
6,0.0191,1.508345,0.830431,0.767799,0.745068,0.743288
7,0.0143,1.520913,0.828598,0.817005,0.757928,0.773165
8,0.012,1.697963,0.830431,0.767395,0.76115,0.751022
9,0.0116,1.795294,0.826764,0.800945,0.742889,0.758367
10,0.0109,1.944749,0.823098,0.828162,0.752291,0.768426


[I 2025-03-16 07:06:02,153] Trial 124 finished with value: 0.7591124687215205 and parameters: {'learning_rate': 0.0038585271781310534, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 53}. Best is trial 22 with value: 0.777016449838325.


Trial 125 with params: {'learning_rate': 0.002566553696300354, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 50}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8098,1.009427,0.8011,0.761161,0.669376,0.699032
2,0.0644,1.182362,0.805683,0.769163,0.711748,0.71941
3,0.0252,1.23216,0.818515,0.791802,0.737503,0.745316
4,0.0174,1.328749,0.819432,0.780873,0.710508,0.72161
5,0.0165,1.368809,0.809349,0.738855,0.704488,0.704368
6,0.0133,1.445978,0.821265,0.778858,0.707977,0.72944
7,0.0102,1.55129,0.820348,0.818487,0.728679,0.747735
8,0.0086,1.652106,0.819432,0.763837,0.711634,0.72517
9,0.0099,1.632591,0.817599,0.774995,0.737119,0.743106
10,0.0074,1.653111,0.817599,0.787808,0.722118,0.740035


[I 2025-03-16 07:12:03,758] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.001655075653763441, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9396,0.966789,0.789184,0.722154,0.626858,0.656163
2,0.0861,1.153314,0.800183,0.765926,0.69278,0.710664
3,0.0324,1.202118,0.806599,0.758703,0.720953,0.727075
4,0.0163,1.251616,0.812099,0.749329,0.708558,0.717302
5,0.0135,1.486575,0.807516,0.77348,0.696396,0.71835
6,0.0133,1.384272,0.822181,0.763153,0.704926,0.720383
7,0.0092,1.494486,0.821265,0.799155,0.726626,0.745886
8,0.0064,1.610919,0.817599,0.772582,0.709656,0.72559
9,0.0076,1.560459,0.812099,0.733647,0.681894,0.688799
10,0.0057,1.634653,0.813932,0.777686,0.708291,0.725548


[I 2025-03-16 07:15:31,983] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.0026924579668531137, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 50}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7936,0.97148,0.813932,0.756462,0.688373,0.707865
2,0.0606,1.104771,0.812099,0.819757,0.721397,0.749206
3,0.0248,1.182287,0.819432,0.766948,0.72968,0.734769
4,0.019,1.357416,0.813016,0.771545,0.693001,0.712437
5,0.0158,1.30662,0.822181,0.779842,0.716682,0.730315
6,0.0128,1.537905,0.822181,0.799646,0.732955,0.750102
7,0.0122,1.597875,0.824015,0.773367,0.720985,0.726442
8,0.0099,1.603937,0.815765,0.792443,0.723311,0.74129
9,0.0101,1.548744,0.819432,0.765581,0.719522,0.729715
10,0.0058,1.664382,0.827681,0.779663,0.713292,0.72819


[I 2025-03-16 07:24:48,410] Trial 127 finished with value: 0.7501322528795054 and parameters: {'learning_rate': 0.0026924579668531137, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 50}. Best is trial 22 with value: 0.777016449838325.


Trial 128 with params: {'learning_rate': 0.00458003365200007, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 53}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7169,1.051328,0.813932,0.788421,0.697406,0.725466
2,0.0579,1.21825,0.806599,0.783972,0.687166,0.712496
3,0.0315,1.522348,0.809349,0.772411,0.709362,0.726609
4,0.0288,1.616464,0.808433,0.767381,0.687281,0.709869
5,0.0277,1.694208,0.812099,0.820816,0.743791,0.76464
6,0.0184,1.796224,0.812099,0.79951,0.697269,0.728998
7,0.019,1.835498,0.821265,0.754447,0.72791,0.722772
8,0.0173,2.094683,0.816682,0.806042,0.728917,0.752916
9,0.0185,1.961818,0.810266,0.760726,0.707923,0.717682
10,0.0134,1.942478,0.833181,0.774211,0.726751,0.73585


[I 2025-03-16 07:30:52,049] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.004732194814967369, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 52}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6689,1.137443,0.806599,0.732616,0.660576,0.677379
2,0.0577,1.121029,0.810266,0.753163,0.700707,0.706467
3,0.0313,1.362295,0.824015,0.784431,0.743747,0.751069
4,0.0289,1.506167,0.822181,0.76587,0.73833,0.736462
5,0.0253,1.713694,0.813932,0.789312,0.729678,0.738756
6,0.0248,1.587117,0.813016,0.768818,0.700136,0.717027
7,0.0191,1.868495,0.815765,0.726539,0.661761,0.679246
8,0.0138,2.147075,0.800183,0.757945,0.677514,0.695667
9,0.0161,1.909133,0.812099,0.741575,0.689718,0.702624
10,0.0111,2.138232,0.805683,0.759527,0.701441,0.718767


[I 2025-03-16 07:34:00,274] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.004395622399382944, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6892,1.000178,0.811182,0.742353,0.674116,0.692021
2,0.0564,1.165069,0.821265,0.75678,0.691912,0.711405
3,0.0318,1.321529,0.820348,0.781028,0.724906,0.737397
4,0.0283,1.447094,0.823098,0.756981,0.726663,0.726185
5,0.023,1.431603,0.824015,0.784373,0.703625,0.726029
6,0.0191,1.746036,0.814849,0.710292,0.670971,0.674673
7,0.0187,1.674886,0.820348,0.759652,0.7206,0.725728
8,0.0127,1.726438,0.824015,0.76227,0.704641,0.717004
9,0.0127,1.816871,0.815765,0.757413,0.68612,0.705781
10,0.015,1.891561,0.824015,0.787122,0.712172,0.73426


[I 2025-03-16 07:43:00,631] Trial 130 finished with value: 0.7305639167875856 and parameters: {'learning_rate': 0.004395622399382944, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 40}. Best is trial 22 with value: 0.777016449838325.


Trial 131 with params: {'learning_rate': 0.00121679416733383, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0333,0.993043,0.766269,0.624483,0.558073,0.575609
2,0.127,1.10481,0.799267,0.719971,0.679523,0.684129
3,0.0408,1.244856,0.80385,0.748813,0.698062,0.7091
4,0.0212,1.310305,0.822181,0.749907,0.714703,0.721218
5,0.0142,1.322279,0.813016,0.693739,0.689158,0.677981
6,0.0094,1.397002,0.821265,0.749047,0.712769,0.719405
7,0.0106,1.394297,0.809349,0.737126,0.713548,0.706815
8,0.0089,1.461807,0.814849,0.780856,0.713629,0.734443
9,0.0075,1.454562,0.811182,0.762769,0.700309,0.719524
10,0.0029,1.565568,0.812099,0.71736,0.706254,0.703799


[I 2025-03-16 07:48:49,374] Trial 131 pruned. 


Trial 132 with params: {'learning_rate': 0.0021970060612283102, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.875,1.019487,0.797434,0.709793,0.651679,0.664571
2,0.072,1.194766,0.802016,0.774111,0.710671,0.724929
3,0.0271,1.265846,0.813016,0.77115,0.735398,0.74063
4,0.0181,1.405792,0.808433,0.794392,0.692599,0.721033
5,0.0116,1.526867,0.819432,0.780624,0.718114,0.738032
6,0.0133,1.509616,0.823098,0.785773,0.715152,0.736258
7,0.0121,1.423903,0.817599,0.773312,0.72049,0.730774
8,0.008,1.72763,0.807516,0.747353,0.68556,0.699978
9,0.0073,1.686567,0.813016,0.772997,0.705361,0.725073
10,0.0066,1.739756,0.826764,0.759049,0.723146,0.727197


[I 2025-03-16 07:54:52,175] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 0.0027268198035749073, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0601,1.052324,0.782768,0.665104,0.591074,0.610195
2,0.1145,1.167373,0.808433,0.787727,0.711453,0.733015
3,0.0331,1.211309,0.821265,0.719222,0.679895,0.687061
4,0.016,1.424948,0.816682,0.774744,0.704018,0.718686
5,0.0121,1.506668,0.819432,0.759248,0.686136,0.705834
6,0.0132,1.57204,0.816682,0.777473,0.715162,0.728811
7,0.0155,1.733211,0.812099,0.770161,0.681013,0.707612
8,0.0124,1.687837,0.814849,0.74358,0.709916,0.711787
9,0.0106,1.73593,0.815765,0.765613,0.70877,0.726559
10,0.0061,1.835586,0.813932,0.746159,0.714563,0.717208


[I 2025-03-16 08:00:52,321] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 0.0012505577383353551, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.068,0.994484,0.769936,0.609352,0.567853,0.572135
2,0.1293,1.178466,0.793767,0.752193,0.671203,0.687179
3,0.0438,1.269909,0.797434,0.700032,0.67383,0.675735
4,0.0225,1.343758,0.802016,0.777695,0.701933,0.721557
5,0.0131,1.467103,0.79835,0.760345,0.676974,0.698363


[I 2025-03-16 08:02:15,358] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.0015651995499490585, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9495,0.983479,0.783685,0.705598,0.602895,0.627681
2,0.0954,1.164577,0.80385,0.735281,0.673899,0.684812
3,0.0316,1.275005,0.809349,0.755609,0.671415,0.696813
4,0.0182,1.397682,0.811182,0.745824,0.662055,0.689247
5,0.0131,1.272735,0.827681,0.79313,0.737703,0.754725
6,0.0098,1.468376,0.811182,0.74406,0.690259,0.700395
7,0.01,1.523238,0.812099,0.776009,0.710585,0.727076
8,0.0079,1.535938,0.804766,0.753989,0.683577,0.700091
9,0.0087,1.643216,0.804766,0.745963,0.697585,0.703373
10,0.0055,1.648413,0.813932,0.79182,0.727878,0.740179


[I 2025-03-16 08:11:19,837] Trial 135 finished with value: 0.7461503704273683 and parameters: {'learning_rate': 0.0015651995499490585, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 36}. Best is trial 22 with value: 0.777016449838325.


Trial 136 with params: {'learning_rate': 0.004671247634031377, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8094,1.113727,0.7956,0.713824,0.636106,0.650406
2,0.0775,1.384334,0.796517,0.67457,0.655286,0.65218
3,0.0361,1.432321,0.821265,0.753639,0.694106,0.70803
4,0.0279,1.513089,0.815765,0.766352,0.692829,0.712005
5,0.0262,1.627241,0.814849,0.766711,0.658773,0.687587


[I 2025-03-16 08:12:45,378] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 0.004959689319927438, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.703,1.068765,0.808433,0.739314,0.691733,0.70046
2,0.0626,1.226081,0.824015,0.773066,0.707669,0.724573
3,0.0364,1.284275,0.813016,0.733469,0.692524,0.7019
4,0.0243,1.538509,0.813932,0.775617,0.727713,0.731839
5,0.033,1.478074,0.820348,0.766779,0.725231,0.725811
6,0.0201,1.762818,0.810266,0.715026,0.66763,0.67853
7,0.0239,1.638288,0.813016,0.806189,0.720917,0.745861
8,0.024,1.641599,0.819432,0.766522,0.728192,0.735128
9,0.0153,1.989128,0.811182,0.775922,0.702613,0.723949
10,0.0143,1.886569,0.818515,0.79819,0.71609,0.738456


[I 2025-03-16 08:18:47,282] Trial 137 pruned. 


Trial 138 with params: {'learning_rate': 0.0034510961143013547, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8904,1.133276,0.79835,0.737286,0.644625,0.669583
2,0.0889,1.197198,0.79835,0.712821,0.645529,0.66321
3,0.0302,1.374525,0.813016,0.761046,0.684403,0.706102
4,0.0225,1.64397,0.818515,0.735489,0.698744,0.700865
5,0.0192,1.522056,0.816682,0.781465,0.707675,0.729437
6,0.0162,1.685503,0.817599,0.744518,0.703168,0.70997
7,0.0156,1.707868,0.807516,0.747821,0.665039,0.685014
8,0.0145,1.926503,0.814849,0.758295,0.698052,0.712215
9,0.0127,1.742767,0.813016,0.751066,0.698283,0.708151
10,0.0106,1.781993,0.819432,0.765625,0.703049,0.717572


[I 2025-03-16 08:25:13,141] Trial 138 pruned. 


Trial 139 with params: {'learning_rate': 1.1619982946199605e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 47}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3925,3.035507,0.27681,0.022299,0.047486,0.027605
2,2.7868,2.711981,0.329056,0.035488,0.064391,0.039778
3,2.5438,2.53318,0.375802,0.037222,0.078483,0.050033
4,2.3797,2.400928,0.39505,0.040307,0.084946,0.054403
5,2.2444,2.293384,0.418882,0.06669,0.093477,0.065784


[I 2025-03-16 08:26:53,191] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.0011907383945652516, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0069,0.973374,0.780018,0.658657,0.591914,0.606174
2,0.1294,1.158479,0.797434,0.754251,0.680933,0.691471
3,0.0411,1.218321,0.808433,0.742512,0.668007,0.69073
4,0.0225,1.324093,0.808433,0.782443,0.711296,0.733357
5,0.0131,1.301827,0.819432,0.759462,0.710574,0.722459
6,0.0108,1.407618,0.811182,0.786132,0.689902,0.721272
7,0.0094,1.52597,0.813932,0.781371,0.689942,0.71323
8,0.0099,1.567954,0.810266,0.749918,0.687621,0.705215
9,0.0085,1.553444,0.813016,0.796834,0.698604,0.728045
10,0.0053,1.433309,0.828598,0.80778,0.743541,0.760051


[I 2025-03-16 08:33:04,062] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 0.003080378734957893, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7409,0.931856,0.813016,0.715558,0.674078,0.681069
2,0.0607,1.127593,0.815765,0.76461,0.712511,0.722693
3,0.0266,1.282953,0.816682,0.771309,0.685954,0.710593
4,0.0194,1.305614,0.819432,0.778769,0.716221,0.729545
5,0.0152,1.48367,0.817599,0.723618,0.717041,0.710555
6,0.0149,1.505176,0.824931,0.763419,0.691659,0.705492
7,0.0133,1.681187,0.823098,0.781558,0.690195,0.719842
8,0.0074,1.646847,0.832264,0.793274,0.700124,0.728732
9,0.0076,1.64239,0.832264,0.782483,0.725186,0.737824
10,0.0138,1.668617,0.815765,0.779484,0.704008,0.727815


[I 2025-03-16 08:42:18,701] Trial 141 finished with value: 0.7583140123717402 and parameters: {'learning_rate': 0.003080378734957893, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 22}. Best is trial 22 with value: 0.777016449838325.


Trial 142 with params: {'learning_rate': 0.004449056322159956, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6414,1.033037,0.813016,0.782282,0.713166,0.732411
2,0.0607,1.225968,0.814849,0.805001,0.704485,0.728187
3,0.0319,1.335853,0.810266,0.732775,0.696259,0.698593
4,0.0327,1.348841,0.825848,0.778067,0.731016,0.74098
5,0.0194,1.446527,0.827681,0.790049,0.730566,0.737838
6,0.0175,1.491928,0.819432,0.783676,0.730164,0.738
7,0.0217,1.746181,0.828598,0.756603,0.70579,0.716834
8,0.0161,1.908298,0.818515,0.7515,0.720666,0.723374
9,0.0194,1.943783,0.823098,0.80466,0.714101,0.738263
10,0.0113,1.902535,0.83593,0.792182,0.719128,0.738998


[I 2025-03-16 08:51:15,282] Trial 142 finished with value: 0.7584623293812705 and parameters: {'learning_rate': 0.004449056322159956, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 12}. Best is trial 22 with value: 0.777016449838325.


Trial 143 with params: {'learning_rate': 0.0025880167028150746, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7415,1.034526,0.796517,0.719893,0.655414,0.668318
2,0.0632,1.144267,0.812099,0.784396,0.711918,0.728239
3,0.0274,1.306384,0.824931,0.755836,0.703835,0.715577
4,0.0193,1.292846,0.834097,0.802489,0.756937,0.764582
5,0.0109,1.492305,0.825848,0.798961,0.733094,0.751498
6,0.0162,1.421775,0.83868,0.785434,0.718003,0.733636
7,0.0106,1.455866,0.826764,0.770391,0.736517,0.740566
8,0.0104,1.477847,0.830431,0.769386,0.722016,0.730362
9,0.0087,1.646907,0.824015,0.7579,0.71555,0.71557
10,0.0087,1.742141,0.820348,0.780486,0.733431,0.737322


[I 2025-03-16 08:57:27,339] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 0.00438937660258649, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.634,0.992672,0.802016,0.757033,0.714258,0.718698
2,0.0635,1.182387,0.820348,0.765404,0.723735,0.723906
3,0.0279,1.259485,0.811182,0.751302,0.709785,0.713575
4,0.031,1.467391,0.810266,0.792386,0.693437,0.723115
5,0.0245,1.623215,0.823098,0.789795,0.721564,0.738792
6,0.0212,1.556507,0.821265,0.787585,0.722129,0.741185
7,0.013,1.754889,0.823098,0.803458,0.737043,0.749536
8,0.0181,1.832505,0.814849,0.777298,0.712249,0.724984
9,0.0135,1.677758,0.829514,0.767073,0.746741,0.742331
10,0.0089,1.832171,0.829514,0.78526,0.724831,0.740725


[I 2025-03-16 09:06:29,000] Trial 144 finished with value: 0.740271390576774 and parameters: {'learning_rate': 0.00438937660258649, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 12}. Best is trial 22 with value: 0.777016449838325.


Trial 145 with params: {'learning_rate': 0.0029639571582593762, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7506,1.034349,0.808433,0.768544,0.693607,0.710429
2,0.061,1.20732,0.799267,0.753916,0.695512,0.70973
3,0.0273,1.387851,0.812099,0.73134,0.673713,0.681659
4,0.0198,1.458403,0.824931,0.812375,0.705986,0.73716
5,0.0143,1.561587,0.818515,0.766645,0.718035,0.723681
6,0.0151,1.755501,0.807516,0.768015,0.681097,0.704758
7,0.0105,1.653929,0.811182,0.793031,0.7067,0.733936
8,0.0127,1.674949,0.825848,0.816542,0.710855,0.739663
9,0.0109,1.675884,0.824015,0.736803,0.706402,0.709939
10,0.0088,1.79299,0.822181,0.764563,0.713475,0.723971


[I 2025-03-16 09:12:44,860] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.003023138214394254, 'weight_decay': 0.001, 'adam_beta1': 0.91, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7319,1.033331,0.815765,0.773523,0.680925,0.705635
2,0.061,1.069592,0.822181,0.775974,0.713778,0.725641
3,0.0253,1.263649,0.816682,0.735613,0.70208,0.704143
4,0.0192,1.47057,0.816682,0.780618,0.694518,0.718303
5,0.0193,1.433349,0.814849,0.768482,0.711894,0.725174
6,0.0121,1.615777,0.812099,0.776204,0.697695,0.715029
7,0.0091,1.684529,0.815765,0.773772,0.711697,0.723604
8,0.011,1.816837,0.805683,0.775765,0.674093,0.70467
9,0.0112,1.779478,0.805683,0.700944,0.697968,0.682699
10,0.0103,1.811532,0.809349,0.734112,0.672964,0.686144


[I 2025-03-16 09:15:40,344] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 0.0008998164965906249, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1686,1.016308,0.759853,0.608166,0.519079,0.536064
2,0.1914,1.119272,0.791017,0.732901,0.67411,0.687036
3,0.0601,1.252266,0.791934,0.715821,0.651883,0.667395
4,0.0295,1.359776,0.794684,0.714152,0.665522,0.676854
5,0.0195,1.399134,0.809349,0.715604,0.678886,0.685956
6,0.0117,1.407349,0.8011,0.729777,0.658816,0.680187
7,0.0104,1.449019,0.811182,0.728097,0.681633,0.692263
8,0.0084,1.603237,0.809349,0.743491,0.702493,0.709882
9,0.0083,1.561291,0.79835,0.691373,0.669285,0.665877
10,0.0052,1.676724,0.812099,0.753385,0.676462,0.698237


[I 2025-03-16 09:18:44,735] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.0007989056410281233, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1481,0.959305,0.752521,0.607598,0.531593,0.552298
2,0.215,1.202959,0.767186,0.690664,0.622546,0.631773
3,0.0728,1.326272,0.770852,0.732857,0.637746,0.667055
4,0.0361,1.309523,0.781852,0.692728,0.616629,0.638941
5,0.0193,1.488183,0.777269,0.671487,0.644221,0.642742


[I 2025-03-16 09:20:16,470] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.0035402095463311564, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7243,1.080336,0.802016,0.70505,0.681962,0.681439
2,0.0612,1.119594,0.824931,0.749952,0.716785,0.719584
3,0.028,1.214155,0.818515,0.818062,0.750939,0.768778
4,0.0208,1.43604,0.808433,0.744527,0.702617,0.712024
5,0.0131,1.453037,0.819432,0.736543,0.727146,0.721623
6,0.0164,1.488394,0.818515,0.776342,0.703548,0.723873
7,0.0177,1.503127,0.820348,0.787713,0.729339,0.747126
8,0.0133,1.560423,0.823098,0.773747,0.705068,0.722077
9,0.0116,1.639396,0.819432,0.758279,0.708749,0.718563
10,0.0097,1.933671,0.813016,0.748431,0.698886,0.70512


[I 2025-03-16 09:29:44,156] Trial 149 finished with value: 0.7551578131716231 and parameters: {'learning_rate': 0.0035402095463311564, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 25}. Best is trial 22 with value: 0.777016449838325.


In [64]:
print(best_trial3)

BestRun(run_id='22', objective=0.777016449838325, hyperparameters={'learning_rate': 0.0026826241523527678, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 27}, run_summary=None)


In [65]:
base.reset_seed()

In [66]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_aug_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_aug_hp-search", remove_unused_columns=False, epochs=num_epochs, batch_size=batch_size)

In [67]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "adam_beta1" : trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
        "lambda_param": trial.suggest_float("lambda_param",0,1,step=.1),
        "temperature": trial.suggest_float("temperature", 2,7, step=.5)
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [68]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [69]:
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [70]:
best_trial4 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Test-Distill-aug",
    n_trials=150
)

[I 2025-03-16 09:29:44,460] A new study created in memory with name: Test-Distill-aug


Trial 0 with params: {'learning_rate': 0.0001025350969016849, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 32, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.642,1.280759,0.503208,0.132114,0.135255,0.109066
2,1.0551,1.018855,0.63428,0.237968,0.229314,0.211692
3,0.8116,0.87881,0.679193,0.305719,0.289219,0.283682
4,0.6637,0.816765,0.703941,0.336859,0.324906,0.315638
5,0.5642,0.766644,0.715857,0.404125,0.351942,0.354265


[I 2025-03-16 09:31:15,774] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 1.4347159517201392e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 38, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1589,1.921956,0.296059,0.037453,0.055045,0.026692
2,1.7453,1.680798,0.35472,0.034637,0.071704,0.044916
3,1.5751,1.566267,0.3978,0.044182,0.08574,0.055066
4,1.473,1.484092,0.428048,0.072603,0.096845,0.069587
5,1.3908,1.422888,0.446379,0.067487,0.103243,0.075309
6,1.3287,1.368327,0.470211,0.099661,0.114131,0.087634
7,1.2789,1.326956,0.488543,0.125524,0.124908,0.100363
8,1.2363,1.295577,0.494959,0.128698,0.129415,0.104876
9,1.2018,1.264966,0.510541,0.15276,0.13924,0.117818
10,1.1697,1.243351,0.52154,0.143612,0.146523,0.126863


[I 2025-03-16 09:34:12,058] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 0.001764971584817572, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 9, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.65,0.571268,0.80385,0.655001,0.560219,0.582614
2,0.1466,0.516468,0.819432,0.786711,0.708242,0.732165
3,0.0908,0.50991,0.830431,0.795225,0.702146,0.728329
4,0.079,0.531261,0.821265,0.799195,0.706025,0.730563
5,0.0744,0.510043,0.834097,0.817024,0.73367,0.759853
6,0.072,0.516507,0.822181,0.805875,0.705664,0.734277
7,0.0679,0.509737,0.831347,0.844654,0.72766,0.764533
8,0.0663,0.523162,0.824931,0.803381,0.7049,0.733285
9,0.0672,0.516094,0.828598,0.835807,0.742295,0.77145
10,0.0704,0.507807,0.831347,0.813937,0.710959,0.742943


[I 2025-03-16 09:40:20,604] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0001464895513280072, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 7, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4869,1.151296,0.583868,0.164925,0.185238,0.162491
2,0.8954,0.889281,0.672777,0.314841,0.288206,0.281238
3,0.6585,0.78747,0.715857,0.403784,0.346839,0.354746
4,0.5178,0.732436,0.732356,0.47134,0.375526,0.388786
5,0.4194,0.699739,0.751604,0.511699,0.440311,0.453245


[I 2025-03-16 09:41:53,416] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00017018418817029164, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 27, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4081,1.075705,0.601283,0.192477,0.202192,0.176032
2,0.8151,0.840154,0.693859,0.315082,0.309679,0.30017
3,0.5914,0.749153,0.732356,0.44818,0.370601,0.379822
4,0.4574,0.701456,0.741522,0.51758,0.416005,0.438019
5,0.3637,0.673472,0.761687,0.530694,0.46976,0.487813
6,0.2985,0.646373,0.771769,0.566948,0.493179,0.511373
7,0.2489,0.632497,0.787351,0.602817,0.536409,0.555232
8,0.2136,0.617353,0.799267,0.664209,0.589478,0.610796
9,0.1868,0.610547,0.80385,0.676724,0.604329,0.623809
10,0.1655,0.612723,0.797434,0.674886,0.584557,0.612206


[I 2025-03-16 09:45:04,633] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.00043625993625605574, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 51, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0809,0.764838,0.72319,0.38936,0.360561,0.35936
2,0.4201,0.619729,0.775435,0.527575,0.484107,0.490303
3,0.2384,0.588675,0.800183,0.672163,0.595801,0.6203
4,0.1609,0.566182,0.807516,0.749176,0.643773,0.679327
5,0.1251,0.549591,0.821265,0.776159,0.682076,0.709524
6,0.1055,0.544373,0.826764,0.811758,0.699414,0.733166
7,0.0946,0.543244,0.824931,0.788171,0.699632,0.726463
8,0.0871,0.534758,0.822181,0.804187,0.696076,0.730619
9,0.0807,0.536927,0.821265,0.815681,0.706007,0.741359
10,0.0775,0.545819,0.821265,0.782864,0.684307,0.714519


[I 2025-03-16 09:48:15,226] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 6.639623079859457e-05, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 23, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7381,1.396327,0.465628,0.093359,0.110499,0.083437
2,1.2063,1.176108,0.555454,0.155396,0.169876,0.148853
3,1.0123,1.036697,0.627864,0.217773,0.223979,0.201858
4,0.8644,0.944071,0.648029,0.26478,0.249447,0.236636
5,0.7516,0.874028,0.683776,0.30889,0.294808,0.286852
6,0.6677,0.832813,0.697525,0.33813,0.313208,0.311057
7,0.6026,0.805825,0.712191,0.363087,0.336599,0.334522
8,0.5494,0.786012,0.71769,0.406803,0.347799,0.35054
9,0.5086,0.768822,0.728689,0.436524,0.374587,0.380919
10,0.4701,0.756144,0.733272,0.454064,0.383402,0.394098


[I 2025-03-16 09:54:42,673] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 1.2382649697023537e-05, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 35, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1628,1.940821,0.295142,0.03851,0.054506,0.02798
2,1.7797,1.721373,0.337305,0.035115,0.066645,0.04155
3,1.6199,1.609785,0.382218,0.037971,0.080867,0.051289
4,1.5196,1.529963,0.407883,0.074473,0.088659,0.058888
5,1.4422,1.468945,0.437214,0.072396,0.099613,0.071821
6,1.3818,1.418436,0.448213,0.066784,0.104188,0.075975
7,1.3316,1.375832,0.469294,0.099362,0.114212,0.088525
8,1.2899,1.343943,0.483043,0.103365,0.121883,0.09627
9,1.2563,1.315035,0.495875,0.136989,0.128803,0.104242
10,1.2253,1.289248,0.499542,0.123516,0.131251,0.10565


[I 2025-03-16 09:57:50,385] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 0.00029891977384598987, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4671,1.067496,0.597617,0.20362,0.200703,0.179967
2,0.752,0.759013,0.724106,0.387891,0.360572,0.357575
3,0.453,0.659371,0.762603,0.491741,0.450758,0.457756
4,0.3017,0.621473,0.779102,0.547335,0.50817,0.516497
5,0.2146,0.595164,0.788268,0.634146,0.55373,0.576677
6,0.165,0.584968,0.80385,0.682242,0.61769,0.635516
7,0.1353,0.565177,0.809349,0.708308,0.62733,0.64914
8,0.1163,0.566603,0.802016,0.724072,0.633813,0.663085
9,0.1058,0.562452,0.809349,0.720293,0.644808,0.669254
10,0.0959,0.552287,0.813016,0.722402,0.654551,0.676006


[I 2025-03-16 10:04:03,386] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.00041087915453240814, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 10, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0709,0.787457,0.713107,0.409364,0.345093,0.350542
2,0.449,0.629333,0.783685,0.534094,0.495334,0.503514
3,0.2604,0.591683,0.797434,0.659445,0.574666,0.601931
4,0.1748,0.577151,0.796517,0.684162,0.59878,0.626064
5,0.1334,0.561875,0.815765,0.658919,0.614139,0.628816
6,0.111,0.544819,0.823098,0.791924,0.693783,0.727743
7,0.0987,0.554487,0.819432,0.79925,0.68443,0.720404
8,0.0899,0.541546,0.823098,0.772357,0.680613,0.71058
9,0.0836,0.548209,0.824015,0.758215,0.679977,0.706437
10,0.0797,0.543991,0.815765,0.765273,0.67146,0.70223


[I 2025-03-16 10:10:01,564] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.003234505082297928, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 16, 'lambda_param': 0.2, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5488,0.507974,0.823098,0.746333,0.654912,0.685842
2,0.1139,0.48056,0.843263,0.830972,0.742887,0.766218
3,0.0822,0.479825,0.837764,0.8279,0.737479,0.766734
4,0.0763,0.469996,0.850596,0.841534,0.737093,0.765455
5,0.0721,0.475711,0.84143,0.860014,0.734079,0.777686
6,0.0718,0.468485,0.852429,0.849682,0.735716,0.773096
7,0.0713,0.491185,0.840513,0.847115,0.736695,0.770284
8,0.0687,0.469588,0.84418,0.846888,0.730076,0.766938
9,0.0674,0.476233,0.84143,0.83061,0.744739,0.771176
10,0.069,0.470648,0.846013,0.835719,0.741294,0.768098


[I 2025-03-16 10:13:13,635] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.0015915550792002763, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 6, 'lambda_param': 0.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6665,0.56867,0.807516,0.621083,0.559166,0.570902
2,0.1541,0.518841,0.813016,0.775056,0.685091,0.714207
3,0.0952,0.515338,0.812099,0.784484,0.681422,0.719139
4,0.0809,0.507157,0.827681,0.810429,0.708922,0.741461
5,0.0739,0.509386,0.826764,0.825916,0.712729,0.749335
6,0.0704,0.485224,0.835014,0.853076,0.726773,0.770063
7,0.0697,0.509531,0.825848,0.829829,0.723151,0.757395
8,0.0688,0.5157,0.827681,0.818779,0.70222,0.740268
9,0.0668,0.485166,0.839597,0.834446,0.722646,0.758834
10,0.0649,0.508129,0.825848,0.8263,0.706442,0.745509


[I 2025-03-16 10:22:02,259] Trial 11 finished with value: 0.7839749315494502 and parameters: {'learning_rate': 0.0015915550792002763, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 6, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 11 with value: 0.7839749315494502.


Trial 12 with params: {'learning_rate': 0.0005669752171100055, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 8, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.953,0.712801,0.739688,0.456623,0.408499,0.414048
2,0.3356,0.592191,0.799267,0.610344,0.556957,0.570447
3,0.1831,0.561069,0.807516,0.660137,0.600416,0.616559
4,0.1283,0.548613,0.814849,0.7607,0.667379,0.698596
5,0.1041,0.544005,0.817599,0.721821,0.649363,0.673996
6,0.091,0.530679,0.819432,0.774868,0.681129,0.712367
7,0.0844,0.534791,0.827681,0.777877,0.693646,0.72415
8,0.0801,0.520186,0.824015,0.792745,0.694324,0.72838
9,0.0742,0.532308,0.826764,0.777,0.691825,0.722429
10,0.0713,0.528676,0.822181,0.770568,0.693375,0.720399


[I 2025-03-16 10:25:10,274] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.003876462851269062, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 10, 'lambda_param': 0.4, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5547,0.51568,0.825848,0.752897,0.683744,0.703038
2,0.1172,0.495359,0.829514,0.805233,0.729866,0.750644
3,0.0868,0.482645,0.83593,0.781229,0.724303,0.738632
4,0.0779,0.469769,0.843263,0.840557,0.743002,0.773452
5,0.074,0.483683,0.83593,0.844514,0.736667,0.768904
6,0.0756,0.492723,0.834097,0.818166,0.740697,0.764955
7,0.0736,0.483213,0.846013,0.845516,0.750556,0.780285
8,0.0708,0.464519,0.850596,0.834653,0.762441,0.785278
9,0.0683,0.480824,0.84418,0.824413,0.74258,0.764478
10,0.0733,0.494884,0.834097,0.807837,0.761944,0.772302


[I 2025-03-16 10:31:43,975] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.00380815486697971, 'weight_decay': 0.004, 'adam_beta1': 0.96, 'warmup_steps': 14, 'lambda_param': 0.4, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5875,0.517202,0.830431,0.767193,0.668833,0.699745
2,0.1196,0.462891,0.84143,0.853544,0.757511,0.787441
3,0.0851,0.464663,0.843263,0.848474,0.740837,0.777873
4,0.0759,0.459024,0.848763,0.859634,0.757505,0.790303
5,0.0744,0.49343,0.830431,0.830639,0.731504,0.760807
6,0.0754,0.468437,0.846013,0.854584,0.742225,0.780952
7,0.0734,0.481365,0.836847,0.824769,0.754369,0.773998
8,0.0717,0.483579,0.839597,0.8344,0.747913,0.774226
9,0.0703,0.472815,0.846013,0.837435,0.734611,0.765813
10,0.0684,0.478898,0.836847,0.830811,0.736433,0.763281


[I 2025-03-16 10:37:41,359] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.001694010332382728, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 22, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6895,0.582428,0.802933,0.633118,0.542551,0.563881
2,0.1501,0.514668,0.820348,0.74517,0.673471,0.687174
3,0.0936,0.489854,0.831347,0.79193,0.716368,0.738132
4,0.0787,0.497613,0.834097,0.841108,0.722602,0.758111
5,0.0738,0.500796,0.829514,0.819704,0.72299,0.749941
6,0.0722,0.517887,0.824931,0.791843,0.713718,0.734985
7,0.0722,0.500821,0.825848,0.821549,0.717307,0.748442
8,0.0666,0.512972,0.824931,0.805629,0.695647,0.726892
9,0.0655,0.494327,0.824931,0.816299,0.713251,0.743273
10,0.0643,0.492543,0.831347,0.822535,0.731027,0.758624


[I 2025-03-16 10:43:40,779] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.004830881932580404, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 23, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5094,0.50063,0.828598,0.753951,0.672941,0.698014
2,0.1105,0.47353,0.83868,0.837254,0.736316,0.766034
3,0.0853,0.477531,0.84418,0.84016,0.765588,0.787145
4,0.0774,0.468224,0.848763,0.830644,0.755855,0.778172
5,0.0771,0.474249,0.84143,0.840641,0.762012,0.785084
6,0.0771,0.466759,0.848763,0.845961,0.75343,0.782376
7,0.0738,0.48923,0.83593,0.841954,0.743917,0.776387
8,0.0696,0.490775,0.835014,0.846511,0.751101,0.779203
9,0.0697,0.47468,0.837764,0.842966,0.744635,0.776303
10,0.0712,0.519214,0.830431,0.830322,0.738747,0.767182


[I 2025-03-16 10:52:46,576] Trial 16 finished with value: 0.7825578448435283 and parameters: {'learning_rate': 0.004830881932580404, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 23, 'lambda_param': 0.1, 'temperature': 4.5}. Best is trial 11 with value: 0.7839749315494502.


Trial 17 with params: {'learning_rate': 0.0015458091554145527, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 39, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7101,0.567059,0.7956,0.611644,0.527352,0.550184
2,0.1553,0.524465,0.813932,0.741837,0.657628,0.685272
3,0.0961,0.502344,0.827681,0.76231,0.675815,0.706286
4,0.082,0.508496,0.824931,0.807008,0.713325,0.742613
5,0.0743,0.499536,0.829514,0.800752,0.704854,0.738358
6,0.0698,0.495347,0.834097,0.833479,0.724144,0.758496
7,0.0674,0.504956,0.823098,0.804903,0.700302,0.735881
8,0.0656,0.50247,0.825848,0.798133,0.709531,0.740359
9,0.0723,0.511782,0.828598,0.771315,0.710649,0.730301
10,0.0688,0.494439,0.83868,0.796059,0.721051,0.744292


[I 2025-03-16 10:58:54,696] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.0043856905635156025, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 20, 'lambda_param': 0.4, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5142,0.481571,0.840513,0.78501,0.695395,0.725943
2,0.1115,0.474428,0.833181,0.829212,0.726447,0.76213
3,0.0835,0.470439,0.84418,0.846145,0.753651,0.783795
4,0.0789,0.491927,0.832264,0.824289,0.736628,0.764351
5,0.0797,0.460609,0.847846,0.838157,0.755028,0.77967
6,0.0725,0.459147,0.845096,0.842394,0.761954,0.783709
7,0.0706,0.469342,0.846929,0.838133,0.758457,0.784372
8,0.0715,0.482675,0.84143,0.833157,0.744738,0.773381
9,0.0754,0.481125,0.851512,0.847732,0.749939,0.779447
10,0.0694,0.457676,0.854262,0.852927,0.754885,0.786914


[I 2025-03-16 11:04:51,916] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.003409001089603001, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5755,0.524294,0.812099,0.742332,0.648208,0.681051
2,0.1176,0.509138,0.826764,0.833039,0.739624,0.771381
3,0.0833,0.485109,0.83868,0.780195,0.737619,0.750843
4,0.0761,0.490733,0.842346,0.845652,0.759533,0.789247
5,0.0733,0.487359,0.837764,0.821438,0.743695,0.768364
6,0.0767,0.51338,0.832264,0.820399,0.733646,0.761691
7,0.0757,0.520315,0.835014,0.814572,0.713366,0.747311
8,0.0702,0.505921,0.846013,0.846215,0.768804,0.792173
9,0.0692,0.491665,0.840513,0.848162,0.755197,0.785371
10,0.0666,0.490675,0.83868,0.827602,0.740663,0.766978


[I 2025-03-16 11:13:59,453] Trial 19 finished with value: 0.7944977652982129 and parameters: {'learning_rate': 0.003409001089603001, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 7.0}. Best is trial 19 with value: 0.7944977652982129.


Trial 20 with params: {'learning_rate': 0.0014540694031265975, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.98, 'warmup_steps': 32, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9082,0.629678,0.779102,0.54709,0.493731,0.503069
2,0.2116,0.521933,0.823098,0.772094,0.67439,0.703643
3,0.1079,0.511896,0.820348,0.784579,0.676523,0.712139
4,0.0854,0.484984,0.834097,0.801782,0.70901,0.737502
5,0.0744,0.478493,0.83593,0.817383,0.726356,0.756843
6,0.0704,0.478611,0.842346,0.828908,0.726513,0.759623
7,0.0711,0.496011,0.83868,0.79702,0.705082,0.735219
8,0.0708,0.486937,0.837764,0.825438,0.737272,0.767329
9,0.0691,0.496098,0.832264,0.832568,0.709614,0.746029
10,0.0657,0.474148,0.845096,0.845013,0.740632,0.777843


[I 2025-03-16 11:23:12,474] Trial 20 finished with value: 0.7834282269650078 and parameters: {'learning_rate': 0.0014540694031265975, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.98, 'warmup_steps': 32, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 19 with value: 0.7944977652982129.


Trial 21 with params: {'learning_rate': 0.0014853357475626738, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 40, 'lambda_param': 0.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8596,0.61807,0.781852,0.54347,0.509996,0.513802
2,0.1889,0.523302,0.824015,0.768795,0.690814,0.713857
3,0.1032,0.514549,0.831347,0.789088,0.70758,0.735948
4,0.0837,0.496155,0.83868,0.817285,0.725808,0.757837
5,0.0747,0.489989,0.842346,0.831762,0.733428,0.769191
6,0.0712,0.48658,0.842346,0.845985,0.740775,0.778636
7,0.0699,0.49112,0.843263,0.831394,0.742854,0.771723
8,0.0684,0.493431,0.842346,0.833915,0.742104,0.773923
9,0.0671,0.496369,0.839597,0.822372,0.732065,0.761427
10,0.0662,0.47997,0.842346,0.828486,0.736064,0.767794


[I 2025-03-16 11:29:08,487] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.0033316248739208884, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 28, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5808,0.52568,0.830431,0.762467,0.677709,0.704939
2,0.1171,0.485085,0.83593,0.799127,0.705498,0.737247
3,0.0834,0.488495,0.837764,0.776703,0.707303,0.726994
4,0.0781,0.47737,0.840513,0.837431,0.7166,0.755192
5,0.0726,0.493806,0.845096,0.844513,0.726499,0.762166
6,0.0715,0.519708,0.825848,0.825619,0.721861,0.753233
7,0.0736,0.500987,0.835014,0.813661,0.709744,0.738954
8,0.0698,0.481939,0.84418,0.81831,0.734293,0.759721
9,0.0677,0.475211,0.843263,0.82966,0.735653,0.763933
10,0.0656,0.486391,0.839597,0.820458,0.725084,0.754168


[I 2025-03-16 11:32:13,756] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.0014994027584695777, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 32, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9893,0.679132,0.756187,0.496338,0.447568,0.45341
2,0.2706,0.564151,0.813016,0.693276,0.62395,0.644677
3,0.1237,0.498607,0.825848,0.786061,0.708304,0.732806
4,0.0895,0.4954,0.834097,0.803202,0.71642,0.739435
5,0.0783,0.489527,0.831347,0.796738,0.72426,0.744743
6,0.0717,0.474005,0.836847,0.805787,0.714948,0.743001
7,0.0688,0.480083,0.831347,0.817322,0.725188,0.752321
8,0.068,0.478147,0.83593,0.798136,0.720796,0.744903
9,0.0687,0.486567,0.83868,0.791849,0.716753,0.741094
10,0.0706,0.50079,0.831347,0.810573,0.715073,0.746169


[I 2025-03-16 11:35:24,294] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.0014276478158167997, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 11, 'lambda_param': 0.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7307,0.587868,0.794684,0.574848,0.534878,0.538785
2,0.1718,0.524028,0.815765,0.754677,0.665767,0.693342
3,0.1015,0.509172,0.822181,0.788722,0.688657,0.718335
4,0.0839,0.512539,0.825848,0.800772,0.715323,0.742203
5,0.0761,0.515583,0.818515,0.813313,0.69587,0.731362
6,0.0728,0.499947,0.826764,0.803322,0.702942,0.731622
7,0.0686,0.491259,0.839597,0.809875,0.729124,0.752526
8,0.0697,0.52594,0.817599,0.808564,0.69614,0.731471
9,0.0697,0.507046,0.826764,0.83461,0.716463,0.752256
10,0.0662,0.507607,0.824931,0.814861,0.724424,0.752008


[I 2025-03-16 11:41:20,695] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.001267436688682732, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7262,0.592529,0.791017,0.58059,0.526643,0.535244
2,0.1811,0.529684,0.816682,0.777831,0.669875,0.700235
3,0.106,0.5172,0.823098,0.79368,0.69324,0.727723
4,0.0862,0.515282,0.823098,0.780977,0.69838,0.724491
5,0.077,0.51109,0.827681,0.788823,0.705566,0.732548
6,0.0724,0.514816,0.822181,0.78834,0.69654,0.724973
7,0.0699,0.500191,0.833181,0.82068,0.714608,0.747831
8,0.0699,0.517973,0.831347,0.838367,0.718428,0.753127
9,0.0675,0.498407,0.836847,0.842538,0.7225,0.759039
10,0.0653,0.493193,0.837764,0.838933,0.734362,0.76568


[I 2025-03-16 11:47:25,859] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.0036729601856127186, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5858,0.540459,0.820348,0.713745,0.642256,0.663305
2,0.1225,0.483244,0.836847,0.800386,0.727804,0.750649
3,0.0865,0.495643,0.83593,0.802733,0.718719,0.742372
4,0.077,0.509933,0.833181,0.825763,0.743867,0.769229
5,0.0751,0.47146,0.837764,0.81286,0.728262,0.7559
6,0.0718,0.480034,0.839597,0.809058,0.74005,0.761077
7,0.0707,0.485568,0.846929,0.815,0.726187,0.752488
8,0.0707,0.479342,0.839597,0.828269,0.743973,0.767987
9,0.0723,0.486381,0.839597,0.809786,0.731648,0.7575
10,0.0702,0.483033,0.839597,0.835085,0.73271,0.762606


[I 2025-03-16 11:56:49,646] Trial 26 finished with value: 0.78499016164738 and parameters: {'learning_rate': 0.0036729601856127186, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}. Best is trial 19 with value: 0.7944977652982129.


Trial 27 with params: {'learning_rate': 0.0045648712915771685, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.539,0.50414,0.834097,0.794201,0.680875,0.719828
2,0.1205,0.488283,0.827681,0.797467,0.713971,0.740914
3,0.089,0.485325,0.84418,0.827767,0.746537,0.769558
4,0.0801,0.465148,0.846929,0.8641,0.758105,0.792164
5,0.0806,0.484236,0.840513,0.862737,0.759931,0.796998
6,0.0783,0.482632,0.851512,0.841631,0.752323,0.77913
7,0.0778,0.491878,0.846013,0.864249,0.773524,0.803009
8,0.0762,0.479725,0.847846,0.835069,0.768962,0.790576
9,0.0732,0.483517,0.84418,0.837996,0.743799,0.773159
10,0.0708,0.482131,0.847846,0.843346,0.745695,0.779634


[I 2025-03-16 12:05:57,981] Trial 27 finished with value: 0.794223610735292 and parameters: {'learning_rate': 0.0045648712915771685, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 6.0}. Best is trial 19 with value: 0.7944977652982129.


Trial 28 with params: {'learning_rate': 0.0008962293991234616, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8857,0.659102,0.767186,0.453104,0.445341,0.443624
2,0.2619,0.571481,0.802933,0.646306,0.585791,0.604551
3,0.1359,0.528551,0.822181,0.743902,0.684914,0.703369
4,0.0988,0.523396,0.812099,0.80635,0.67838,0.719891
5,0.0854,0.515124,0.825848,0.837763,0.70284,0.746769
6,0.0783,0.510843,0.826764,0.819918,0.718124,0.748746
7,0.0755,0.527624,0.816682,0.81172,0.699496,0.733127
8,0.0712,0.511817,0.819432,0.797621,0.690501,0.728689
9,0.069,0.508132,0.823098,0.821955,0.702879,0.743001
10,0.0672,0.509873,0.825848,0.820438,0.717943,0.752116


[I 2025-03-16 12:09:06,829] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 0.0035392841103002374, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 7, 'lambda_param': 0.1, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6109,0.529423,0.819432,0.704164,0.633613,0.654724
2,0.124,0.494671,0.831347,0.795906,0.734021,0.752029
3,0.0868,0.482256,0.835014,0.797313,0.733282,0.753206
4,0.0764,0.493239,0.834097,0.801576,0.739936,0.758062
5,0.0736,0.4946,0.830431,0.778193,0.740423,0.749347
6,0.076,0.503679,0.826764,0.795843,0.720656,0.744977
7,0.0763,0.479072,0.831347,0.804592,0.729121,0.752256
8,0.0705,0.49144,0.829514,0.825348,0.73647,0.764002
9,0.0675,0.483723,0.834097,0.818122,0.752496,0.771281
10,0.0658,0.482069,0.836847,0.832229,0.74707,0.772924


[I 2025-03-16 12:14:59,368] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.004370372737209459, 'weight_decay': 0.006, 'adam_beta1': 0.98, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6217,0.53624,0.820348,0.711769,0.619273,0.646573
2,0.1356,0.47612,0.840513,0.807411,0.729892,0.752431
3,0.0925,0.459327,0.845096,0.786748,0.705528,0.733834
4,0.0818,0.472457,0.842346,0.849073,0.749824,0.780242
5,0.0788,0.479913,0.83593,0.813108,0.731342,0.757061
6,0.0775,0.49214,0.836847,0.809982,0.69244,0.730582
7,0.0755,0.477156,0.846929,0.829425,0.748581,0.77169
8,0.0775,0.503497,0.834097,0.809615,0.7186,0.74629
9,0.0766,0.486496,0.83868,0.795601,0.719471,0.742216
10,0.0722,0.503579,0.829514,0.822783,0.719141,0.752592


[I 2025-03-16 12:18:02,206] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.00438266440200875, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 18, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5158,0.507229,0.83868,0.809167,0.712521,0.745761
2,0.1117,0.485411,0.83868,0.821543,0.714036,0.749451
3,0.0848,0.492057,0.836847,0.80068,0.726739,0.747116
4,0.0777,0.466911,0.853346,0.840332,0.749863,0.778011
5,0.0739,0.466118,0.854262,0.828966,0.769074,0.786139
6,0.0741,0.479865,0.842346,0.833275,0.750473,0.774584
7,0.0765,0.472438,0.84418,0.816496,0.728397,0.757128
8,0.0725,0.462485,0.847846,0.797578,0.750338,0.761838
9,0.0705,0.479539,0.839597,0.811914,0.735708,0.76062
10,0.0667,0.469244,0.843263,0.844083,0.739055,0.769935


[I 2025-03-16 12:27:09,779] Trial 31 finished with value: 0.7822032975925864 and parameters: {'learning_rate': 0.00438266440200875, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 18, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 19 with value: 0.7944977652982129.


Trial 32 with params: {'learning_rate': 0.004633969888911568, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4894,0.506833,0.833181,0.796124,0.728367,0.74705
2,0.1104,0.477482,0.843263,0.8285,0.749373,0.771127
3,0.0846,0.468342,0.845096,0.828811,0.752886,0.777978
4,0.0775,0.484358,0.840513,0.824098,0.758101,0.778076
5,0.0803,0.490593,0.845096,0.826529,0.751091,0.773269
6,0.0746,0.474174,0.845096,0.841559,0.74875,0.778721
7,0.0724,0.477786,0.83868,0.832813,0.734919,0.76499
8,0.0744,0.480882,0.846013,0.821579,0.752069,0.774416
9,0.0735,0.493357,0.84143,0.834653,0.75471,0.781219
10,0.071,0.49339,0.839597,0.826809,0.74624,0.772453


[I 2025-03-16 12:36:37,984] Trial 32 finished with value: 0.7858291589133458 and parameters: {'learning_rate': 0.004633969888911568, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 19 with value: 0.7944977652982129.


Trial 33 with params: {'learning_rate': 0.0037857960331369204, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5685,0.533048,0.814849,0.728074,0.661581,0.684504
2,0.1208,0.516987,0.824931,0.808395,0.711418,0.738464
3,0.0865,0.482008,0.846013,0.852008,0.750784,0.7865
4,0.078,0.511176,0.83868,0.847843,0.752543,0.781686
5,0.0755,0.478,0.843263,0.857296,0.753585,0.789632
6,0.0746,0.46954,0.848763,0.843724,0.75907,0.789334
7,0.0729,0.480964,0.845096,0.849886,0.76271,0.791915
8,0.0758,0.499886,0.846929,0.861066,0.758622,0.793528
9,0.0705,0.496289,0.84143,0.859505,0.7431,0.78455
10,0.0681,0.47855,0.846929,0.84412,0.763697,0.789936


[I 2025-03-16 12:45:58,729] Trial 33 finished with value: 0.8044066847863501 and parameters: {'learning_rate': 0.0037857960331369204, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 7.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 34 with params: {'learning_rate': 0.003304920509483724, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.548,0.518433,0.826764,0.779375,0.648195,0.688433
2,0.1171,0.481714,0.83868,0.826442,0.760875,0.780556
3,0.0842,0.470152,0.843263,0.832901,0.743964,0.773404
4,0.0773,0.485829,0.837764,0.827576,0.738634,0.766654
5,0.0761,0.485708,0.84143,0.834212,0.759901,0.782577
6,0.0735,0.488586,0.843263,0.827031,0.738933,0.763392
7,0.0738,0.486624,0.84418,0.834786,0.741989,0.773438
8,0.0698,0.479757,0.84418,0.828791,0.762493,0.78276
9,0.069,0.467269,0.847846,0.835326,0.767707,0.790015
10,0.0665,0.480578,0.84418,0.850117,0.745344,0.775783


[I 2025-03-16 12:55:08,358] Trial 34 finished with value: 0.7722737248579123 and parameters: {'learning_rate': 0.003304920509483724, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 35 with params: {'learning_rate': 0.004291504500308519, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.626,0.536833,0.818515,0.698215,0.633111,0.652826
2,0.137,0.477971,0.83868,0.81218,0.728106,0.75593
3,0.0939,0.466796,0.846013,0.851334,0.750262,0.781853
4,0.082,0.456905,0.847846,0.855151,0.750715,0.783806
5,0.0776,0.476277,0.84418,0.847464,0.755073,0.783928
6,0.0776,0.488201,0.84143,0.851079,0.753991,0.78694
7,0.0764,0.477716,0.846929,0.853138,0.752335,0.785254
8,0.0754,0.456763,0.850596,0.84095,0.758582,0.782423
9,0.072,0.474347,0.845096,0.84249,0.768725,0.791778
10,0.0732,0.470673,0.847846,0.848611,0.748008,0.781551


[I 2025-03-16 13:04:20,735] Trial 35 finished with value: 0.7973777866138214 and parameters: {'learning_rate': 0.004291504500308519, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 5.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 36 with params: {'learning_rate': 0.00359216902503096, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 9, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5542,0.524555,0.812099,0.721707,0.623996,0.659834
2,0.1167,0.521854,0.820348,0.808285,0.699787,0.732431
3,0.085,0.470863,0.840513,0.810102,0.741315,0.761618
4,0.0751,0.469765,0.847846,0.855485,0.752254,0.783644
5,0.0735,0.487454,0.84418,0.833334,0.746682,0.77023
6,0.0766,0.495291,0.839597,0.816736,0.715929,0.747419
7,0.0742,0.488798,0.839597,0.843354,0.729799,0.76497
8,0.0703,0.503334,0.828598,0.842664,0.715776,0.752558
9,0.0687,0.477724,0.840513,0.831128,0.732483,0.764884
10,0.0698,0.485713,0.840513,0.799895,0.73757,0.753506


[I 2025-03-16 13:07:34,624] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.0040380338245852, 'weight_decay': 0.008, 'adam_beta1': 0.98, 'warmup_steps': 13, 'lambda_param': 0.8, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6518,0.536062,0.818515,0.727686,0.626846,0.653335
2,0.1352,0.50188,0.829514,0.784234,0.712975,0.734453
3,0.0896,0.46765,0.84418,0.831922,0.739517,0.770747
4,0.0793,0.492384,0.837764,0.814761,0.732058,0.759631
5,0.0749,0.478835,0.842346,0.815142,0.733522,0.759867
6,0.0735,0.475038,0.840513,0.821776,0.735276,0.762385
7,0.0734,0.486922,0.835014,0.805042,0.723492,0.74691
8,0.0745,0.532943,0.822181,0.809335,0.709397,0.737433
9,0.0731,0.522777,0.829514,0.839198,0.732529,0.761535
10,0.0703,0.485608,0.840513,0.844942,0.745317,0.775662


[I 2025-03-16 13:16:36,032] Trial 37 finished with value: 0.7780341030349011 and parameters: {'learning_rate': 0.0040380338245852, 'weight_decay': 0.008, 'adam_beta1': 0.98, 'warmup_steps': 13, 'lambda_param': 0.8, 'temperature': 4.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 38 with params: {'learning_rate': 0.0017905922066498681, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7503,0.594118,0.793767,0.560458,0.555133,0.551806
2,0.1683,0.527257,0.819432,0.756111,0.671791,0.696067
3,0.0981,0.521522,0.822181,0.819572,0.706438,0.739964
4,0.081,0.507684,0.819432,0.789766,0.701066,0.72798
5,0.0768,0.499197,0.830431,0.79967,0.718629,0.743343
6,0.0715,0.5103,0.826764,0.831958,0.712899,0.751014
7,0.0705,0.513899,0.827681,0.790156,0.725178,0.744581
8,0.0704,0.517162,0.822181,0.784715,0.705987,0.730059
9,0.0689,0.502997,0.826764,0.777415,0.712246,0.730641
10,0.0674,0.526432,0.829514,0.833397,0.727203,0.762005


[I 2025-03-16 13:25:19,727] Trial 38 finished with value: 0.7846434340379821 and parameters: {'learning_rate': 0.0017905922066498681, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 39 with params: {'learning_rate': 0.003955513044831546, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.98, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6222,0.555988,0.813932,0.691893,0.616849,0.639485
2,0.1326,0.509327,0.83593,0.808041,0.718242,0.746362
3,0.0886,0.498167,0.835014,0.81917,0.730977,0.757671
4,0.0788,0.50417,0.83868,0.829958,0.734629,0.764587
5,0.0759,0.502458,0.83868,0.799098,0.735878,0.755561
6,0.0753,0.497242,0.835014,0.838232,0.731785,0.763069
7,0.0767,0.499957,0.839597,0.838167,0.732429,0.766444
8,0.0729,0.493784,0.850596,0.813744,0.750015,0.768963
9,0.0695,0.494552,0.837764,0.816341,0.734317,0.757695
10,0.0673,0.504025,0.843263,0.841343,0.745681,0.776522


[I 2025-03-16 13:34:23,808] Trial 39 finished with value: 0.7849638814471962 and parameters: {'learning_rate': 0.003955513044831546, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.98, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 40 with params: {'learning_rate': 1.1139092500128487e-05, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 44, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2007,1.999238,0.27406,0.021848,0.046834,0.026963
2,1.8273,1.759578,0.324473,0.035355,0.063279,0.038604
3,1.6597,1.644393,0.373969,0.036713,0.078062,0.049355
4,1.5583,1.565212,0.3978,0.044097,0.085517,0.054981
5,1.4802,1.505844,0.421632,0.067432,0.093795,0.065699
6,1.4197,1.452938,0.439963,0.067956,0.100663,0.072801
7,1.3703,1.410809,0.449129,0.064961,0.104457,0.076615
8,1.3281,1.378975,0.461045,0.093101,0.11005,0.084004
9,1.2944,1.347696,0.482126,0.125349,0.121225,0.096272
10,1.2638,1.324279,0.490376,0.112882,0.125092,0.099693


[I 2025-03-16 13:37:14,470] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 1.1995080817095684e-05, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 14, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2309,2.061919,0.176902,0.003538,0.02,0.006012
2,1.9092,1.800239,0.310724,0.039365,0.058421,0.033731
3,1.6744,1.64836,0.368469,0.036024,0.076823,0.048473
4,1.5504,1.555817,0.399633,0.060463,0.086061,0.055539
5,1.4662,1.490502,0.422548,0.067489,0.095162,0.068474
6,1.4031,1.436427,0.439047,0.06746,0.100676,0.073802
7,1.3495,1.38998,0.457379,0.092462,0.108509,0.082166
8,1.3043,1.355162,0.472044,0.115988,0.115861,0.089951
9,1.2681,1.322755,0.490376,0.131767,0.126009,0.101183
10,1.2371,1.296134,0.496792,0.126255,0.129564,0.105145


[I 2025-03-16 13:43:24,798] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.0035947083151584374, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.98, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6416,0.543048,0.815765,0.700121,0.631591,0.652072
2,0.134,0.495822,0.828598,0.771884,0.714487,0.729443
3,0.0888,0.48003,0.837764,0.805514,0.731751,0.752346
4,0.08,0.481473,0.83593,0.818459,0.725212,0.75453
5,0.0754,0.466668,0.845096,0.835573,0.745773,0.773976
6,0.0748,0.504904,0.834097,0.810047,0.698757,0.738482
7,0.0735,0.482389,0.834097,0.819657,0.728044,0.758154
8,0.0717,0.507115,0.83593,0.828085,0.721424,0.757321
9,0.0723,0.498255,0.829514,0.810457,0.725849,0.753859
10,0.0741,0.485163,0.832264,0.820544,0.722024,0.752987


[I 2025-03-16 13:46:38,635] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 3.383524580670182e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 26, 'lambda_param': 0.5, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8963,1.613514,0.389551,0.038918,0.083679,0.052832
2,1.4449,1.406631,0.452796,0.070898,0.106746,0.079676
3,1.2783,1.293831,0.494042,0.120211,0.129422,0.105217
4,1.1676,1.207518,0.552704,0.153054,0.170945,0.150622
5,1.0775,1.139838,0.589368,0.188371,0.19469,0.175274
6,1.0005,1.081673,0.611366,0.216427,0.212083,0.191888
7,0.9333,1.034491,0.626948,0.225637,0.225755,0.205988
8,0.8743,0.994093,0.63703,0.272239,0.238845,0.22523
9,0.8237,0.959249,0.658112,0.285609,0.258139,0.247395
10,0.7783,0.936189,0.668194,0.29895,0.271541,0.264612


[I 2025-03-16 13:49:43,863] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.004442958236800047, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5004,0.498865,0.834097,0.777324,0.68097,0.714222
2,0.1113,0.472534,0.833181,0.80717,0.717113,0.745708
3,0.0833,0.469002,0.848763,0.810976,0.741327,0.759629
4,0.0766,0.475288,0.846013,0.82355,0.756101,0.776683
5,0.0776,0.505046,0.830431,0.827838,0.740011,0.767165
6,0.0751,0.486618,0.84143,0.811632,0.755686,0.77024
7,0.0751,0.505124,0.835014,0.835174,0.749073,0.776807
8,0.0715,0.48439,0.845096,0.826914,0.746542,0.773079
9,0.0701,0.503477,0.83868,0.817869,0.748258,0.769484
10,0.0697,0.508249,0.832264,0.828112,0.739193,0.768522


[I 2025-03-16 13:58:57,883] Trial 44 finished with value: 0.7915023811962298 and parameters: {'learning_rate': 0.004442958236800047, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 45 with params: {'learning_rate': 0.0019482900703823118, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 9, 'lambda_param': 0.4, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.675,0.57819,0.8011,0.588265,0.551461,0.551706
2,0.1442,0.526096,0.817599,0.822936,0.71698,0.749272
3,0.0899,0.504493,0.828598,0.797357,0.717324,0.741729
4,0.0783,0.500948,0.827681,0.824223,0.715224,0.748497
5,0.0742,0.47311,0.842346,0.81707,0.70806,0.742109


[I 2025-03-16 14:00:28,577] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 0.00354211111280027, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 14, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5821,0.516617,0.826764,0.742767,0.658175,0.679821
2,0.119,0.490895,0.833181,0.822025,0.730467,0.760569
3,0.0849,0.483552,0.84418,0.844765,0.743422,0.77531
4,0.0774,0.47725,0.843263,0.852451,0.729698,0.766473
5,0.0755,0.467298,0.848763,0.842722,0.750895,0.78063
6,0.0728,0.471545,0.850596,0.850789,0.750929,0.780995
7,0.0732,0.480145,0.845096,0.850264,0.739523,0.773008
8,0.0705,0.4695,0.853346,0.84722,0.757521,0.785552
9,0.0712,0.485257,0.839597,0.847611,0.720461,0.760085
10,0.0694,0.486406,0.836847,0.827741,0.721996,0.753764


[I 2025-03-16 14:03:29,005] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.0019984412666101963, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6379,0.5701,0.800183,0.675643,0.579107,0.604863
2,0.1405,0.511048,0.830431,0.80392,0.702105,0.731495
3,0.0898,0.507902,0.826764,0.807352,0.700654,0.731912
4,0.0788,0.508939,0.828598,0.801544,0.688982,0.72353
5,0.0723,0.50604,0.829514,0.810192,0.72323,0.748432
6,0.0697,0.491573,0.839597,0.798242,0.721185,0.743382
7,0.0736,0.505075,0.839597,0.824894,0.700154,0.737535
8,0.0691,0.498761,0.83868,0.829187,0.725884,0.75934
9,0.0663,0.495861,0.83593,0.819987,0.738207,0.765236
10,0.0643,0.489126,0.835014,0.817987,0.72341,0.753702


[I 2025-03-16 14:13:05,474] Trial 47 finished with value: 0.7656655629328185 and parameters: {'learning_rate': 0.0019984412666101963, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 6.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 48 with params: {'learning_rate': 2.157692365351646e-05, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 53, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0589,1.776078,0.321723,0.036422,0.06281,0.03767
2,1.6006,1.548266,0.401467,0.067411,0.086856,0.057178
3,1.433,1.433384,0.442713,0.066918,0.102324,0.074724
4,1.3234,1.349075,0.480293,0.133113,0.120583,0.095584
5,1.2437,1.288688,0.496792,0.126642,0.129559,0.104697
6,1.1818,1.239656,0.537122,0.153677,0.156436,0.135757
7,1.129,1.197419,0.551787,0.153623,0.170595,0.150857
8,1.081,1.160032,0.571036,0.163718,0.181822,0.160532
9,1.0377,1.126201,0.5967,0.190032,0.199674,0.178802
10,0.9952,1.096747,0.604033,0.211562,0.208028,0.188144


[I 2025-03-16 14:16:16,418] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.0016844707344939778, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7048,0.586553,0.789184,0.553732,0.500308,0.512062
2,0.1592,0.517351,0.825848,0.772748,0.695455,0.719383
3,0.0977,0.510903,0.826764,0.820933,0.708504,0.744369
4,0.0814,0.510601,0.824931,0.820765,0.701373,0.740745
5,0.0742,0.491593,0.833181,0.783847,0.717608,0.737454


[I 2025-03-16 14:17:51,840] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 0.00015012253574062651, 'weight_decay': 0.002, 'adam_beta1': 0.98, 'warmup_steps': 48, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5925,1.193623,0.542621,0.146576,0.157071,0.132245
2,0.9227,0.888592,0.672777,0.310479,0.275293,0.26434
3,0.6615,0.780943,0.712191,0.388414,0.342816,0.342907
4,0.5099,0.73318,0.72594,0.441332,0.369397,0.377946
5,0.409,0.699716,0.752521,0.50679,0.459788,0.468906


[I 2025-03-16 14:19:23,212] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 6.656664117824731e-05, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 15, 'lambda_param': 0.8, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7029,1.385528,0.462878,0.095552,0.110719,0.084717
2,1.2043,1.178575,0.560037,0.161998,0.174206,0.154187
3,1.0141,1.051789,0.613199,0.215523,0.215278,0.195396
4,0.8716,0.952251,0.650779,0.278473,0.248048,0.236368
5,0.76,0.887702,0.676444,0.313838,0.283279,0.280982
6,0.6748,0.842546,0.696609,0.343201,0.311476,0.30862
7,0.6076,0.817108,0.714024,0.373373,0.338913,0.339367
8,0.5565,0.790948,0.716774,0.371091,0.341524,0.343452
9,0.5132,0.773842,0.731439,0.440464,0.368782,0.373339
10,0.4752,0.762105,0.731439,0.447877,0.37697,0.387288


[I 2025-03-16 14:25:26,333] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 0.00365756476902964, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 8, 'lambda_param': 0.2, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5437,0.521268,0.825848,0.782902,0.669398,0.709077
2,0.1148,0.485226,0.836847,0.834059,0.728303,0.764071
3,0.0845,0.476142,0.837764,0.839055,0.743255,0.773474
4,0.0758,0.481222,0.83868,0.8416,0.750062,0.778448
5,0.0712,0.465727,0.846013,0.832709,0.749254,0.775796
6,0.0693,0.482854,0.837764,0.826477,0.740939,0.769516
7,0.0814,0.523382,0.828598,0.851052,0.724025,0.763751
8,0.0727,0.517924,0.839597,0.825043,0.739018,0.762681
9,0.0691,0.496974,0.84143,0.838086,0.744688,0.772981
10,0.0669,0.503071,0.833181,0.843502,0.740135,0.771199


[I 2025-03-16 14:31:17,776] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.003969475548609047, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5194,0.50417,0.833181,0.759894,0.673889,0.70088
2,0.1141,0.471273,0.843263,0.838046,0.727911,0.76752
3,0.0832,0.459378,0.84418,0.826322,0.739058,0.766329
4,0.0766,0.480925,0.839597,0.824964,0.724488,0.758034
5,0.0755,0.483862,0.837764,0.813448,0.732742,0.761806
6,0.0761,0.495861,0.83868,0.81894,0.728531,0.754569
7,0.0766,0.493725,0.840513,0.835647,0.739387,0.768095
8,0.0706,0.494923,0.837764,0.795284,0.736335,0.753286
9,0.0686,0.481136,0.839597,0.795526,0.747752,0.759063
10,0.0686,0.479764,0.848763,0.800941,0.742454,0.757609


[I 2025-03-16 14:40:25,129] Trial 53 finished with value: 0.7890348921695874 and parameters: {'learning_rate': 0.003969475548609047, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 54 with params: {'learning_rate': 0.0005463733553772348, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0059,0.733214,0.733272,0.416383,0.376023,0.379561
2,0.3646,0.623287,0.789184,0.573402,0.54479,0.549772
3,0.1995,0.569291,0.802933,0.656996,0.605497,0.620147
4,0.1356,0.562635,0.802933,0.71543,0.635372,0.660947
5,0.1072,0.568834,0.806599,0.711934,0.64036,0.666003
6,0.0953,0.552189,0.819432,0.825648,0.695151,0.738604
7,0.0852,0.555091,0.815765,0.817098,0.6934,0.735115
8,0.0798,0.542575,0.820348,0.80484,0.676298,0.720102
9,0.0756,0.538414,0.825848,0.795105,0.694054,0.727535
10,0.0748,0.552175,0.813932,0.776596,0.6918,0.718593


[I 2025-03-16 14:43:28,143] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 0.002064198781575967, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6508,0.561925,0.805683,0.606414,0.559521,0.567707
2,0.14,0.515914,0.830431,0.763381,0.709338,0.724461
3,0.0898,0.488689,0.84143,0.799247,0.740234,0.759411
4,0.0781,0.48356,0.842346,0.802619,0.722627,0.750763
5,0.074,0.475001,0.843263,0.794104,0.742852,0.75804
6,0.0712,0.495638,0.83868,0.81824,0.727833,0.756508
7,0.0732,0.491828,0.840513,0.802206,0.736125,0.758403
8,0.068,0.478251,0.83868,0.818509,0.746108,0.766392
9,0.0691,0.466917,0.854262,0.836497,0.75952,0.783046
10,0.0659,0.470696,0.839597,0.803474,0.740315,0.761016


[I 2025-03-16 14:52:29,718] Trial 55 finished with value: 0.7875201927245282 and parameters: {'learning_rate': 0.002064198781575967, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 56 with params: {'learning_rate': 0.0007595963489699833, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 29, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8869,0.647027,0.769936,0.48933,0.452205,0.459933
2,0.2593,0.572419,0.804766,0.681896,0.622816,0.636584
3,0.1415,0.540364,0.811182,0.751692,0.644768,0.679337
4,0.1038,0.536837,0.812099,0.760391,0.65784,0.689892
5,0.0886,0.529188,0.817599,0.768995,0.688939,0.713145
6,0.0809,0.524331,0.819432,0.805071,0.68668,0.723441
7,0.0753,0.518442,0.820348,0.791648,0.685902,0.719504
8,0.0706,0.510165,0.822181,0.811945,0.699335,0.735604
9,0.0687,0.529226,0.822181,0.814991,0.702538,0.740584
10,0.068,0.524421,0.823098,0.795967,0.699853,0.728809


[I 2025-03-16 14:55:35,494] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 0.0027752672905678195, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 19, 'lambda_param': 0.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6252,0.541337,0.813016,0.741539,0.651802,0.67879
2,0.124,0.486618,0.833181,0.810004,0.72345,0.7531
3,0.0858,0.478281,0.846013,0.828482,0.731241,0.763973
4,0.0751,0.480583,0.83593,0.835055,0.737564,0.767215
5,0.0743,0.473589,0.840513,0.832255,0.738252,0.768865
6,0.0695,0.467011,0.845096,0.808787,0.73748,0.760354
7,0.0684,0.473653,0.842346,0.836943,0.735085,0.76728
8,0.0677,0.483163,0.836847,0.812666,0.734414,0.75764
9,0.0701,0.514977,0.829514,0.824418,0.728689,0.760695
10,0.0733,0.49668,0.827681,0.824814,0.718513,0.752741


[I 2025-03-16 14:58:40,953] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.0001202975463488601, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 15, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5127,1.199153,0.549038,0.159719,0.162409,0.137387
2,0.9759,0.958345,0.647113,0.29359,0.251711,0.24272
3,0.7421,0.837838,0.695692,0.325374,0.306594,0.299667
4,0.6004,0.781555,0.718607,0.394539,0.343629,0.347282
5,0.5031,0.739672,0.734189,0.443098,0.378791,0.387113


[I 2025-03-16 15:00:13,862] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.00015745385629682237, 'weight_decay': 0.007, 'adam_beta1': 0.97, 'warmup_steps': 23, 'lambda_param': 0.9, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5061,1.135493,0.588451,0.162056,0.191155,0.168421
2,0.8699,0.866329,0.678277,0.308599,0.295004,0.289174
3,0.6285,0.774889,0.707608,0.410714,0.343102,0.352783
4,0.4892,0.728871,0.737855,0.486577,0.397428,0.408915
5,0.3925,0.692281,0.76352,0.507632,0.468381,0.472493
6,0.3245,0.663646,0.764436,0.511669,0.472672,0.483142
7,0.272,0.64634,0.775435,0.536765,0.486701,0.49383
8,0.2316,0.639306,0.780018,0.566634,0.508921,0.520778
9,0.2035,0.633523,0.789184,0.639825,0.568031,0.588675
10,0.18,0.638126,0.787351,0.625667,0.577654,0.590169


[I 2025-03-16 15:06:13,372] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.003039497949685994, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5499,0.520155,0.812099,0.728314,0.657872,0.680434
2,0.119,0.474626,0.83593,0.821061,0.735862,0.759481
3,0.0863,0.464474,0.847846,0.843521,0.758407,0.784669
4,0.0775,0.461443,0.842346,0.822157,0.750998,0.773991
5,0.0743,0.465765,0.848763,0.820301,0.768693,0.78298
6,0.0718,0.472872,0.837764,0.823713,0.751884,0.773431
7,0.0695,0.471319,0.835014,0.844792,0.739495,0.772482
8,0.0686,0.485106,0.846929,0.822001,0.752363,0.774465
9,0.0766,0.501506,0.831347,0.808382,0.738469,0.759954
10,0.0693,0.480111,0.842346,0.820237,0.751122,0.772908


[I 2025-03-16 15:15:10,418] Trial 60 finished with value: 0.7926911751994209 and parameters: {'learning_rate': 0.003039497949685994, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 5.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 61 with params: {'learning_rate': 0.003987005511035391, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 6, 'lambda_param': 0.4, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5437,0.50959,0.835014,0.790464,0.700923,0.725852
2,0.1172,0.480215,0.835014,0.821826,0.723108,0.752944
3,0.0857,0.491582,0.836847,0.818125,0.734585,0.761812
4,0.0777,0.475159,0.842346,0.836765,0.759565,0.783104
5,0.0746,0.4937,0.835014,0.846157,0.76251,0.789706
6,0.0799,0.500564,0.836847,0.845634,0.755038,0.785618
7,0.0752,0.490513,0.840513,0.835888,0.744864,0.773472
8,0.074,0.499504,0.840513,0.847073,0.756345,0.787808
9,0.0685,0.485426,0.840513,0.837334,0.75806,0.783644
10,0.0667,0.485226,0.839597,0.857303,0.740813,0.779512


[I 2025-03-16 15:24:22,948] Trial 61 finished with value: 0.7908447695758244 and parameters: {'learning_rate': 0.003987005511035391, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 6, 'lambda_param': 0.4, 'temperature': 5.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 62 with params: {'learning_rate': 0.0009773318085512794, 'weight_decay': 0.004, 'adam_beta1': 0.96, 'warmup_steps': 17, 'lambda_param': 0.1, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9005,0.657535,0.769936,0.483293,0.455271,0.460195
2,0.2424,0.571226,0.799267,0.693871,0.63155,0.647709
3,0.1267,0.541118,0.815765,0.782183,0.695773,0.723588
4,0.0958,0.529659,0.818515,0.807646,0.67952,0.720128
5,0.0826,0.522584,0.820348,0.825066,0.693923,0.733721
6,0.0752,0.510851,0.823098,0.828073,0.709278,0.746675
7,0.0719,0.523292,0.819432,0.835784,0.696052,0.740177
8,0.0701,0.542901,0.823098,0.830019,0.701359,0.743604
9,0.0706,0.521672,0.819432,0.82918,0.698707,0.740497
10,0.0699,0.514854,0.822181,0.832622,0.726272,0.763649


[I 2025-03-16 15:30:14,627] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 0.0025891564798974113, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5925,0.541964,0.817599,0.740331,0.638301,0.673377
2,0.1249,0.508311,0.825848,0.790956,0.68518,0.72044
3,0.0858,0.490817,0.835014,0.799107,0.719737,0.74547
4,0.0751,0.484638,0.846929,0.835547,0.743023,0.769367
5,0.0735,0.50412,0.831347,0.79422,0.710947,0.734548
6,0.0745,0.487806,0.840513,0.822988,0.737515,0.759529
7,0.0713,0.479897,0.842346,0.845464,0.745498,0.773767
8,0.0684,0.480163,0.84143,0.856539,0.748322,0.781728
9,0.0682,0.474393,0.845096,0.823371,0.73743,0.764617
10,0.0653,0.493856,0.836847,0.812534,0.738809,0.762095


[I 2025-03-16 15:39:21,925] Trial 63 finished with value: 0.7968639613649594 and parameters: {'learning_rate': 0.0025891564798974113, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 64 with params: {'learning_rate': 0.0021832342147182294, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 9, 'lambda_param': 0.2, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.619,0.55227,0.806599,0.718736,0.624958,0.650649
2,0.1303,0.516519,0.825848,0.799112,0.710549,0.739728
3,0.0888,0.492468,0.829514,0.798377,0.716681,0.742709
4,0.0764,0.494684,0.830431,0.816025,0.72981,0.756055
5,0.0712,0.478117,0.829514,0.834129,0.754721,0.780835
6,0.0702,0.476052,0.837764,0.827928,0.741933,0.767012
7,0.069,0.497055,0.832264,0.823725,0.744491,0.766233
8,0.0683,0.496985,0.834097,0.832964,0.748348,0.772055
9,0.0674,0.505251,0.83593,0.829575,0.767524,0.786248
10,0.0689,0.493649,0.83593,0.827187,0.74564,0.77144


[I 2025-03-16 15:48:19,240] Trial 64 finished with value: 0.7849079237258973 and parameters: {'learning_rate': 0.0021832342147182294, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 9, 'lambda_param': 0.2, 'temperature': 5.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 65 with params: {'learning_rate': 0.0012132004339717262, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7626,0.595652,0.791017,0.535864,0.512159,0.519584
2,0.1945,0.526928,0.829514,0.756325,0.686972,0.707041
3,0.1093,0.50268,0.831347,0.786252,0.714148,0.738156
4,0.0878,0.507781,0.828598,0.81167,0.710242,0.744043
5,0.0807,0.493084,0.839597,0.837279,0.733894,0.767865
6,0.0725,0.500268,0.827681,0.810635,0.707681,0.739045
7,0.0703,0.494848,0.829514,0.833975,0.72687,0.763775
8,0.0706,0.49312,0.839597,0.809157,0.722045,0.751244
9,0.0682,0.491204,0.834097,0.814315,0.722602,0.753758
10,0.0703,0.484756,0.83868,0.834098,0.738611,0.769722


[I 2025-03-16 15:54:20,626] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 0.0009316576771192103, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.5, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8332,0.637017,0.764436,0.488709,0.462526,0.469063
2,0.2359,0.561163,0.804766,0.68807,0.623529,0.63763
3,0.1276,0.549916,0.802016,0.739192,0.64598,0.673334
4,0.0971,0.526462,0.817599,0.804588,0.692271,0.728261
5,0.0849,0.506193,0.824015,0.815089,0.70642,0.739575
6,0.0767,0.508101,0.833181,0.810078,0.712751,0.741935
7,0.0731,0.520722,0.817599,0.79868,0.707403,0.733588
8,0.0734,0.53012,0.813932,0.811043,0.703986,0.735539
9,0.0692,0.507404,0.830431,0.821775,0.719222,0.753064
10,0.067,0.511505,0.828598,0.82583,0.712704,0.749508


[I 2025-03-16 15:57:24,813] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.0022438797366839285, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6429,0.554542,0.809349,0.672411,0.598502,0.613506
2,0.1366,0.508286,0.821265,0.740861,0.670771,0.688506
3,0.0892,0.499477,0.829514,0.780935,0.724781,0.740521
4,0.0774,0.502029,0.831347,0.83573,0.733874,0.76722
5,0.0725,0.491772,0.835014,0.823314,0.731085,0.760919
6,0.0734,0.500147,0.845096,0.825093,0.744519,0.76783
7,0.0706,0.485098,0.839597,0.839975,0.742622,0.774331
8,0.0665,0.483117,0.836847,0.831839,0.738897,0.767682
9,0.067,0.492521,0.836847,0.822508,0.731865,0.762142
10,0.0649,0.483817,0.837764,0.84781,0.729515,0.7662


[I 2025-03-16 16:00:19,373] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.0036445336057384947, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5404,0.535008,0.818515,0.709883,0.647839,0.663756
2,0.1192,0.47451,0.843263,0.851957,0.744914,0.776285
3,0.0874,0.500142,0.837764,0.824607,0.741497,0.767983
4,0.0806,0.476316,0.846929,0.825892,0.737424,0.761564
5,0.0756,0.46507,0.846013,0.834225,0.750514,0.772883
6,0.0713,0.468328,0.846929,0.830206,0.741711,0.769728
7,0.0727,0.49826,0.846929,0.855009,0.76402,0.790503
8,0.0779,0.497206,0.839597,0.812433,0.738014,0.754095
9,0.0714,0.476337,0.850596,0.837186,0.755145,0.77952
10,0.0674,0.486404,0.846929,0.830185,0.750127,0.774322


[I 2025-03-16 16:06:07,901] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 8.217756913819257e-05, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 53, 'lambda_param': 0.9, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6599,1.325202,0.494042,0.112473,0.128383,0.102012
2,1.1251,1.095723,0.601283,0.192717,0.205045,0.181719
3,0.9113,0.960916,0.651696,0.287592,0.25252,0.241876
4,0.7599,0.873821,0.690192,0.332492,0.297321,0.298221
5,0.6523,0.825734,0.704858,0.341528,0.322285,0.319328
6,0.5773,0.784099,0.71769,0.369522,0.345049,0.343187
7,0.5141,0.764756,0.730522,0.423815,0.365886,0.373956
8,0.4648,0.737442,0.742438,0.494281,0.406949,0.422834
9,0.4243,0.724561,0.750687,0.479633,0.433097,0.442459
10,0.3882,0.7203,0.746104,0.516235,0.437799,0.456536


[I 2025-03-16 16:09:14,349] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.0010460019818400222, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8383,0.639226,0.774519,0.521535,0.478767,0.487865
2,0.2292,0.556214,0.813016,0.72941,0.647417,0.670284
3,0.1224,0.52494,0.819432,0.754421,0.680877,0.70466
4,0.0927,0.504689,0.834097,0.809556,0.714652,0.746121
5,0.0827,0.501061,0.835014,0.829015,0.722242,0.759411
6,0.0741,0.499937,0.833181,0.829654,0.72857,0.761853
7,0.0715,0.499834,0.832264,0.833914,0.721883,0.760722
8,0.0693,0.503194,0.835014,0.840862,0.725538,0.765404
9,0.0722,0.517148,0.831347,0.839721,0.729092,0.76753
10,0.0682,0.506851,0.835014,0.829747,0.720316,0.755029


[I 2025-03-16 16:18:03,479] Trial 70 finished with value: 0.7653436673929629 and parameters: {'learning_rate': 0.0010460019818400222, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 71 with params: {'learning_rate': 0.00013384909531240569, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 53, 'lambda_param': 0.1, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5164,1.155206,0.567369,0.167819,0.174502,0.1539
2,0.9141,0.907423,0.662695,0.291957,0.269813,0.256917
3,0.6823,0.805447,0.703941,0.346102,0.318284,0.314495
4,0.5478,0.752039,0.726856,0.416583,0.35912,0.367203
5,0.4523,0.717216,0.747021,0.495066,0.425057,0.438959
6,0.3824,0.687069,0.75527,0.510057,0.437411,0.456383
7,0.3276,0.673192,0.762603,0.536404,0.475029,0.490302
8,0.2867,0.655469,0.779102,0.584568,0.516139,0.532393
9,0.2515,0.646928,0.784601,0.591697,0.523034,0.53999
10,0.2237,0.641051,0.784601,0.636328,0.54211,0.57422


[I 2025-03-16 16:24:04,817] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.001437468332037258, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.72,0.600813,0.787351,0.561905,0.514158,0.526606
2,0.1729,0.536416,0.818515,0.77237,0.676156,0.705795
3,0.1034,0.517387,0.828598,0.774136,0.709401,0.72726
4,0.0842,0.510931,0.827681,0.818186,0.704296,0.739447
5,0.0757,0.498836,0.832264,0.823566,0.707037,0.741738
6,0.0695,0.500769,0.829514,0.822626,0.709315,0.745315
7,0.0709,0.521205,0.824931,0.824095,0.706635,0.746756
8,0.0716,0.501155,0.827681,0.808866,0.700287,0.73079
9,0.068,0.503481,0.833181,0.819643,0.707777,0.739844
10,0.0664,0.490635,0.83868,0.839843,0.719164,0.757011


[I 2025-03-16 16:29:56,752] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 0.004186424002957262, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 10, 'lambda_param': 0.5, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5572,0.521002,0.825848,0.761492,0.683494,0.704804
2,0.1166,0.497288,0.833181,0.818255,0.733404,0.760703
3,0.0858,0.498462,0.827681,0.804218,0.734392,0.756993
4,0.0778,0.509477,0.840513,0.84091,0.744952,0.777543
5,0.0771,0.485986,0.840513,0.836837,0.736921,0.768532
6,0.0765,0.506016,0.832264,0.842071,0.728862,0.763391
7,0.077,0.509379,0.836847,0.844564,0.735483,0.76863
8,0.0711,0.50568,0.833181,0.821435,0.73988,0.764822
9,0.069,0.491822,0.835014,0.8338,0.736138,0.761698
10,0.0691,0.498898,0.839597,0.824211,0.753995,0.776142


[I 2025-03-16 16:36:00,307] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.004331494090250828, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5186,0.487793,0.829514,0.734429,0.675341,0.693085
2,0.1148,0.453954,0.84143,0.806829,0.709646,0.739784
3,0.0859,0.463464,0.839597,0.836714,0.730378,0.76357
4,0.0781,0.463247,0.854262,0.840335,0.755404,0.785631
5,0.075,0.458176,0.850596,0.82916,0.750603,0.775636
6,0.0791,0.491446,0.833181,0.823135,0.735771,0.762991
7,0.0777,0.486042,0.843263,0.809723,0.74602,0.764042
8,0.0724,0.475022,0.848763,0.84309,0.745437,0.778067
9,0.0686,0.476823,0.845096,0.809227,0.715095,0.745266
10,0.0672,0.471325,0.852429,0.832451,0.760075,0.785615


[I 2025-03-16 16:45:28,247] Trial 74 finished with value: 0.7865752218106604 and parameters: {'learning_rate': 0.004331494090250828, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 5.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 75 with params: {'learning_rate': 0.002646043024714258, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7736,0.602259,0.787351,0.496709,0.477565,0.480822
2,0.1838,0.526098,0.813932,0.714759,0.650171,0.666444
3,0.0987,0.486949,0.830431,0.789126,0.726849,0.744801
4,0.0807,0.48576,0.830431,0.791838,0.721385,0.741432
5,0.0741,0.481087,0.831347,0.807856,0.729663,0.753015
6,0.0707,0.491441,0.830431,0.811303,0.726211,0.750313
7,0.0701,0.492635,0.830431,0.832061,0.730597,0.762857
8,0.0703,0.508073,0.823098,0.795704,0.722297,0.744564
9,0.071,0.502014,0.827681,0.810522,0.719077,0.749792
10,0.0693,0.499899,0.832264,0.812258,0.720023,0.750155


[I 2025-03-16 16:48:41,374] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 0.002394046892310894, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6399,0.552881,0.812099,0.666786,0.605957,0.621099
2,0.1357,0.482498,0.843263,0.813607,0.730771,0.756878
3,0.0919,0.488587,0.833181,0.777036,0.70917,0.729825
4,0.0779,0.47591,0.837764,0.805352,0.698056,0.733387
5,0.0751,0.482215,0.836847,0.797713,0.734903,0.755574


[I 2025-03-16 16:50:17,663] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.0020667864960216644, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 11, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6642,0.572808,0.804766,0.652589,0.586049,0.601517
2,0.139,0.517781,0.826764,0.823553,0.713242,0.748488
3,0.0892,0.509862,0.820348,0.785714,0.704552,0.732552
4,0.0784,0.494885,0.825848,0.824485,0.724302,0.758798
5,0.073,0.493664,0.830431,0.809158,0.728056,0.753393
6,0.0706,0.491942,0.835014,0.817202,0.726656,0.754508
7,0.0689,0.504093,0.832264,0.836997,0.735064,0.76481
8,0.0703,0.516533,0.825848,0.809378,0.697288,0.725581
9,0.0679,0.476512,0.83868,0.837903,0.739284,0.767724
10,0.0671,0.506265,0.83593,0.796482,0.738145,0.753648


[I 2025-03-16 16:56:22,767] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 1.3245726232440102e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1622,1.943465,0.296059,0.018758,0.054557,0.027477
2,1.7675,1.704826,0.340972,0.033904,0.067654,0.042054
3,1.6015,1.591852,0.393217,0.039557,0.084404,0.05363
4,1.5023,1.512035,0.412466,0.064437,0.090108,0.060407
5,1.4216,1.450048,0.44088,0.069788,0.101302,0.073771
6,1.3596,1.395708,0.454629,0.067453,0.106412,0.078343
7,1.3097,1.354448,0.48121,0.135844,0.121318,0.097422
8,1.2678,1.323457,0.488543,0.12083,0.125076,0.100222
9,1.2342,1.292403,0.499542,0.126434,0.13234,0.108609
10,1.2025,1.269913,0.510541,0.139025,0.137888,0.114816


[I 2025-03-16 16:59:23,637] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.0001802416671442023, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 25, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4402,1.070611,0.610449,0.210743,0.204212,0.182848
2,0.806,0.832627,0.697525,0.331891,0.315811,0.309425
3,0.5723,0.739962,0.728689,0.414187,0.37233,0.376855
4,0.4349,0.693314,0.751604,0.518725,0.432003,0.448655
5,0.3403,0.664153,0.769019,0.519661,0.483212,0.489221


[I 2025-03-16 17:00:54,819] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 0.0044824907456591555, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5712,0.53058,0.818515,0.73891,0.660521,0.688123
2,0.1225,0.486721,0.836847,0.796008,0.711187,0.739696
3,0.085,0.455345,0.84418,0.849273,0.748349,0.782907
4,0.0774,0.471032,0.839597,0.832947,0.723858,0.757764
5,0.0739,0.460866,0.846013,0.844658,0.745742,0.776778
6,0.0772,0.513699,0.83593,0.847508,0.732617,0.77163
7,0.0776,0.507434,0.828598,0.833925,0.723908,0.757919
8,0.0726,0.501564,0.833181,0.846408,0.73423,0.771443
9,0.0709,0.522147,0.825848,0.825855,0.735535,0.764468
10,0.0709,0.507374,0.833181,0.85109,0.740552,0.778739


[I 2025-03-16 17:09:55,617] Trial 80 finished with value: 0.7827433427827111 and parameters: {'learning_rate': 0.0044824907456591555, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 81 with params: {'learning_rate': 0.0019385661340414991, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 25, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6531,0.58171,0.794684,0.653945,0.549851,0.578711
2,0.1361,0.51511,0.826764,0.795231,0.702412,0.726106
3,0.089,0.497441,0.828598,0.780553,0.709908,0.732421
4,0.0771,0.503468,0.828598,0.797354,0.690457,0.722326
5,0.0728,0.501347,0.83593,0.830975,0.727021,0.759751
6,0.0714,0.494634,0.833181,0.816424,0.718209,0.744518
7,0.0705,0.481999,0.836847,0.813213,0.726733,0.750022
8,0.0671,0.500755,0.833181,0.823373,0.732141,0.759313
9,0.0668,0.517554,0.824015,0.813757,0.731651,0.757267
10,0.0676,0.491224,0.832264,0.822614,0.734843,0.759113


[I 2025-03-16 17:12:59,266] Trial 81 pruned. 


Trial 82 with params: {'learning_rate': 0.0043152219138711595, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5337,0.507529,0.830431,0.770735,0.684795,0.712774
2,0.1174,0.47526,0.84418,0.828247,0.737196,0.764299
3,0.0869,0.473408,0.846013,0.81566,0.75505,0.77184
4,0.0823,0.481184,0.843263,0.794535,0.729434,0.74984
5,0.0756,0.477529,0.847846,0.833784,0.768095,0.785604
6,0.0767,0.491154,0.843263,0.831456,0.747869,0.774296
7,0.0761,0.494475,0.846013,0.809549,0.727443,0.753086
8,0.0727,0.500331,0.834097,0.812767,0.736905,0.760443
9,0.0739,0.507485,0.832264,0.824856,0.745913,0.770863
10,0.0695,0.499254,0.836847,0.823644,0.734433,0.762542


[I 2025-03-16 17:15:54,003] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 8.639644976082937e-05, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 36, 'lambda_param': 0.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6325,1.305257,0.501375,0.119108,0.132103,0.105686
2,1.1029,1.075065,0.612282,0.175265,0.210747,0.185131
3,0.8877,0.94624,0.649863,0.28386,0.251899,0.242033
4,0.7389,0.859224,0.693859,0.329906,0.302265,0.300399
5,0.6335,0.814992,0.705775,0.34224,0.327388,0.326086


[I 2025-03-16 17:17:23,277] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 0.0048826470826988785, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 29, 'lambda_param': 0.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5398,0.520157,0.828598,0.753653,0.687825,0.705292
2,0.1161,0.492806,0.830431,0.795807,0.714339,0.740626
3,0.0858,0.486693,0.83868,0.798361,0.732999,0.751243
4,0.0792,0.466053,0.842346,0.837287,0.736228,0.767107
5,0.0827,0.479727,0.84418,0.842545,0.740406,0.771653
6,0.0773,0.482292,0.83868,0.805778,0.750341,0.763125
7,0.0735,0.499805,0.839597,0.812133,0.734092,0.759851
8,0.0731,0.508594,0.835014,0.801408,0.732133,0.7509
9,0.0709,0.486423,0.837764,0.774631,0.741693,0.746585
10,0.0699,0.519975,0.833181,0.786792,0.741843,0.753406


[I 2025-03-16 17:23:25,269] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.0027553156002651353, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5847,0.552228,0.808433,0.706552,0.625844,0.649881
2,0.1216,0.51281,0.827681,0.778598,0.693136,0.71987
3,0.086,0.501084,0.834097,0.782319,0.715947,0.734603
4,0.0768,0.505064,0.83593,0.790818,0.733672,0.748866
5,0.0732,0.470041,0.84143,0.784565,0.720578,0.737531
6,0.071,0.480328,0.84143,0.810031,0.728127,0.753899
7,0.0722,0.493345,0.839597,0.82525,0.740737,0.764056
8,0.0689,0.478196,0.835014,0.783849,0.716947,0.737905
9,0.0688,0.482667,0.834097,0.796642,0.724054,0.74485
10,0.0688,0.47152,0.849679,0.811712,0.735875,0.762258


[I 2025-03-16 17:32:38,450] Trial 85 finished with value: 0.7825323547652685 and parameters: {'learning_rate': 0.0027553156002651353, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 5.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 86 with params: {'learning_rate': 0.00024696163656226093, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 46, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2887,0.931977,0.656279,0.286988,0.251614,0.241264
2,0.6468,0.734799,0.745188,0.443301,0.384669,0.394803
3,0.4312,0.668123,0.771769,0.507802,0.469669,0.477849
4,0.3084,0.647916,0.769019,0.543834,0.4844,0.499337
5,0.2316,0.617114,0.79835,0.643145,0.575018,0.592367


[I 2025-03-16 17:34:01,377] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 9.679939970386113e-05, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 50, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7856,1.416313,0.457379,0.08505,0.10837,0.079897
2,1.1735,1.098628,0.603116,0.18409,0.200321,0.176991
3,0.8995,0.920502,0.659945,0.293687,0.261811,0.251096
4,0.7292,0.839208,0.687443,0.316213,0.30128,0.29623
5,0.6138,0.783335,0.712191,0.418234,0.343265,0.355077
6,0.5327,0.753167,0.715857,0.422957,0.359648,0.371059
7,0.4652,0.720286,0.737855,0.433632,0.388231,0.394673
8,0.4113,0.71025,0.747938,0.458079,0.415631,0.423005
9,0.3664,0.689475,0.76077,0.506804,0.456287,0.465834
10,0.3288,0.696511,0.75527,0.490627,0.460809,0.462182


[I 2025-03-16 17:37:02,287] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 0.0017372714220887152, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 52, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7181,0.569956,0.79835,0.664025,0.576394,0.602025
2,0.1488,0.508906,0.821265,0.755036,0.665079,0.689249
3,0.0934,0.511847,0.822181,0.798572,0.69631,0.727953
4,0.0788,0.510307,0.824015,0.809828,0.71521,0.742989
5,0.0728,0.493525,0.835014,0.799242,0.721898,0.747081


[I 2025-03-16 17:38:37,411] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 0.0049123312957182795, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 18, 'lambda_param': 0.9, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5384,0.513971,0.829514,0.749063,0.684633,0.704872
2,0.1148,0.483774,0.840513,0.828496,0.735208,0.767353
3,0.0857,0.458532,0.845096,0.823148,0.742473,0.76865
4,0.0782,0.484181,0.84143,0.850239,0.731575,0.769058
5,0.0784,0.463892,0.847846,0.854841,0.742348,0.776614
6,0.0799,0.499783,0.840513,0.826488,0.723877,0.757277
7,0.076,0.470282,0.847846,0.841907,0.752699,0.782388
8,0.0703,0.483153,0.843263,0.846242,0.748909,0.779962
9,0.0695,0.493986,0.831347,0.820132,0.710477,0.745861
10,0.0679,0.490016,0.836847,0.849479,0.746501,0.778707


[I 2025-03-16 17:47:38,081] Trial 89 finished with value: 0.7974334678892321 and parameters: {'learning_rate': 0.0049123312957182795, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 18, 'lambda_param': 0.9, 'temperature': 2.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 90 with params: {'learning_rate': 0.004562454726203069, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 17, 'lambda_param': 0.8, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5483,0.4941,0.840513,0.820875,0.707945,0.741608
2,0.1164,0.480166,0.83868,0.819725,0.734142,0.758891
3,0.0862,0.469584,0.846929,0.835834,0.745475,0.772369
4,0.0778,0.468409,0.846013,0.834448,0.750213,0.776578
5,0.0784,0.500012,0.845096,0.829936,0.746078,0.771842
6,0.0738,0.471138,0.846929,0.850824,0.751456,0.781863
7,0.074,0.489322,0.836847,0.829231,0.734672,0.765301
8,0.0713,0.483483,0.837764,0.835103,0.742746,0.772778
9,0.0734,0.503844,0.83593,0.84633,0.738308,0.77286
10,0.0733,0.525427,0.828598,0.850917,0.731311,0.771279


[I 2025-03-16 17:56:58,527] Trial 90 finished with value: 0.7875499246221664 and parameters: {'learning_rate': 0.004562454726203069, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 17, 'lambda_param': 0.8, 'temperature': 3.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 91 with params: {'learning_rate': 0.0026012306665513757, 'weight_decay': 0.003, 'adam_beta1': 0.97, 'warmup_steps': 14, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7017,0.546128,0.814849,0.614632,0.600586,0.598207
2,0.1401,0.497292,0.83593,0.763898,0.693089,0.71483
3,0.089,0.491372,0.829514,0.783875,0.705199,0.730367
4,0.0775,0.489473,0.83868,0.796764,0.739892,0.754299
5,0.0739,0.496397,0.837764,0.798807,0.739888,0.758268
6,0.0699,0.489968,0.842346,0.833369,0.755932,0.781293
7,0.071,0.504767,0.837764,0.837317,0.739552,0.773045
8,0.0706,0.492461,0.839597,0.804173,0.730101,0.750637
9,0.0697,0.514953,0.835014,0.81071,0.72789,0.756145
10,0.0679,0.489683,0.840513,0.832045,0.734383,0.767984


[I 2025-03-16 18:03:04,442] Trial 91 pruned. 


Trial 92 with params: {'learning_rate': 0.0037941864974843607, 'weight_decay': 0.001, 'adam_beta1': 0.98, 'warmup_steps': 30, 'lambda_param': 0.9, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6957,0.537628,0.813016,0.65819,0.592521,0.609248
2,0.1411,0.481582,0.836847,0.800949,0.704794,0.735862
3,0.0898,0.488088,0.836847,0.822547,0.723383,0.756112
4,0.0812,0.479019,0.845096,0.83208,0.731973,0.76352
5,0.0777,0.466019,0.847846,0.826494,0.741799,0.768442
6,0.0744,0.468324,0.847846,0.856091,0.747736,0.780291
7,0.0731,0.477732,0.846013,0.836843,0.7391,0.769503
8,0.0735,0.474781,0.839597,0.826569,0.730509,0.756192
9,0.0722,0.478366,0.848763,0.814756,0.743567,0.764478
10,0.0724,0.51055,0.832264,0.837451,0.739992,0.771956


[I 2025-03-16 18:11:54,371] Trial 92 finished with value: 0.7941084235913941 and parameters: {'learning_rate': 0.0037941864974843607, 'weight_decay': 0.001, 'adam_beta1': 0.98, 'warmup_steps': 30, 'lambda_param': 0.9, 'temperature': 3.0}. Best is trial 33 with value: 0.8044066847863501.


Trial 93 with params: {'learning_rate': 0.0014629208348970798, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 11, 'lambda_param': 0.9, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7356,0.583058,0.794684,0.563335,0.514699,0.525776
2,0.1694,0.5294,0.822181,0.750255,0.663028,0.689133
3,0.0994,0.501013,0.824931,0.788984,0.69324,0.722848
4,0.0839,0.51457,0.824015,0.834104,0.707974,0.748713
5,0.0753,0.501418,0.839597,0.836993,0.739546,0.767617
6,0.0703,0.497298,0.830431,0.8238,0.738938,0.764992
7,0.0714,0.512072,0.832264,0.842618,0.716009,0.75222
8,0.0676,0.517063,0.824931,0.821371,0.734055,0.761279
9,0.0686,0.505332,0.83593,0.831962,0.736469,0.766352
10,0.0664,0.503541,0.836847,0.812098,0.727597,0.751912


[I 2025-03-16 18:18:11,165] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.00013873790734943777, 'weight_decay': 0.005, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 27, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5055,1.15414,0.569203,0.156565,0.17721,0.155278
2,0.9024,0.897395,0.669111,0.307184,0.278287,0.267085
3,0.6714,0.800237,0.707608,0.35489,0.327419,0.327342
4,0.5352,0.7488,0.728689,0.398707,0.359058,0.362558
5,0.4414,0.714033,0.746104,0.527533,0.436302,0.452085
6,0.3708,0.680725,0.76077,0.512382,0.453159,0.465173
7,0.3161,0.672373,0.772686,0.506834,0.482675,0.484744
8,0.2754,0.654735,0.773602,0.57816,0.500628,0.517759
9,0.2437,0.64897,0.781852,0.598599,0.538519,0.554545
10,0.216,0.651158,0.780018,0.640251,0.531266,0.563346


[I 2025-03-16 18:21:16,744] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 0.003932031392656559, 'weight_decay': 0.003, 'adam_beta1': 0.97, 'warmup_steps': 32, 'lambda_param': 1.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6442,0.538777,0.821265,0.718093,0.650084,0.667304
2,0.1269,0.49189,0.831347,0.833465,0.727709,0.763489
3,0.0883,0.477132,0.845096,0.835444,0.735339,0.770099
4,0.0773,0.484766,0.83593,0.827558,0.730215,0.764281
5,0.0749,0.482089,0.846013,0.827131,0.748394,0.775725
6,0.0739,0.491697,0.833181,0.841213,0.744925,0.775117
7,0.0758,0.505054,0.840513,0.818034,0.732096,0.759902
8,0.0719,0.492648,0.839597,0.863723,0.744818,0.784683
9,0.0693,0.485881,0.850596,0.844564,0.742764,0.774727
10,0.0674,0.4978,0.84418,0.812381,0.736104,0.759815


[I 2025-03-16 18:27:14,819] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.0011421392282510965, 'weight_decay': 0.002, 'adam_beta1': 0.98, 'warmup_steps': 35, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9732,0.663166,0.76077,0.453108,0.434949,0.434488
2,0.252,0.539437,0.807516,0.653728,0.618788,0.622142
3,0.1212,0.518737,0.824931,0.798385,0.69108,0.723762
4,0.0924,0.493431,0.828598,0.806293,0.718648,0.745685
5,0.079,0.502605,0.833181,0.817449,0.727372,0.756909
6,0.0722,0.500921,0.828598,0.799127,0.70325,0.734866
7,0.0712,0.507426,0.824015,0.834936,0.700701,0.742537
8,0.0687,0.498603,0.834097,0.837399,0.732028,0.767613
9,0.0697,0.513888,0.820348,0.779646,0.690664,0.72058
10,0.0706,0.516357,0.820348,0.811492,0.726558,0.753447


[I 2025-03-16 18:30:20,182] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.004030974531882211, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 28, 'lambda_param': 0.7000000000000001, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6227,0.539983,0.814849,0.728399,0.645917,0.671587
2,0.1236,0.47434,0.846013,0.817062,0.740146,0.762419
3,0.088,0.481731,0.835014,0.787601,0.723191,0.740839
4,0.078,0.472725,0.84418,0.83756,0.759081,0.780247
5,0.0773,0.476274,0.842346,0.825138,0.736461,0.766594
6,0.0744,0.508529,0.832264,0.840069,0.740495,0.772745
7,0.072,0.490459,0.837764,0.807662,0.741554,0.761512
8,0.0772,0.5048,0.831347,0.809845,0.728464,0.755194
9,0.0731,0.495206,0.835014,0.836286,0.747741,0.777288
10,0.0703,0.494263,0.839597,0.842475,0.752942,0.781536


[I 2025-03-16 18:36:52,586] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.0005081851661719769, 'weight_decay': 0.0, 'adam_beta1': 0.98, 'warmup_steps': 29, 'lambda_param': 1.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2001,0.815618,0.698442,0.351525,0.319164,0.311056
2,0.4599,0.631649,0.777269,0.514664,0.505901,0.500554
3,0.2396,0.593419,0.789184,0.625713,0.579224,0.594226
4,0.1539,0.558615,0.805683,0.711,0.648408,0.665347
5,0.1166,0.553436,0.812099,0.750981,0.670738,0.696531


[I 2025-03-16 18:38:17,642] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.0027455525273075926, 'weight_decay': 0.0, 'adam_beta1': 0.99, 'warmup_steps': 27, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8244,0.617664,0.786434,0.528643,0.489056,0.495592
2,0.1894,0.515992,0.831347,0.806763,0.723703,0.748327
3,0.1,0.488897,0.831347,0.798598,0.723442,0.7478
4,0.0823,0.498433,0.828598,0.819335,0.732478,0.760617
5,0.0763,0.49978,0.834097,0.798147,0.73479,0.753712


[I 2025-03-16 18:39:47,219] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.0038592562136140316, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 21, 'lambda_param': 1.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5687,0.498543,0.837764,0.750562,0.674574,0.698348
2,0.1163,0.461147,0.84143,0.844038,0.744942,0.776173
3,0.0844,0.469308,0.845096,0.82672,0.749218,0.776114
4,0.0761,0.472124,0.846013,0.842489,0.765421,0.790149
5,0.0739,0.4686,0.837764,0.848721,0.750841,0.784745
6,0.0782,0.488494,0.842346,0.855269,0.749888,0.783947
7,0.0744,0.478165,0.850596,0.829766,0.753734,0.778151
8,0.0695,0.492201,0.842346,0.83044,0.762988,0.785704
9,0.0678,0.484306,0.842346,0.835751,0.757227,0.781933
10,0.0675,0.497283,0.83593,0.820248,0.724664,0.754907


[I 2025-03-16 18:45:59,384] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 3.652719949721788e-05, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0054,1.708061,0.332722,0.03402,0.06603,0.040115
2,1.4854,1.408685,0.44363,0.067548,0.101654,0.075296
3,1.27,1.274092,0.508708,0.142991,0.136785,0.112539
4,1.1512,1.180791,0.554537,0.169569,0.168697,0.15021
5,1.0535,1.106813,0.593034,0.19178,0.196156,0.174884
6,0.9707,1.04253,0.615032,0.214801,0.21279,0.191674
7,0.8975,0.995948,0.641613,0.253824,0.234464,0.218442
8,0.8377,0.95765,0.648946,0.259696,0.249811,0.236016
9,0.7854,0.916312,0.664528,0.309713,0.268153,0.260932
10,0.7392,0.893942,0.673694,0.306444,0.283119,0.277886


[I 2025-03-16 18:48:59,753] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.0024465803871279654, 'weight_decay': 0.002, 'adam_beta1': 0.98, 'warmup_steps': 23, 'lambda_param': 0.8, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7519,0.588392,0.796517,0.582233,0.542341,0.549121
2,0.1571,0.499324,0.827681,0.736144,0.688029,0.701678
3,0.092,0.47866,0.83593,0.83848,0.743558,0.775519
4,0.0795,0.473775,0.840513,0.806772,0.744529,0.762975
5,0.0759,0.487334,0.831347,0.832661,0.736761,0.766191
6,0.0711,0.511958,0.826764,0.793475,0.70045,0.730193
7,0.0712,0.477966,0.836847,0.838804,0.743232,0.775252
8,0.0701,0.504375,0.829514,0.820063,0.735885,0.763224
9,0.0686,0.500557,0.834097,0.83079,0.729023,0.758731
10,0.0672,0.488968,0.836847,0.844933,0.738098,0.771943


[I 2025-03-16 18:55:09,877] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 0.004039802662888082, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5481,0.510619,0.835014,0.773263,0.689208,0.716328
2,0.1167,0.462367,0.843263,0.834481,0.730866,0.766579
3,0.0857,0.492472,0.84418,0.831992,0.739561,0.769561
4,0.0779,0.466966,0.846013,0.839515,0.749471,0.778483
5,0.0756,0.482935,0.840513,0.838452,0.752962,0.780262
6,0.0782,0.500635,0.84143,0.824375,0.741212,0.765674
7,0.0788,0.480018,0.840513,0.831739,0.746323,0.771571
8,0.0722,0.475834,0.839597,0.790885,0.727126,0.747644
9,0.0696,0.482127,0.846013,0.822036,0.766501,0.780694
10,0.0678,0.462072,0.847846,0.827213,0.756379,0.780543


[I 2025-03-16 19:04:11,904] Trial 103 finished with value: 0.7912796781827296 and parameters: {'learning_rate': 0.004039802662888082, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 104 with params: {'learning_rate': 0.0032690390859847567, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 5, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5676,0.516977,0.825848,0.770462,0.683042,0.710987
2,0.1192,0.483574,0.836847,0.790331,0.719691,0.736998
3,0.0855,0.490906,0.842346,0.810543,0.738677,0.76152
4,0.0766,0.481906,0.843263,0.837571,0.742151,0.774814
5,0.0734,0.481228,0.84143,0.819408,0.732801,0.759473
6,0.0717,0.476958,0.839597,0.828225,0.735555,0.762156
7,0.0716,0.490123,0.845096,0.830335,0.748938,0.773345
8,0.0719,0.492646,0.842346,0.800801,0.729375,0.750078
9,0.0676,0.476521,0.84418,0.826228,0.744213,0.770646
10,0.0656,0.484792,0.842346,0.824962,0.738828,0.768207


[I 2025-03-16 19:13:20,080] Trial 104 finished with value: 0.7801824597822544 and parameters: {'learning_rate': 0.0032690390859847567, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 5, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}. Best is trial 33 with value: 0.8044066847863501.


Trial 105 with params: {'learning_rate': 0.0035212827408528883, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.5, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5353,0.519903,0.814849,0.731659,0.644941,0.664525
2,0.1194,0.489458,0.83593,0.800832,0.718385,0.740711
3,0.0871,0.472126,0.837764,0.798226,0.72578,0.748478
4,0.0783,0.474702,0.829514,0.810821,0.703844,0.737317
5,0.0746,0.458521,0.846013,0.83209,0.748621,0.773582
6,0.0748,0.478563,0.846929,0.829255,0.752634,0.776278
7,0.0751,0.482208,0.829514,0.813807,0.701939,0.737583
8,0.0724,0.486437,0.84143,0.839015,0.747287,0.777424
9,0.0687,0.464271,0.843263,0.836482,0.74437,0.775254
10,0.0669,0.472757,0.843263,0.853944,0.745826,0.780792


[I 2025-03-16 19:19:12,923] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 0.004533105118239366, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss


[W 2025-03-16 19:19:16,028] Trial 106 failed with parameters: {'learning_rate': 0.004533105118239366, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.0} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 250, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2241, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2581, in _inner_training_loop
    _grad_norm = self.accelerator.clip_grad_norm_(
  File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 2509,

KeyboardInterrupt: 

In [71]:
print(best_trial4)

NameError: name 'best_trial4' is not defined

In [72]:
print("Best normal training score: ", best_trial)
print("Best distilation trianing score: ", best_trial2)
print("Best normal training score with augmentations: ", best_trial3)
print("Best distilation trianing score with augmentations: ",best_trial4)

Best normal training score:  BestRun(run_id='45', objective=0.7063463805279003, hyperparameters={'learning_rate': 0.004693546493886514, 'weight_decay': 0.002, 'adam_beta1': 0.9, 'warmup_steps': 1}, run_summary=None)
Best distilation trianing score:  BestRun(run_id='105', objective=0.48810144914827314, hyperparameters={'learning_rate': 0.004107822923895355, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 2.5}, run_summary=None)
Best normal training score with augmentations:  BestRun(run_id='22', objective=0.777016449838325, hyperparameters={'learning_rate': 0.0026826241523527678, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 27}, run_summary=None)


NameError: name 'best_trial4' is not defined