In [1]:
from transformers import BasicTokenizer, Trainer
from datasets import concatenate_datasets, load_from_disk
import kagglehub
import optuna
import torch
import math
import base

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Reset the random seeds through the project helper before any data or model
# work. NOTE(review): which RNGs are seeded (Python / NumPy / PyTorch) is decided
# inside base.reset_seed — confirm there.
base.reset_seed()

In [3]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise,
# and report which one was picked.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [4]:
# Download the GloVe 6B 300-d dataset through kagglehub; the returned value is
# the local directory holding the files (a kagglehub cache path in the recorded
# run), printed here for reference.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [5]:
# Path of the GloVe text file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset name; it is interpolated into the data, results and log paths.
DATASET = "sst2"

In [6]:
# Load the train / eval / test splits stored on disk for this dataset.
# (The "-logits" suffix suggests they carry precomputed logits — confirm against
# the script that produced them.)
train_data = load_from_disk(f"~/data/{DATASET}/train-logits")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits")

# Augmented variant of the training split.
all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented")

# Union of eval, test and augmented train. The three datasets are already in
# memory, so reuse them (same order as before) instead of reading each one from
# disk a second time.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing basic (whitespace/punctuation) tokenizer from transformers.
tokenizer = BasicTokenizer(do_lower_case=True)

In [7]:
def _tokenize_sentences(dataset):
    """Return one token list per example, tokenizing its "sentence" field."""
    return [tokenizer.tokenize(example["sentence"]) for example in dataset]


# Token lists for each individual split.
train_data_tokens = _tokenize_sentences(train_data)
eval_data_tokens = _tokenize_sentences(eval_data)
test_data_tokens = _tokenize_sentences(test_data)

# Token lists for the augmented training split.
all_train_data_tokens = _tokenize_sentences(all_train_data)

# Token lists for the concatenated dataset.
all_data_tokens = _tokenize_sentences(all_data)

In [8]:
# Build the vocabulary from the tokens of the concatenated dataset.
# NOTE(review): all_data is eval + test + augmented train; the plain train split
# is not included, so the later word_index lookup on train tokens assumes every
# train token also occurs in one of those splits — confirm.
vocab = base.get_vocab(all_data_tokens)

In [9]:
# Map every vocabulary entry to its position in `vocab` (0-based).
word_index = {token: position for position, token in enumerate(vocab)}

In [10]:
# Read the pretrained GloVe vectors from GLOVE_FILE via the project helper (it
# reported 400000 word vectors in the recorded run). Presumably a
# word -> vector mapping — confirm in base.get_embeddings_indeces.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [11]:
print(len(vocab))
# Number of rows requested for the embedding matrix: vocabulary size plus two.
# NOTE(review): the two extra rows are presumably reserved for padding/unknown
# ids, yet word_index starts at 0 — confirm against base.get_embedding_matrix
# and base.padd that no real token shares an id with the padding value.
num_tokens = len(vocab) + 2
# Vector size of the GloVe embeddings in use (glove.6B.300d).
embedding_dim = 300

14621


In [12]:
# Assemble the embedding matrix for the vocabulary from the GloVe vectors; the
# helper prints how many words were converted and how many were missing from
# GloVe (14305 converted, 316 misses in the recorded run).
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 14305 words (316) misses


In [13]:
# Replace every token by its integer id from word_index, sentence by sentence.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

# Same conversion for the augmented training split.
all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [14]:
# Run every id sequence through base.padd with a target length of 60
# (presumably pads/truncates to exactly 60 ids — confirm in base.padd).
train_padded_data = [base.padd(ids, 60) for ids in train_data_index]
eval_padded_data = [base.padd(ids, 60) for ids in eval_data_index]
test_padded_data = [base.padd(ids, 60) for ids in test_data_index]

# Same treatment for the augmented training split.
all_train_padded_data = [base.padd(ids, 60) for ids in all_train_data_index]

In [15]:
# Attach the padded id sequences to each split as an "input_ids" column.
# NOTE(review): the dataset names are rebound in place, so this cell is not
# idempotent — re-running it without reloading the datasets would try to add
# "input_ids" a second time.
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

# Same column for the augmented training split.
all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [16]:
# Training budget: epoch count and batch size. Both feed the epoch -> step
# conversion and are passed to base.get_training_args.
num_epochs = 15
batch_size = 128

In [17]:
# Convert epoch counts into optimizer steps.
data_length = len(train_data)
steps_per_epoch = math.ceil(data_length / batch_size)
# Pruner resource bounds: 5 epochs at the low end, the full run at the high end.
min_r = steps_per_epoch * 5
max_r = steps_per_epoch * num_epochs
# Upper bound for the warm-up search: one tenth of an epoch, rounded up.
warm_up = math.ceil(data_length / batch_size / 10)

In [18]:
def hp_space(trial):
    """Sample one hyperparameter configuration from an Optuna-style trial.

    Draws, in this order:
      * ``learning_rate`` - float in [5e-5, 5e-3], sampled on a log scale,
      * ``weight_decay``  - float in [0, 1e-2] in steps of 1e-3,
      * ``warmup_steps``  - integer in [0, warm_up] (module-level bound).

    The sampled values are printed together with the trial number and
    returned as a dict keyed by the names above.
    """
    learning_rate = trial.suggest_float("learning_rate", 5e-5, 5e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, warm_up)

    params = {
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "warmup_steps": warmup_steps,
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [19]:
# Hyperband pruning, with the resource bounds taken from the step budget
# computed earlier (min_r / max_r).
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)
# Multivariate TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)



In [20]:
def get_BiLSTM():
    """Create a new ``base.BiLSTMClassifier`` on top of the GloVe embedding matrix.

    The classifier is configured with a hidden size of 300, a fully connected
    size of 400, two output classes and trainable (non-frozen) embeddings.
    """
    model = base.BiLSTMClassifier(
        embedding_matrix=embedding_matrix,
        embedding_dim=embedding_dim,
        fc_dim=400,
        hidden_dim=300,
        output_dim=2,
        freeze_embed=False,
    )
    return model

In [21]:
# Reset the random seeds again right before the training arguments and trainer
# are created, so the search does not depend on RNG state consumed by the
# preprocessing cells (assuming base.reset_seed reseeds the relevant RNGs — confirm).
base.reset_seed()

In [22]:
# Output and logging locations for this hyperparameter-search run.
hp_output_dir = f"~/results/{DATASET}/bilstm-base-embedd_hp-search"
hp_logging_dir = f"~/logs/{DATASET}/bilstm-base-embedd_hp-search"

# Training arguments come from the project helper, using the epoch / batch-size
# budget defined earlier.
training_args = base.get_training_args(
    output_dir=hp_output_dir,
    logging_dir=hp_logging_dir,
    epochs=num_epochs,
    batch_size=batch_size,
)

In [23]:
# Trainer for the hyperparameter search. No model instance is passed; instead
# `model_init` supplies a factory so a new model can be built for each run.
# get_BiLSTM already takes no arguments, so it is handed over directly rather
# than wrapped in a redundant lambda.
trainer = Trainer(
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init=get_BiLSTM,
)
  

In [24]:
def _eval_f1(metrics):
    """Objective for the search: the evaluation F1 score from the metrics dict."""
    return metrics["eval_f1"]


# Run the Optuna-backed search: 150 trials, maximizing the evaluation F1, with
# the pruner and sampler configured earlier.
best_trial = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=_eval_f1,
    direction="maximize",
    backend="optuna",
    n_trials=150,
    study_name="Base-embedd",
    sampler=sampler,
    pruner=pruner,
)

[I 2025-03-23 01:16:23,863] A new study created in memory with name: Base-embedd


Trial 0 with params: {'learning_rate': 0.0002805758207667253, 'weight_decay': 0.01, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3856,0.407834,0.81422,0.821123,0.815547,0.813607
2,0.2428,0.46182,0.815367,0.821979,0.814021,0.813916
3,0.1847,0.481049,0.829128,0.833275,0.828082,0.828232
4,0.1484,0.618027,0.81078,0.822933,0.808969,0.808309
5,0.1217,0.552868,0.830275,0.830239,0.830176,0.830203
6,0.1005,0.640374,0.832569,0.834543,0.831839,0.83206
7,0.0843,0.635275,0.836009,0.836348,0.83568,0.835828
8,0.0728,0.660173,0.826835,0.829696,0.827682,0.826669
9,0.062,0.724519,0.830275,0.832481,0.831018,0.830167
10,0.0536,0.779018,0.819954,0.824356,0.821009,0.819629


[I 2025-03-23 01:18:16,235] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.0007875660249889869, 'weight_decay': 0.001, 'warmup_steps': 6}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3273,0.532216,0.801606,0.821728,0.803875,0.799222
2,0.1851,0.428706,0.838303,0.839973,0.837638,0.837871
3,0.1257,0.414811,0.841743,0.841947,0.841479,0.841602
4,0.0911,0.535608,0.844037,0.844097,0.843858,0.843937
5,0.0675,0.672862,0.818807,0.825289,0.817483,0.81742


[I 2025-03-23 01:19:16,643] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 6.533369619026643e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4806,0.453303,0.78555,0.785965,0.78512,0.785243
2,0.3387,0.452103,0.803899,0.806471,0.803012,0.803116
3,0.3021,0.449593,0.798165,0.801358,0.797171,0.797205
4,0.2762,0.46708,0.811927,0.814459,0.811063,0.811203
5,0.254,0.427872,0.818807,0.818856,0.818957,0.818799
6,0.2361,0.441448,0.816514,0.816447,0.816494,0.816466
7,0.2235,0.466256,0.819954,0.820032,0.820125,0.819948
8,0.2094,0.483396,0.811927,0.81574,0.812916,0.81164
9,0.1995,0.492738,0.821101,0.821137,0.820914,0.820987
10,0.19,0.499643,0.813073,0.818122,0.81421,0.812659


[I 2025-03-23 01:21:34,146] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0013035123791853842, 'weight_decay': 0.0, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3212,0.535271,0.81422,0.831612,0.816305,0.812395
2,0.1652,0.441162,0.845183,0.845277,0.844984,0.845076
3,0.1056,0.408245,0.849771,0.849762,0.849657,0.849699
4,0.0704,0.629548,0.834862,0.840039,0.833712,0.833852
5,0.048,0.800593,0.840596,0.840536,0.840606,0.840561


[I 2025-03-23 01:22:30,493] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.002311294500510415, 'weight_decay': 0.002, 'warmup_steps': 8}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2941,0.421498,0.837156,0.841206,0.838154,0.836908
2,0.1447,0.452097,0.844037,0.846367,0.843269,0.843522
3,0.0866,0.467586,0.840596,0.841671,0.840059,0.840277
4,0.0542,0.68991,0.831422,0.831753,0.831723,0.831422
5,0.0345,0.909269,0.819954,0.820032,0.820125,0.819948
6,0.0224,1.099464,0.830275,0.837502,0.828913,0.828906
7,0.0168,1.066942,0.823394,0.823756,0.82304,0.823185
8,0.0124,1.329543,0.826835,0.827385,0.826419,0.826587
9,0.0088,1.363034,0.831422,0.831571,0.831639,0.83142
10,0.006,1.407543,0.822248,0.824252,0.821493,0.821685


[I 2025-03-23 01:25:47,987] Trial 4 finished with value: 0.8230929747275678 and parameters: {'learning_rate': 0.002311294500510415, 'weight_decay': 0.002, 'warmup_steps': 8}. Best is trial 4 with value: 0.8230929747275678.


Trial 5 with params: {'learning_rate': 0.00011635338541918901, 'weight_decay': 0.003, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4377,0.427165,0.78555,0.788969,0.78651,0.785243
2,0.303,0.436175,0.818807,0.819058,0.818494,0.81862
3,0.2572,0.490136,0.813073,0.818791,0.811811,0.811754
4,0.2257,0.562226,0.815367,0.824353,0.81381,0.813511
5,0.2009,0.462195,0.818807,0.819058,0.818494,0.81862


[I 2025-03-23 01:26:54,353] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.0003654769917956456, 'weight_decay': 0.003, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3689,0.422478,0.816514,0.824378,0.817925,0.815807
2,0.2249,0.484178,0.817661,0.825242,0.816231,0.816074
3,0.1648,0.475814,0.827982,0.829269,0.827376,0.827582
4,0.1293,0.570459,0.821101,0.829022,0.819651,0.819505
5,0.1027,0.581857,0.830275,0.83046,0.830008,0.830124
6,0.0838,0.646307,0.841743,0.844324,0.840932,0.841178
7,0.068,0.719631,0.830275,0.831998,0.829587,0.829802
8,0.057,0.718277,0.826835,0.828441,0.827471,0.826769
9,0.0469,0.774219,0.834862,0.836387,0.83548,0.834807
10,0.0386,0.801349,0.825688,0.830333,0.826766,0.825356


[I 2025-03-23 01:30:22,905] Trial 6 finished with value: 0.8348484848484848 and parameters: {'learning_rate': 0.0003654769917956456, 'weight_decay': 0.003, 'warmup_steps': 26}. Best is trial 6 with value: 0.8348484848484848.


Trial 7 with params: {'learning_rate': 9.505122659935192e-05, 'weight_decay': 0.003, 'warmup_steps': 16}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4499,0.432522,0.799312,0.801034,0.799981,0.799217
2,0.3159,0.451132,0.801606,0.804151,0.800718,0.800813
3,0.2735,0.475677,0.81422,0.818638,0.813105,0.81315
4,0.2435,0.538856,0.81078,0.819611,0.809222,0.808877
5,0.2198,0.446257,0.821101,0.821035,0.821083,0.821055
6,0.2004,0.480851,0.813073,0.814432,0.813663,0.813018
7,0.1844,0.522103,0.813073,0.813462,0.812695,0.812837
8,0.1711,0.531393,0.818807,0.820283,0.81942,0.818746
9,0.16,0.554654,0.816514,0.816456,0.816536,0.816479
10,0.1497,0.582711,0.81078,0.814746,0.81179,0.810474


[I 2025-03-23 01:32:22,537] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 0.00040842279473800845, 'weight_decay': 0.008, 'warmup_steps': 8}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3601,0.421573,0.817661,0.823973,0.818925,0.817129
2,0.2204,0.467739,0.825688,0.830313,0.824577,0.824684
3,0.1597,0.465168,0.829128,0.829978,0.828629,0.82882
4,0.1242,0.553264,0.834862,0.838584,0.833881,0.834077
5,0.0974,0.606038,0.826835,0.827385,0.826419,0.826587
6,0.0791,0.670378,0.840596,0.842064,0.839974,0.840208
7,0.0641,0.748194,0.833716,0.834289,0.833302,0.833477
8,0.0529,0.735345,0.827982,0.829485,0.828597,0.827924
9,0.0431,0.808669,0.827982,0.829932,0.828681,0.827891
10,0.0352,0.852866,0.822248,0.825674,0.823177,0.822023


[I 2025-03-23 01:34:19,344] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.0005338741354740678, 'weight_decay': 0.006, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3423,0.474165,0.813073,0.823136,0.814673,0.812093
2,0.2041,0.458852,0.821101,0.825643,0.819988,0.82007
3,0.1435,0.431953,0.836009,0.836348,0.83568,0.835828
4,0.1088,0.538424,0.84289,0.844593,0.842227,0.84247
5,0.084,0.610925,0.824541,0.826332,0.82383,0.82403
6,0.0663,0.713132,0.84289,0.843801,0.842395,0.842607
7,0.0522,0.77128,0.836009,0.83659,0.835596,0.835774
8,0.0413,0.806443,0.837156,0.837205,0.837312,0.837148
9,0.032,0.877942,0.837156,0.839144,0.837859,0.83707
10,0.0249,0.854038,0.826835,0.827164,0.827134,0.826835


[I 2025-03-23 01:38:13,530] Trial 9 finished with value: 0.8279490687151426 and parameters: {'learning_rate': 0.0005338741354740678, 'weight_decay': 0.006, 'warmup_steps': 2}. Best is trial 6 with value: 0.8348484848484848.


Trial 10 with params: {'learning_rate': 0.002185432916630353, 'weight_decay': 0.005, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3044,0.424719,0.832569,0.833354,0.833018,0.832555
2,0.1475,0.426202,0.848624,0.850037,0.848026,0.848272
3,0.0897,0.423796,0.853211,0.853226,0.853077,0.853134
4,0.0572,0.764753,0.822248,0.829962,0.82082,0.820701
5,0.0376,0.888197,0.833716,0.834582,0.834186,0.833698
6,0.0253,0.999529,0.836009,0.835963,0.835933,0.835947
7,0.0168,1.310692,0.837156,0.837209,0.836975,0.837052
8,0.0116,1.200541,0.840596,0.84186,0.840016,0.840243
9,0.0086,1.212975,0.834862,0.834862,0.834975,0.834848
10,0.006,1.519237,0.840596,0.841671,0.840059,0.840277


[I 2025-03-23 01:41:18,384] Trial 10 finished with value: 0.8369363674673409 and parameters: {'learning_rate': 0.002185432916630353, 'weight_decay': 0.005, 'warmup_steps': 34}. Best is trial 10 with value: 0.8369363674673409.


Trial 11 with params: {'learning_rate': 0.004345544743062486, 'weight_decay': 0.006, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2888,0.454401,0.831422,0.83403,0.832228,0.831283
2,0.1325,0.47233,0.833716,0.834586,0.833218,0.833416
3,0.0753,0.532436,0.836009,0.839582,0.835049,0.835255
4,0.0461,0.822198,0.827982,0.82947,0.827334,0.827543
5,0.0294,1.158791,0.816514,0.816478,0.816578,0.81649
6,0.0196,1.035473,0.826835,0.826983,0.82705,0.826833
7,0.0132,1.280626,0.832569,0.832848,0.83226,0.832396
8,0.009,1.204178,0.833716,0.833697,0.833596,0.833637
9,0.0071,1.471045,0.833716,0.834286,0.834102,0.83371
10,0.0049,1.573462,0.833716,0.833668,0.833765,0.833689


[I 2025-03-23 01:44:52,441] Trial 11 finished with value: 0.8348484848484848 and parameters: {'learning_rate': 0.004345544743062486, 'weight_decay': 0.006, 'warmup_steps': 23}. Best is trial 10 with value: 0.8369363674673409.


Trial 12 with params: {'learning_rate': 0.002163019453168294, 'weight_decay': 0.006, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.305,0.467681,0.829128,0.836532,0.830481,0.828542
2,0.1471,0.439692,0.838303,0.838646,0.837975,0.838124
3,0.0889,0.411001,0.847477,0.847467,0.847363,0.847405
4,0.0566,0.667423,0.844037,0.846108,0.843311,0.843563
5,0.036,0.868377,0.84633,0.846983,0.846742,0.846323
6,0.0252,0.919882,0.845183,0.845172,0.845068,0.84511
7,0.0167,1.04086,0.84289,0.843493,0.842479,0.842665
8,0.0112,1.160527,0.844037,0.845814,0.8447,0.84397
9,0.0097,1.297938,0.834862,0.83709,0.835607,0.834757
10,0.0063,1.518605,0.844037,0.844451,0.843689,0.843852


[I 2025-03-23 01:48:05,318] Trial 12 finished with value: 0.8439702128779307 and parameters: {'learning_rate': 0.002163019453168294, 'weight_decay': 0.006, 'warmup_steps': 41}. Best is trial 12 with value: 0.8439702128779307.


Trial 13 with params: {'learning_rate': 0.0019787415933797555, 'weight_decay': 0.005, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3073,0.454518,0.827982,0.8356,0.829355,0.827368
2,0.1494,0.430281,0.847477,0.84899,0.846857,0.847105
3,0.0915,0.398571,0.850917,0.850858,0.850909,0.850879
4,0.0593,0.614836,0.834862,0.835147,0.834554,0.834692
5,0.0378,0.873947,0.824541,0.825932,0.825135,0.824489
6,0.026,1.053363,0.837156,0.841987,0.836049,0.836218
7,0.0183,1.09357,0.834862,0.835055,0.834596,0.834715
8,0.0134,1.026469,0.847477,0.847436,0.847405,0.847419
9,0.0087,1.233366,0.841743,0.842391,0.842153,0.841736
10,0.0071,1.289683,0.833716,0.836352,0.832881,0.833099


[I 2025-03-23 01:51:42,334] Trial 13 finished with value: 0.8358742706568794 and parameters: {'learning_rate': 0.0019787415933797555, 'weight_decay': 0.005, 'warmup_steps': 41}. Best is trial 12 with value: 0.8439702128779307.


Trial 14 with params: {'learning_rate': 0.0029431852786214706, 'weight_decay': 0.01, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3026,0.438311,0.824541,0.825394,0.825008,0.824523
2,0.142,0.448588,0.840596,0.842516,0.83989,0.840132
3,0.0838,0.443005,0.83945,0.83964,0.83969,0.839449
4,0.0514,0.740376,0.803899,0.810643,0.802507,0.802276
5,0.0336,0.839376,0.837156,0.837123,0.837059,0.837087
6,0.021,1.000984,0.826835,0.82888,0.826082,0.826286
7,0.0143,1.22201,0.83945,0.841482,0.838722,0.838962
8,0.0109,1.24233,0.845183,0.845124,0.845194,0.845149
9,0.0081,1.329222,0.833716,0.833668,0.833765,0.833689
10,0.0044,1.809021,0.824541,0.825535,0.823998,0.82419


[I 2025-03-23 01:53:47,184] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.0005455772575156466, 'weight_decay': 0.005, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3536,0.505858,0.81078,0.824918,0.812674,0.809293
2,0.2017,0.453115,0.825688,0.829953,0.824619,0.824744
3,0.1416,0.450608,0.826835,0.828213,0.826208,0.826413
4,0.1066,0.537307,0.844037,0.847891,0.843058,0.843295
5,0.0813,0.616942,0.826835,0.829677,0.825956,0.826143
6,0.0639,0.684133,0.834862,0.836398,0.834217,0.834441
7,0.05,0.788784,0.834862,0.835147,0.834554,0.834692
8,0.0385,0.83469,0.837156,0.837205,0.837312,0.837148
9,0.0301,0.860531,0.838303,0.840175,0.838985,0.838226
10,0.0232,0.911635,0.837156,0.837346,0.837396,0.837155


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-23 01:57:33,451] Trial 15 finished with value: 0.8255963283437546 and parameters: {'learning_rate': 0.0005455772575156466, 'weight_decay': 0.005, 'warmup_steps': 38}. Best is trial 12 with value: 0.8439702128779307.


Trial 16 with params: {'learning_rate': 0.002679548297868375, 'weight_decay': 0.003, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.301,0.451788,0.826835,0.830298,0.827766,0.826616
2,0.1427,0.469149,0.847477,0.84899,0.846857,0.847105
3,0.0835,0.424354,0.852064,0.852242,0.851825,0.851943
4,0.0528,0.769493,0.833716,0.837936,0.83267,0.832843
5,0.0345,0.815044,0.833716,0.834427,0.834144,0.833705
6,0.0207,1.169438,0.853211,0.853649,0.852867,0.853037
7,0.0162,1.029959,0.832569,0.832512,0.832512,0.832512
8,0.0113,1.215224,0.838303,0.838537,0.838564,0.838303
9,0.0084,1.180789,0.833716,0.833696,0.833807,0.833698
10,0.0054,1.483639,0.827982,0.827944,0.827882,0.827908


[I 2025-03-23 02:01:01,939] Trial 16 finished with value: 0.8266157332606465 and parameters: {'learning_rate': 0.002679548297868375, 'weight_decay': 0.003, 'warmup_steps': 31}. Best is trial 12 with value: 0.8439702128779307.


Trial 17 with params: {'learning_rate': 0.0018490521152754369, 'weight_decay': 0.009000000000000001, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3066,0.447245,0.825688,0.834237,0.827145,0.824966
2,0.1533,0.437408,0.847477,0.848095,0.847068,0.847258
3,0.0956,0.393144,0.849771,0.849808,0.849615,0.849683
4,0.0614,0.643426,0.832569,0.834543,0.831839,0.83206
5,0.0402,0.910264,0.826835,0.827275,0.827176,0.826833
6,0.0274,0.949322,0.822248,0.825036,0.821367,0.821538
7,0.0195,1.064293,0.827982,0.828355,0.827629,0.827778
8,0.013,1.210517,0.833716,0.83395,0.833428,0.833556
9,0.0085,1.19807,0.830275,0.830555,0.830555,0.830275
10,0.006,1.496848,0.829128,0.829354,0.828839,0.828964


[I 2025-03-23 02:04:09,529] Trial 17 finished with value: 0.8278041074249605 and parameters: {'learning_rate': 0.0018490521152754369, 'weight_decay': 0.009000000000000001, 'warmup_steps': 27}. Best is trial 12 with value: 0.8439702128779307.


Trial 18 with params: {'learning_rate': 0.004789101188854444, 'weight_decay': 0.006, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2932,0.458573,0.834862,0.836841,0.835565,0.834775
2,0.1334,0.45925,0.833716,0.837257,0.832754,0.832951
3,0.0776,0.501961,0.84289,0.844593,0.842227,0.84247
4,0.0463,0.720977,0.830275,0.83046,0.830008,0.830124
5,0.0292,0.92737,0.816514,0.817135,0.816915,0.816505
6,0.02,1.241577,0.825688,0.825682,0.825545,0.825596
7,0.013,1.307124,0.829128,0.829277,0.829345,0.829126
8,0.0092,1.231633,0.821101,0.821059,0.820999,0.821025
9,0.0063,1.489166,0.815367,0.815301,0.815326,0.815312
10,0.0047,1.658287,0.813073,0.813041,0.812947,0.812985


[I 2025-03-23 02:06:17,636] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.000626859107464978, 'weight_decay': 0.006, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3441,0.528884,0.806193,0.824285,0.808338,0.804152
2,0.196,0.437912,0.833716,0.835823,0.832965,0.833189
3,0.1359,0.432413,0.83945,0.840112,0.839017,0.839205
4,0.1009,0.539161,0.841743,0.843326,0.8411,0.841339
5,0.0764,0.611484,0.830275,0.833931,0.829292,0.829468


[I 2025-03-23 02:07:22,211] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 0.0006312229279735081, 'weight_decay': 0.009000000000000001, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3489,0.514806,0.805046,0.822711,0.807169,0.80304
2,0.1954,0.441664,0.830275,0.832738,0.82946,0.82967
3,0.1342,0.436151,0.831422,0.83283,0.830797,0.831011
4,0.1,0.55073,0.84289,0.846559,0.841932,0.842167
5,0.0759,0.624423,0.823394,0.829557,0.822114,0.822113
6,0.0585,0.724875,0.831422,0.834318,0.830544,0.830749
7,0.0454,0.790552,0.830275,0.83046,0.830008,0.830124
8,0.034,0.823987,0.837156,0.837094,0.837143,0.837114
9,0.0263,0.89388,0.838303,0.839176,0.838775,0.838286
10,0.0197,0.977294,0.836009,0.83599,0.836101,0.835992


[I 2025-03-23 02:11:44,101] Trial 20 finished with value: 0.8300741029435774 and parameters: {'learning_rate': 0.0006312229279735081, 'weight_decay': 0.009000000000000001, 'warmup_steps': 43}. Best is trial 12 with value: 0.8439702128779307.


Trial 21 with params: {'learning_rate': 0.001802091302718475, 'weight_decay': 0.004, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.309,0.463687,0.817661,0.828973,0.819346,0.816578
2,0.1526,0.437598,0.84633,0.846876,0.845942,0.846123
3,0.0944,0.403671,0.84633,0.84627,0.846321,0.846291
4,0.0608,0.655833,0.840596,0.842516,0.83989,0.840132
5,0.0402,0.905252,0.825688,0.825649,0.825587,0.825614


[I 2025-03-23 02:12:39,166] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.0012626855053237266, 'weight_decay': 0.005, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3181,0.480744,0.822248,0.836839,0.824145,0.820851
2,0.1657,0.4293,0.844037,0.844097,0.843858,0.843937
3,0.1062,0.412897,0.84289,0.842877,0.842774,0.842815
4,0.0718,0.583175,0.845183,0.845661,0.844816,0.844988
5,0.0499,0.717952,0.826835,0.827152,0.826503,0.826643
6,0.0349,0.819964,0.831422,0.834033,0.830586,0.830797
7,0.0244,0.891518,0.834862,0.834912,0.835017,0.834855
8,0.017,1.066867,0.832569,0.832513,0.832597,0.832537
9,0.0124,1.143325,0.830275,0.831051,0.829797,0.829985
10,0.0089,1.524531,0.823394,0.824852,0.822746,0.822944


[I 2025-03-23 02:14:33,530] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.0025745547643383033, 'weight_decay': 0.006, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3031,0.437146,0.826835,0.831311,0.827892,0.826523
2,0.1442,0.452969,0.829128,0.833631,0.828039,0.828174
3,0.0857,0.397632,0.83945,0.839454,0.839311,0.839365
4,0.0533,0.653367,0.827982,0.828603,0.827545,0.82772
5,0.0339,0.992984,0.834862,0.834862,0.834975,0.834848
6,0.0222,1.009552,0.844037,0.846108,0.843311,0.843563
7,0.0162,1.140542,0.838303,0.838388,0.838101,0.83819
8,0.0096,1.267484,0.834862,0.834862,0.834975,0.834848
9,0.0065,1.365374,0.837156,0.837346,0.837396,0.837155
10,0.0059,1.602529,0.826835,0.827674,0.826335,0.826523


[I 2025-03-23 02:16:49,085] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 7.176203970997865e-05, 'weight_decay': 0.007, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.477,0.443781,0.791284,0.791225,0.791298,0.791245
2,0.3329,0.449919,0.798165,0.800298,0.797339,0.797445
3,0.2951,0.45053,0.806193,0.809065,0.805264,0.805361
4,0.2682,0.477731,0.81422,0.817961,0.813189,0.813276
5,0.2454,0.430112,0.822248,0.822228,0.822335,0.822229
6,0.2268,0.449332,0.817661,0.817597,0.817662,0.81762
7,0.2133,0.475377,0.819954,0.820184,0.820209,0.819954
8,0.1994,0.499876,0.811927,0.81574,0.812916,0.81164
9,0.1889,0.504383,0.822248,0.822314,0.82204,0.822124
10,0.1792,0.520931,0.813073,0.818122,0.81421,0.812659


[I 2025-03-23 02:19:04,778] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.0012153720924605515, 'weight_decay': 0.007, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3215,0.519735,0.81422,0.831612,0.816305,0.812395
2,0.1674,0.444552,0.841743,0.842151,0.841395,0.841556
3,0.1078,0.409762,0.848624,0.848757,0.848404,0.848509
4,0.0724,0.59409,0.844037,0.845039,0.843521,0.84374
5,0.0501,0.710244,0.836009,0.835992,0.835891,0.835931
6,0.0348,0.932945,0.827982,0.831604,0.826998,0.827163
7,0.0246,0.896436,0.833716,0.833668,0.833765,0.833689
8,0.0165,1.163859,0.838303,0.838382,0.83848,0.838297
9,0.0145,1.065619,0.831422,0.831443,0.83126,0.831324
10,0.0087,1.273122,0.833716,0.833795,0.833512,0.8336


[I 2025-03-23 02:22:19,805] Trial 25 finished with value: 0.8300741029435774 and parameters: {'learning_rate': 0.0012153720924605515, 'weight_decay': 0.007, 'warmup_steps': 38}. Best is trial 12 with value: 0.8439702128779307.


Trial 26 with params: {'learning_rate': 0.0019279751528649045, 'weight_decay': 0.005, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3061,0.467873,0.824541,0.833317,0.826019,0.823788
2,0.1518,0.44247,0.854358,0.855503,0.853825,0.854066
3,0.0936,0.399927,0.847477,0.847714,0.847741,0.847477
4,0.0602,0.661274,0.837156,0.83781,0.836722,0.836908
5,0.0395,0.953344,0.829128,0.829361,0.829387,0.829128
6,0.026,0.955623,0.841743,0.844324,0.840932,0.841178
7,0.019,1.060575,0.833716,0.83494,0.833133,0.833347
8,0.0113,1.128032,0.833716,0.83395,0.833428,0.833556
9,0.0083,1.162806,0.833716,0.83558,0.833007,0.833231
10,0.0075,1.154032,0.838303,0.838256,0.838354,0.838277


[I 2025-03-23 02:26:10,363] Trial 26 finished with value: 0.8220508911279546 and parameters: {'learning_rate': 0.0019279751528649045, 'weight_decay': 0.005, 'warmup_steps': 30}. Best is trial 12 with value: 0.8439702128779307.


Trial 27 with params: {'learning_rate': 0.00021059103361382344, 'weight_decay': 0.001, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4052,0.412031,0.81078,0.816573,0.812,0.810275
2,0.2633,0.457236,0.823394,0.826951,0.822409,0.822554
3,0.2076,0.498031,0.818807,0.825289,0.817483,0.81742
4,0.1732,0.603747,0.817661,0.827793,0.81602,0.815653
5,0.1453,0.512178,0.830275,0.830218,0.830218,0.830218


[I 2025-03-23 02:27:06,486] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.0034381475356838795, 'weight_decay': 0.002, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3009,0.435805,0.832569,0.834086,0.833186,0.832512
2,0.1398,0.497247,0.834862,0.837948,0.833965,0.834178
3,0.0811,0.437455,0.845183,0.846903,0.844521,0.84477
4,0.0503,0.869045,0.826835,0.829131,0.82604,0.82624
5,0.0325,0.916896,0.821101,0.82115,0.821251,0.821092


[I 2025-03-23 02:28:03,811] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 0.00011735172641973649, 'weight_decay': 0.003, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.43,0.428661,0.786697,0.78998,0.787636,0.78641
2,0.3021,0.435129,0.81422,0.814461,0.813905,0.814028
3,0.2565,0.48152,0.816514,0.819666,0.815568,0.815698
4,0.2249,0.556271,0.811927,0.820544,0.81039,0.810079
5,0.1995,0.454853,0.818807,0.819186,0.819125,0.818806
6,0.1793,0.52122,0.813073,0.813301,0.813326,0.813073
7,0.1636,0.54152,0.813073,0.813269,0.812779,0.812894
8,0.1502,0.55017,0.821101,0.821597,0.821462,0.821097
9,0.1398,0.574535,0.822248,0.822478,0.822504,0.822247
10,0.1293,0.637326,0.811927,0.818788,0.813253,0.811306


[I 2025-03-23 02:30:36,476] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.0009328656464856645, 'weight_decay': 0.005, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3333,0.539386,0.802752,0.824142,0.805085,0.800229
2,0.1777,0.437331,0.841743,0.844324,0.840932,0.841178
3,0.1177,0.408236,0.844037,0.844576,0.843647,0.843826
4,0.0832,0.57312,0.845183,0.846104,0.844689,0.844904
5,0.0605,0.680627,0.830275,0.833611,0.829334,0.829521
6,0.0441,0.805905,0.831422,0.835605,0.830376,0.830537
7,0.0327,0.822669,0.83945,0.839744,0.839143,0.839284
8,0.0229,0.9555,0.836009,0.836462,0.835638,0.835802
9,0.0171,1.105546,0.834862,0.834862,0.834975,0.834848
10,0.0114,1.074692,0.819954,0.819891,0.819957,0.819914


[I 2025-03-23 02:32:40,223] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.0003838641128009602, 'weight_decay': 0.004, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3662,0.414441,0.819954,0.826304,0.82122,0.81943
2,0.2235,0.48445,0.818807,0.826188,0.817399,0.817269
3,0.1632,0.476068,0.824541,0.826332,0.82383,0.82403
4,0.128,0.584235,0.819954,0.828084,0.818483,0.818308
5,0.1008,0.569517,0.833716,0.834049,0.833386,0.833531
6,0.0814,0.641865,0.84633,0.848421,0.845605,0.845863
7,0.066,0.72402,0.838303,0.839756,0.83768,0.837909
8,0.0552,0.713246,0.836009,0.838115,0.836733,0.835914
9,0.0451,0.789124,0.837156,0.840233,0.838027,0.836988
10,0.0368,0.809929,0.831422,0.83461,0.832312,0.831235


[I 2025-03-23 02:35:48,056] Trial 31 finished with value: 0.8371345444597018 and parameters: {'learning_rate': 0.0003838641128009602, 'weight_decay': 0.004, 'warmup_steps': 20}. Best is trial 12 with value: 0.8439702128779307.


Trial 32 with params: {'learning_rate': 0.00027835289963160396, 'weight_decay': 0.004, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3802,0.405899,0.815367,0.820445,0.816505,0.814958
2,0.2431,0.462039,0.819954,0.825423,0.818736,0.818753
3,0.185,0.486068,0.831422,0.83634,0.830292,0.830421
4,0.1486,0.597172,0.808486,0.819945,0.806717,0.806087
5,0.1221,0.542643,0.826835,0.826772,0.826798,0.826784
6,0.1005,0.630042,0.832569,0.834089,0.831923,0.832141
7,0.0844,0.650216,0.840596,0.840945,0.840269,0.84042
8,0.0729,0.651694,0.825688,0.828401,0.826513,0.825533
9,0.0622,0.722384,0.832569,0.835047,0.833354,0.832442
10,0.0535,0.782923,0.821101,0.826063,0.822219,0.820724


[I 2025-03-23 02:37:49,614] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 0.0007553826543667807, 'weight_decay': 0.0, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3325,0.534825,0.803899,0.821872,0.806043,0.801835
2,0.1864,0.432844,0.833716,0.83558,0.833007,0.833231
3,0.1262,0.431521,0.83945,0.839454,0.839311,0.839365
4,0.0919,0.556941,0.84633,0.848421,0.845605,0.845863
5,0.0688,0.652589,0.827982,0.832648,0.826871,0.826991


[I 2025-03-23 02:38:56,877] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.002266795389700779, 'weight_decay': 0.007, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3053,0.441981,0.834862,0.836606,0.835522,0.834792
2,0.1468,0.455119,0.83945,0.842007,0.838638,0.838877
3,0.0881,0.419408,0.852064,0.852959,0.852541,0.852048
4,0.0563,0.671594,0.833716,0.834289,0.833302,0.833477
5,0.0363,0.805948,0.840596,0.840676,0.840774,0.840591
6,0.023,1.010015,0.844037,0.844097,0.843858,0.843937
7,0.0165,1.021498,0.833716,0.83494,0.833133,0.833347
8,0.0113,1.294218,0.841743,0.842917,0.841185,0.841409
9,0.0082,1.209132,0.841743,0.841743,0.841858,0.84173
10,0.0057,1.639579,0.817661,0.818792,0.817073,0.817257


[I 2025-03-23 02:41:31,371] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.0009779964664909616, 'weight_decay': 0.005, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3221,0.54293,0.802752,0.824964,0.805128,0.800124
2,0.1745,0.426862,0.844037,0.845864,0.843353,0.843601
3,0.1154,0.403715,0.852064,0.85244,0.851741,0.8519
4,0.0818,0.562557,0.841743,0.842042,0.841437,0.84158
5,0.0585,0.646466,0.829128,0.832606,0.828166,0.828343


[I 2025-03-23 02:42:34,983] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.004049761177508626, 'weight_decay': 0.006, 'warmup_steps': 5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2851,0.469516,0.836009,0.84096,0.837112,0.83568
2,0.1329,0.490674,0.830275,0.834266,0.82925,0.829413
3,0.0739,0.492959,0.849771,0.851298,0.849152,0.849404
4,0.0431,0.7225,0.834862,0.834862,0.834975,0.834848
5,0.0274,0.867671,0.823394,0.823354,0.823293,0.823319
6,0.0172,1.181563,0.837156,0.838305,0.836596,0.836813
7,0.013,0.979141,0.84633,0.846983,0.846742,0.846323
8,0.0093,1.21008,0.838303,0.838546,0.838017,0.838148
9,0.0067,1.250208,0.819954,0.819901,0.819872,0.819886
10,0.0039,1.461479,0.819954,0.821708,0.819241,0.81943


[I 2025-03-23 02:45:15,042] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.002588896143063369, 'weight_decay': 0.005, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3012,0.400097,0.838303,0.839532,0.838859,0.838267
2,0.1431,0.462582,0.84289,0.845079,0.842142,0.842392
3,0.0852,0.418359,0.83945,0.84078,0.840027,0.839408
4,0.053,0.716951,0.831422,0.834033,0.830586,0.830797
5,0.0328,0.987312,0.831422,0.831445,0.831555,0.831411
6,0.023,1.045561,0.830275,0.83055,0.829965,0.8301
7,0.0155,1.2084,0.832569,0.832513,0.832597,0.832537
8,0.0101,1.550216,0.837156,0.837273,0.836933,0.837033
9,0.008,1.401489,0.825688,0.825629,0.825629,0.825629
10,0.005,1.658965,0.826835,0.82802,0.82625,0.826451


[I 2025-03-23 02:47:17,524] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.00020813345639311007, 'weight_decay': 0.005, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4022,0.411714,0.813073,0.819312,0.814337,0.812529
2,0.2631,0.457962,0.825688,0.829278,0.824703,0.824859
3,0.2078,0.501432,0.824541,0.832321,0.823114,0.823014
4,0.1736,0.60554,0.817661,0.828927,0.815936,0.81547
5,0.146,0.510023,0.827982,0.827924,0.827924,0.827924


[I 2025-03-23 02:48:25,250] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.00016183935643448408, 'weight_decay': 0.0, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4142,0.421956,0.799312,0.805737,0.800613,0.798676
2,0.2804,0.453047,0.816514,0.818131,0.81582,0.816002
3,0.2286,0.499008,0.823394,0.829557,0.822114,0.822113
4,0.1958,0.584918,0.815367,0.823847,0.813852,0.813596
5,0.1687,0.497101,0.816514,0.816675,0.816241,0.81635
6,0.1476,0.611392,0.815367,0.818074,0.814484,0.81463
7,0.1322,0.573876,0.823394,0.824299,0.822872,0.823058
8,0.118,0.571202,0.825688,0.826624,0.826177,0.825665
9,0.1073,0.640185,0.832569,0.835614,0.833439,0.832396
10,0.098,0.655994,0.823394,0.826377,0.824261,0.823212


[I 2025-03-23 02:51:23,058] Trial 39 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 0.00016183935643448408, 'weight_decay': 0.0, 'warmup_steps': 17}. Best is trial 12 with value: 0.8439702128779307.


Trial 40 with params: {'learning_rate': 0.00020909750162303558, 'weight_decay': 0.009000000000000001, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4003,0.412197,0.811927,0.819235,0.813295,0.811255
2,0.2629,0.456692,0.824541,0.828277,0.823535,0.823678
3,0.2075,0.502018,0.825688,0.834245,0.824198,0.824055
4,0.1732,0.611423,0.817661,0.828927,0.815936,0.81547
5,0.1457,0.5097,0.823394,0.823335,0.823335,0.823335


[I 2025-03-23 02:52:28,483] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.00024275191533810534, 'weight_decay': 0.002, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3944,0.413271,0.809633,0.818316,0.811127,0.808787
2,0.2523,0.446891,0.819954,0.825033,0.818778,0.81882
3,0.1954,0.492909,0.832569,0.838516,0.831334,0.831419
4,0.1599,0.616459,0.813073,0.827229,0.811137,0.810323
5,0.1327,0.53443,0.826835,0.826906,0.826629,0.826714
6,0.1111,0.648129,0.830275,0.833611,0.829334,0.829521
7,0.0947,0.615779,0.837156,0.837552,0.836806,0.836963
8,0.0826,0.648353,0.833716,0.837563,0.834691,0.833477
9,0.0718,0.718358,0.824541,0.827986,0.825471,0.824319
10,0.0634,0.735965,0.819954,0.823679,0.820925,0.819696


[I 2025-03-23 02:54:33,509] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.00059747421507308, 'weight_decay': 0.003, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3466,0.507924,0.816514,0.829283,0.818304,0.815254
2,0.1977,0.447677,0.826835,0.830605,0.825829,0.825983
3,0.1374,0.432935,0.840596,0.841192,0.840185,0.840368
4,0.1027,0.571498,0.840596,0.845299,0.839511,0.839706
5,0.0784,0.628296,0.827982,0.833026,0.826829,0.826929


[I 2025-03-23 02:55:47,183] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.000512429179107631, 'weight_decay': 0.004, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3516,0.468124,0.816514,0.824378,0.817925,0.815807
2,0.2071,0.45735,0.826835,0.831667,0.825703,0.825806
3,0.1465,0.445591,0.838303,0.839035,0.837848,0.838042
4,0.1114,0.554434,0.845183,0.849589,0.844142,0.844371
5,0.0857,0.620357,0.827982,0.830983,0.827082,0.827269
6,0.0685,0.672802,0.83945,0.840112,0.839017,0.839205
7,0.0536,0.806978,0.832569,0.833073,0.832176,0.832343
8,0.0428,0.792098,0.840596,0.840549,0.840648,0.840571
9,0.0334,0.84391,0.837156,0.839144,0.837859,0.83707
10,0.0265,0.929201,0.825688,0.825737,0.82584,0.82568


[I 2025-03-23 02:58:33,949] Trial 43 finished with value: 0.8302314690576542 and parameters: {'learning_rate': 0.000512429179107631, 'weight_decay': 0.004, 'warmup_steps': 17}. Best is trial 12 with value: 0.8439702128779307.


Trial 44 with params: {'learning_rate': 0.0003432988635071501, 'weight_decay': 0.003, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3708,0.411313,0.81422,0.821123,0.815547,0.813607
2,0.2302,0.48591,0.81422,0.823426,0.812642,0.812309
3,0.1701,0.473735,0.827982,0.830421,0.827166,0.827368
4,0.1343,0.58093,0.818807,0.82666,0.817357,0.817191
5,0.1073,0.565364,0.834862,0.834828,0.834765,0.834792
6,0.0878,0.635056,0.840596,0.842064,0.839974,0.840208
7,0.072,0.703188,0.83945,0.840611,0.83889,0.839111
8,0.061,0.693863,0.834862,0.83709,0.835607,0.834757
9,0.0507,0.753031,0.833716,0.835344,0.834354,0.833652
10,0.0421,0.820728,0.826835,0.83206,0.827976,0.826451


[I 2025-03-23 03:00:40,863] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 0.0007917447427498762, 'weight_decay': 0.002, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.337,0.544868,0.797018,0.820955,0.799497,0.794032
2,0.1851,0.430338,0.838303,0.839973,0.837638,0.837871
3,0.1247,0.419352,0.838303,0.838258,0.838227,0.838241
4,0.09,0.563611,0.845183,0.846903,0.844521,0.84477
5,0.0666,0.675517,0.827982,0.832648,0.826871,0.826991
6,0.0497,0.770379,0.831422,0.834932,0.83046,0.830647
7,0.0373,0.827277,0.833716,0.833668,0.833765,0.833689
8,0.0265,0.914138,0.831422,0.83136,0.831386,0.831372
9,0.0198,1.001448,0.834862,0.834827,0.834933,0.834841
10,0.0145,1.145875,0.822248,0.82246,0.821956,0.822077


[I 2025-03-23 03:02:53,653] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 0.000334111767547559, 'weight_decay': 0.003, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3734,0.415291,0.822248,0.827806,0.82343,0.821814
2,0.2309,0.474385,0.818807,0.825289,0.817483,0.81742
3,0.1708,0.466486,0.832569,0.833693,0.832007,0.832216
4,0.1348,0.586752,0.809633,0.818175,0.808095,0.807763
5,0.1082,0.577228,0.834862,0.834977,0.834638,0.834737
6,0.0886,0.644578,0.836009,0.837061,0.83547,0.83568
7,0.0725,0.681863,0.833716,0.834162,0.833344,0.833505
8,0.0615,0.700663,0.830275,0.831786,0.830892,0.830218
9,0.0513,0.761195,0.831422,0.833043,0.83206,0.831358
10,0.0428,0.806658,0.829128,0.834383,0.830271,0.82875


[I 2025-03-23 03:06:01,776] Trial 46 finished with value: 0.835972718244181 and parameters: {'learning_rate': 0.000334111767547559, 'weight_decay': 0.003, 'warmup_steps': 26}. Best is trial 12 with value: 0.8439702128779307.


Trial 47 with params: {'learning_rate': 0.002353862913340434, 'weight_decay': 0.005, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3052,0.438993,0.831422,0.835591,0.832439,0.83115
2,0.1458,0.451432,0.847477,0.850562,0.846605,0.846868
3,0.0871,0.431366,0.854358,0.85451,0.854582,0.854356
4,0.0541,0.682729,0.84289,0.843977,0.842353,0.842575
5,0.0347,0.84431,0.830275,0.830239,0.830176,0.830203
6,0.024,0.902972,0.833716,0.835823,0.832965,0.833189
7,0.0149,1.239752,0.832569,0.833354,0.832091,0.832283
8,0.0106,1.23021,0.845183,0.84552,0.845489,0.845183
9,0.0072,1.275625,0.840596,0.841337,0.840143,0.840339
10,0.0049,1.496858,0.830275,0.830654,0.829923,0.830074


[I 2025-03-23 03:09:06,025] Trial 47 finished with value: 0.8288833720754121 and parameters: {'learning_rate': 0.002353862913340434, 'weight_decay': 0.005, 'warmup_steps': 42}. Best is trial 12 with value: 0.8439702128779307.


Trial 48 with params: {'learning_rate': 0.0030474186334273077, 'weight_decay': 0.007, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2987,0.434673,0.830275,0.832481,0.831018,0.830167
2,0.1408,0.47872,0.840596,0.8436,0.839722,0.83996
3,0.083,0.421737,0.850917,0.851477,0.85053,0.850716
4,0.0512,0.728387,0.844037,0.844007,0.843942,0.84397
5,0.0339,0.745395,0.838303,0.838326,0.838438,0.838292
6,0.0215,0.921686,0.831422,0.833042,0.830755,0.830972
7,0.0155,1.03136,0.848624,0.848597,0.848531,0.848559
8,0.0123,1.018907,0.834862,0.834862,0.834975,0.834848
9,0.0076,1.150254,0.829128,0.82957,0.829471,0.829126
10,0.0049,1.48094,0.826835,0.827152,0.826503,0.826643


[I 2025-03-23 03:12:50,384] Trial 48 finished with value: 0.8241896099823025 and parameters: {'learning_rate': 0.0030474186334273077, 'weight_decay': 0.007, 'warmup_steps': 31}. Best is trial 12 with value: 0.8439702128779307.


Trial 49 with params: {'learning_rate': 0.00044320768139836854, 'weight_decay': 0.004, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3596,0.443281,0.819954,0.828131,0.821388,0.819235
2,0.2145,0.469899,0.824541,0.830108,0.823324,0.82337
3,0.1544,0.46506,0.827982,0.82947,0.827334,0.827543
4,0.119,0.549094,0.834862,0.838584,0.833881,0.834077
5,0.0924,0.625334,0.825688,0.828661,0.824787,0.824966


[I 2025-03-23 03:13:59,776] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 0.00016174467971837156, 'weight_decay': 0.006, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4149,0.42205,0.799312,0.805737,0.800613,0.798676
2,0.2805,0.4544,0.816514,0.817924,0.815863,0.816045
3,0.2286,0.496145,0.824541,0.83052,0.823282,0.823303
4,0.1959,0.585697,0.817661,0.826725,0.816105,0.815827
5,0.1686,0.491983,0.821101,0.821035,0.821083,0.821055


[I 2025-03-23 03:15:11,997] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.0002148392754132308, 'weight_decay': 0.002, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4005,0.412976,0.808486,0.815954,0.809874,0.807776
2,0.2605,0.455388,0.825688,0.829608,0.824661,0.824802
3,0.2047,0.493329,0.827982,0.835148,0.826619,0.826594
4,0.1697,0.595277,0.819954,0.830173,0.818315,0.817972
5,0.1419,0.519764,0.824541,0.824556,0.824377,0.824439
6,0.1206,0.66196,0.818807,0.822298,0.81782,0.817945
7,0.1043,0.58658,0.837156,0.837674,0.836764,0.836936
8,0.0917,0.631506,0.832569,0.836926,0.833607,0.832283
9,0.0807,0.708086,0.830275,0.834261,0.831271,0.830017
10,0.0721,0.703959,0.818807,0.821478,0.81963,0.818646


[I 2025-03-23 03:17:19,689] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 6.1005881023266626e-05, 'weight_decay': 0.007, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4828,0.454695,0.78555,0.785965,0.78512,0.785243
2,0.3428,0.449974,0.803899,0.80574,0.803138,0.803278
3,0.307,0.450537,0.805046,0.807766,0.804138,0.804239
4,0.282,0.463026,0.807339,0.810088,0.806433,0.806542
5,0.2604,0.427633,0.815367,0.815348,0.815452,0.815347


[I 2025-03-23 03:18:23,853] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.00022202409964081397, 'weight_decay': 0.004, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3997,0.412722,0.81078,0.817404,0.812084,0.81018
2,0.2589,0.452684,0.822248,0.826283,0.821199,0.821315
3,0.203,0.497761,0.826835,0.833296,0.825535,0.825544
4,0.1682,0.617488,0.816514,0.827444,0.81481,0.814357
5,0.1404,0.521661,0.822248,0.822261,0.822083,0.822145
6,0.1195,0.67186,0.816514,0.819971,0.815526,0.815641
7,0.1028,0.596424,0.833716,0.834289,0.833302,0.833477
8,0.0904,0.635328,0.834862,0.838891,0.835859,0.834611
9,0.0792,0.709258,0.827982,0.831946,0.828976,0.82772
10,0.0706,0.72092,0.825688,0.829631,0.826682,0.825423


[I 2025-03-23 03:21:26,286] Trial 53 finished with value: 0.826796366817282 and parameters: {'learning_rate': 0.00022202409964081397, 'weight_decay': 0.004, 'warmup_steps': 31}. Best is trial 12 with value: 0.8439702128779307.


Trial 54 with params: {'learning_rate': 0.0012203047304135536, 'weight_decay': 0.003, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3173,0.511936,0.821101,0.838828,0.823188,0.819344
2,0.1669,0.436275,0.844037,0.845039,0.843521,0.84374
3,0.1073,0.400799,0.854358,0.854299,0.854372,0.854325
4,0.0734,0.597415,0.840596,0.841497,0.840101,0.840309
5,0.0505,0.72232,0.836009,0.836348,0.83568,0.835828
6,0.0353,0.860721,0.832569,0.835935,0.831628,0.831825
7,0.0248,0.890551,0.837156,0.837445,0.836849,0.836988
8,0.0169,1.07389,0.832569,0.832848,0.83226,0.832396
9,0.0131,1.331398,0.834862,0.835821,0.834344,0.834548
10,0.0095,1.519318,0.811927,0.815017,0.810979,0.811091


[I 2025-03-23 03:23:35,911] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 0.000442770292807566, 'weight_decay': 0.002, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3606,0.444172,0.819954,0.829663,0.821514,0.819069
2,0.2142,0.467948,0.823394,0.82835,0.82224,0.822314
3,0.1537,0.459022,0.829128,0.830522,0.828503,0.828712
4,0.1186,0.539034,0.838303,0.841908,0.837343,0.837559
5,0.0926,0.629814,0.823394,0.826951,0.822409,0.822554
6,0.0742,0.648784,0.844037,0.844715,0.843605,0.843799
7,0.0589,0.768712,0.833716,0.834756,0.833175,0.833382
8,0.0481,0.740062,0.837156,0.837269,0.837354,0.837153
9,0.0381,0.849907,0.836009,0.83893,0.836859,0.835852
10,0.0308,0.865714,0.826835,0.828441,0.827471,0.826769


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-23 03:27:25,023] Trial 55 finished with value: 0.825577049611791 and parameters: {'learning_rate': 0.000442770292807566, 'weight_decay': 0.002, 'warmup_steps': 26}. Best is trial 12 with value: 0.8439702128779307.


Trial 56 with params: {'learning_rate': 0.004913837305728667, 'weight_decay': 0.002, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2834,0.449976,0.841743,0.844003,0.84249,0.841642
2,0.1288,0.428558,0.848624,0.848757,0.848404,0.848509
3,0.0729,0.598137,0.822248,0.826631,0.821156,0.821255
4,0.0431,0.748632,0.831422,0.831499,0.831218,0.831305
5,0.0271,0.818214,0.838303,0.83875,0.838648,0.838301
6,0.0186,0.969263,0.829128,0.830326,0.828545,0.82875
7,0.0134,1.251328,0.833716,0.834751,0.834228,0.833689
8,0.0078,1.235811,0.829128,0.829079,0.82905,0.829063
9,0.0057,1.383949,0.834862,0.836606,0.835522,0.834792
10,0.0052,1.362307,0.825688,0.825682,0.825545,0.825596


[I 2025-03-23 03:29:31,725] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 0.002166814831584714, 'weight_decay': 0.004, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3048,0.45169,0.825688,0.831094,0.82685,0.825283
2,0.1486,0.441557,0.84289,0.844593,0.842227,0.84247
3,0.0899,0.435102,0.854358,0.854311,0.854414,0.854335
4,0.0571,0.662038,0.824541,0.825713,0.823956,0.824153
5,0.0368,0.85325,0.83945,0.839415,0.839522,0.839428
6,0.0246,1.095343,0.834862,0.837372,0.834049,0.834273
7,0.0178,1.247173,0.826835,0.826785,0.826755,0.826769
8,0.0119,1.304485,0.837156,0.837273,0.836933,0.837033
9,0.0077,1.364778,0.847477,0.8475,0.847615,0.847467
10,0.0061,1.721471,0.819954,0.820763,0.819451,0.819629


[I 2025-03-23 03:32:07,080] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.0013530591483388581, 'weight_decay': 0.006, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3173,0.464604,0.825688,0.840079,0.827566,0.824354
2,0.163,0.437296,0.83945,0.83957,0.839227,0.839328
3,0.1038,0.403116,0.847477,0.84743,0.847531,0.847453
4,0.0693,0.619126,0.837156,0.838499,0.836554,0.836777
5,0.0474,0.812567,0.823394,0.824469,0.82283,0.823022
6,0.0328,0.806411,0.833716,0.837589,0.832712,0.832898
7,0.0229,0.895711,0.833716,0.833668,0.833765,0.833689
8,0.0159,1.071341,0.83945,0.840112,0.839017,0.839205
9,0.0114,1.422005,0.829128,0.831448,0.828334,0.828542
10,0.0088,1.170542,0.834862,0.835253,0.834512,0.834667


[I 2025-03-23 03:35:17,568] Trial 58 finished with value: 0.8301000526592943 and parameters: {'learning_rate': 0.0013530591483388581, 'weight_decay': 0.006, 'warmup_steps': 38}. Best is trial 12 with value: 0.8439702128779307.


Trial 59 with params: {'learning_rate': 0.0048602160405686, 'weight_decay': 0.01, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2828,0.470334,0.832569,0.835047,0.833354,0.832442
2,0.131,0.472483,0.829128,0.831997,0.82825,0.828446
3,0.075,0.564672,0.825688,0.828962,0.824745,0.824913
4,0.0442,0.724585,0.831422,0.831987,0.831007,0.83118
5,0.0275,1.019841,0.829128,0.829079,0.82905,0.829063


[I 2025-03-23 03:36:17,589] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.004014238616142541, 'weight_decay': 0.0, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2907,0.47571,0.838303,0.840677,0.839069,0.83819
2,0.1327,0.475225,0.838303,0.842245,0.837301,0.837508
3,0.0749,0.473606,0.844037,0.845223,0.843479,0.843708
4,0.0446,0.680856,0.83945,0.839388,0.839438,0.839408
5,0.0288,1.058765,0.827982,0.828087,0.827755,0.827851
6,0.0189,1.035611,0.834862,0.834807,0.834807,0.834807
7,0.0128,1.275081,0.844037,0.845223,0.843479,0.843708
8,0.0083,1.179875,0.827982,0.828615,0.828387,0.827974
9,0.0064,1.315544,0.832569,0.833516,0.83306,0.832547
10,0.0044,1.693368,0.834862,0.834806,0.834891,0.834831


[I 2025-03-23 03:39:57,785] Trial 60 finished with value: 0.835972718244181 and parameters: {'learning_rate': 0.004014238616142541, 'weight_decay': 0.0, 'warmup_steps': 14}. Best is trial 12 with value: 0.8439702128779307.


Trial 61 with params: {'learning_rate': 0.0014844178918718939, 'weight_decay': 0.001, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3082,0.478578,0.825688,0.840755,0.827608,0.824282
2,0.1596,0.425715,0.847477,0.848095,0.847068,0.847258
3,0.101,0.414671,0.853211,0.853186,0.853119,0.853148
4,0.0667,0.613947,0.84289,0.845079,0.842142,0.842392
5,0.0451,0.826384,0.81422,0.815081,0.813695,0.813867


[I 2025-03-23 03:40:58,497] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.004741350653024037, 'weight_decay': 0.001, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2894,0.44325,0.833716,0.834582,0.834186,0.833698
2,0.1315,0.475818,0.832569,0.834089,0.831923,0.832141
3,0.0736,0.52137,0.834862,0.837948,0.833965,0.834178
4,0.0431,0.747603,0.83945,0.839418,0.839353,0.839381
5,0.0278,0.863869,0.831422,0.831445,0.831555,0.831411
6,0.0182,1.027318,0.836009,0.836091,0.835807,0.835895
7,0.0118,1.133404,0.832569,0.832617,0.832386,0.832462
8,0.0092,1.113636,0.833716,0.833739,0.833554,0.833619
9,0.0066,1.379236,0.836009,0.836091,0.835807,0.835895
10,0.0044,1.511622,0.840596,0.840945,0.840269,0.84042


[I 2025-03-23 03:44:41,971] Trial 62 finished with value: 0.8380981350314647 and parameters: {'learning_rate': 0.004741350653024037, 'weight_decay': 0.001, 'warmup_steps': 14}. Best is trial 12 with value: 0.8439702128779307.


Trial 63 with params: {'learning_rate': 0.004175651431981934, 'weight_decay': 0.001, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2887,0.462101,0.837156,0.839144,0.837859,0.83707
2,0.1327,0.482803,0.833716,0.834289,0.833302,0.833477
3,0.0756,0.506695,0.837156,0.838499,0.836554,0.836777
4,0.0445,0.73923,0.823394,0.823871,0.822998,0.823156
5,0.0277,1.086791,0.825688,0.825623,0.825671,0.825643


[I 2025-03-23 03:45:43,488] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 0.0032394425564755136, 'weight_decay': 0.0, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2884,0.409127,0.831422,0.831753,0.831723,0.831422
2,0.135,0.457787,0.847477,0.848408,0.846984,0.847202
3,0.0785,0.51615,0.836009,0.836348,0.83568,0.835828
4,0.0468,0.74275,0.829128,0.829109,0.829218,0.82911
5,0.0301,0.908738,0.831422,0.83136,0.831428,0.831385
6,0.0195,0.99927,0.829128,0.829202,0.828924,0.829009
7,0.0127,1.295434,0.832569,0.832757,0.832302,0.83242
8,0.009,1.144068,0.827982,0.830996,0.82885,0.827804
9,0.0077,1.488417,0.827982,0.829485,0.828597,0.827924
10,0.006,1.512163,0.834862,0.835658,0.834386,0.83458


[I 2025-03-23 03:48:56,882] Trial 64 finished with value: 0.8346396965865992 and parameters: {'learning_rate': 0.0032394425564755136, 'weight_decay': 0.0, 'warmup_steps': 12}. Best is trial 12 with value: 0.8439702128779307.


Trial 65 with params: {'learning_rate': 0.00478921194008496, 'weight_decay': 0.0, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2876,0.426242,0.823394,0.823443,0.823546,0.823386
2,0.1312,0.482916,0.832569,0.835627,0.83167,0.831875
3,0.075,0.553437,0.823394,0.831383,0.821946,0.821819
4,0.0447,0.699945,0.832569,0.83268,0.832344,0.832442
5,0.0279,0.844326,0.827982,0.82817,0.828218,0.827981


[I 2025-03-23 03:50:02,749] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 0.002553439737298611, 'weight_decay': 0.001, 'warmup_steps': 13}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2948,0.448168,0.821101,0.827685,0.822388,0.820557
2,0.1425,0.445669,0.841743,0.843795,0.841016,0.841262
3,0.0844,0.482419,0.834862,0.837653,0.834007,0.834227
4,0.0521,0.673987,0.836009,0.836159,0.836228,0.836007
5,0.0336,0.871072,0.827982,0.828093,0.828176,0.827978
6,0.0224,0.98553,0.830275,0.830239,0.830176,0.830203
7,0.0147,1.359581,0.827982,0.827944,0.827882,0.827908
8,0.0085,1.44384,0.824541,0.824772,0.824798,0.824541
9,0.0072,1.599374,0.825688,0.825876,0.825924,0.825687
10,0.0059,1.625292,0.830275,0.83055,0.829965,0.8301


[I 2025-03-23 03:53:18,452] Trial 66 finished with value: 0.8336786303874562 and parameters: {'learning_rate': 0.002553439737298611, 'weight_decay': 0.001, 'warmup_steps': 13}. Best is trial 12 with value: 0.8439702128779307.


Trial 67 with params: {'learning_rate': 0.0011018736284734124, 'weight_decay': 0.0, 'warmup_steps': 3}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3124,0.540773,0.81078,0.831445,0.813053,0.808507
2,0.1709,0.449478,0.838303,0.839756,0.83768,0.837909
3,0.1109,0.41523,0.847477,0.847419,0.847447,0.847432
4,0.0764,0.613502,0.833716,0.83608,0.832923,0.833145
5,0.054,0.71168,0.822248,0.822783,0.82183,0.821993


[I 2025-03-23 03:54:54,630] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.0035217967402155163, 'weight_decay': 0.0, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2936,0.476065,0.824541,0.827986,0.825471,0.824319
2,0.1374,0.479821,0.831422,0.833042,0.830755,0.830972
3,0.0796,0.508613,0.833716,0.835352,0.833049,0.833272
4,0.0491,0.747786,0.83945,0.83965,0.839185,0.839307
5,0.0311,0.931127,0.832569,0.83268,0.832344,0.832442
6,0.0201,1.23048,0.829128,0.830155,0.829639,0.829101
7,0.0137,1.316388,0.833716,0.833654,0.833681,0.833666
8,0.01,1.209841,0.836009,0.836163,0.835765,0.835874
9,0.0069,1.452939,0.829128,0.829079,0.82905,0.829063
10,0.0047,1.721052,0.832569,0.834309,0.831881,0.832102


[I 2025-03-23 03:58:04,988] Trial 68 finished with value: 0.826668685126801 and parameters: {'learning_rate': 0.0035217967402155163, 'weight_decay': 0.0, 'warmup_steps': 23}. Best is trial 12 with value: 0.8439702128779307.


Trial 69 with params: {'learning_rate': 0.004586539000806921, 'weight_decay': 0.001, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2877,0.435042,0.833716,0.833654,0.833723,0.833679
2,0.1318,0.461158,0.837156,0.83969,0.836343,0.836575
3,0.0743,0.519043,0.831422,0.835261,0.830418,0.830593
4,0.0451,0.746791,0.829128,0.829106,0.829008,0.829047
5,0.0294,0.945675,0.825688,0.825682,0.825545,0.825596
6,0.0187,1.203874,0.819954,0.819901,0.819872,0.819886
7,0.0137,1.149778,0.818807,0.818746,0.818746,0.818746
8,0.009,1.181556,0.827982,0.828087,0.827755,0.827851
9,0.006,1.394629,0.821101,0.821597,0.821462,0.821097
10,0.0041,1.616499,0.821101,0.821059,0.820999,0.821025


[I 2025-03-23 04:00:39,186] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.0032109758631513803, 'weight_decay': 0.004, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2894,0.482472,0.822248,0.825079,0.823093,0.822077
2,0.1377,0.487396,0.833716,0.83558,0.833007,0.833231
3,0.0793,0.480362,0.826835,0.829397,0.825998,0.826193
4,0.0487,0.658114,0.826835,0.826858,0.826966,0.826824
5,0.0309,1.028043,0.816514,0.819098,0.815652,0.815807
6,0.0218,0.922457,0.827982,0.828087,0.827755,0.827851
7,0.0134,1.233887,0.823394,0.826377,0.824261,0.823212
8,0.0094,1.234412,0.824541,0.824772,0.824798,0.824541
9,0.0072,1.206463,0.819954,0.820184,0.820209,0.819954
10,0.0049,1.616452,0.816514,0.817924,0.815863,0.816045


[I 2025-03-23 04:02:48,890] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.0038675887120878964, 'weight_decay': 0.006, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2993,0.458133,0.824541,0.82711,0.825345,0.824397
2,0.1372,0.485655,0.826835,0.828644,0.826124,0.82633
3,0.0793,0.525987,0.845183,0.846903,0.844521,0.84477
4,0.0482,0.790072,0.833716,0.836352,0.832881,0.833099
5,0.0317,0.833012,0.83945,0.840587,0.839985,0.839419
6,0.0203,0.912109,0.830275,0.832738,0.82946,0.82967
7,0.0137,1.370317,0.832569,0.832533,0.83247,0.832497
8,0.0091,1.461298,0.834862,0.834975,0.835059,0.834859
9,0.0066,1.730181,0.829128,0.829562,0.828755,0.828912
10,0.0053,1.81837,0.826835,0.82802,0.82625,0.826451


[I 2025-03-23 04:05:03,001] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.0005151677048161085, 'weight_decay': 0.005, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3552,0.475143,0.813073,0.823136,0.814673,0.812093
2,0.2057,0.472304,0.822248,0.828598,0.820946,0.820923
3,0.1454,0.441953,0.837156,0.837101,0.837101,0.837101
4,0.1101,0.557869,0.84289,0.847259,0.841848,0.842066
5,0.0841,0.644739,0.827982,0.830983,0.827082,0.827269
6,0.0664,0.668174,0.847477,0.848587,0.846942,0.847171
7,0.0517,0.80684,0.831422,0.831862,0.83105,0.831209
8,0.041,0.787543,0.845183,0.845263,0.845363,0.845178
9,0.0317,0.862213,0.837156,0.839939,0.837985,0.837011
10,0.0249,0.957343,0.834862,0.835052,0.835101,0.834862


[I 2025-03-23 04:08:35,262] Trial 72 finished with value: 0.830185898650498 and parameters: {'learning_rate': 0.0005151677048161085, 'weight_decay': 0.005, 'warmup_steps': 33}. Best is trial 12 with value: 0.8439702128779307.


Trial 73 with params: {'learning_rate': 5.953168512495511e-05, 'weight_decay': 0.01, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4908,0.45429,0.78555,0.786079,0.785078,0.785204
2,0.3447,0.456466,0.798165,0.801656,0.797129,0.79714
3,0.3087,0.446195,0.803899,0.806214,0.803054,0.803172
4,0.2839,0.460856,0.808486,0.811678,0.807517,0.807605
5,0.2624,0.425115,0.813073,0.813025,0.813116,0.813044


[I 2025-03-23 04:09:28,154] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.004865527321699809, 'weight_decay': 0.003, 'warmup_steps': 11}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2852,0.45083,0.841743,0.841792,0.8419,0.841736
2,0.1322,0.469988,0.833716,0.837257,0.832754,0.832951
3,0.0755,0.50501,0.840596,0.840844,0.840311,0.840443
4,0.0447,0.752294,0.826835,0.82888,0.826082,0.826286
5,0.0273,0.827957,0.84289,0.842877,0.842774,0.842815
6,0.0199,1.047528,0.822248,0.822918,0.821788,0.821961
7,0.0135,1.235188,0.829128,0.829079,0.82905,0.829063
8,0.0077,1.429779,0.83945,0.83945,0.839564,0.839436
9,0.0059,1.525679,0.829128,0.829361,0.829387,0.829128
10,0.0058,1.576435,0.840596,0.840619,0.840732,0.840586


[I 2025-03-23 04:12:29,226] Trial 74 finished with value: 0.8276879623969817 and parameters: {'learning_rate': 0.004865527321699809, 'weight_decay': 0.003, 'warmup_steps': 11}. Best is trial 12 with value: 0.8439702128779307.


Trial 75 with params: {'learning_rate': 0.0036189585440114212, 'weight_decay': 0.004, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2969,0.419522,0.830275,0.831219,0.830765,0.830253
2,0.1379,0.469928,0.834862,0.835821,0.834344,0.834548
3,0.0804,0.496124,0.841743,0.842414,0.841311,0.841502
4,0.0483,0.759687,0.840596,0.840582,0.840479,0.840521
5,0.0326,0.934423,0.818807,0.819083,0.819083,0.818807


[I 2025-03-23 04:13:31,696] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 0.0003858204211579141, 'weight_decay': 0.003, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.367,0.425486,0.816514,0.825358,0.81801,0.815698
2,0.2217,0.478925,0.818807,0.825289,0.817483,0.81742
3,0.1614,0.473899,0.827982,0.82947,0.827334,0.827543
4,0.1262,0.561345,0.825688,0.833261,0.824282,0.824208
5,0.0993,0.586711,0.834862,0.835253,0.834512,0.834667
6,0.081,0.641257,0.844037,0.845422,0.843437,0.843674
7,0.0651,0.728412,0.831422,0.832633,0.830839,0.831049
8,0.0542,0.73578,0.830275,0.831393,0.830807,0.830243
9,0.0445,0.771358,0.832569,0.834086,0.833186,0.832512
10,0.036,0.8096,0.822248,0.825674,0.823177,0.822023


[I 2025-03-23 04:15:41,297] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.0003265149281279455, 'weight_decay': 0.005, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3724,0.408799,0.822248,0.827414,0.823388,0.821854
2,0.2323,0.47979,0.818807,0.82666,0.817357,0.817191
3,0.1721,0.480102,0.831422,0.834033,0.830586,0.830797
4,0.1362,0.589899,0.81422,0.823426,0.812642,0.812309
5,0.1096,0.571609,0.833716,0.833795,0.833512,0.8336
6,0.0897,0.650457,0.83945,0.841482,0.838722,0.838962
7,0.0743,0.690901,0.837156,0.838126,0.836638,0.836846
8,0.0631,0.68849,0.832569,0.834086,0.833186,0.832512
9,0.0527,0.750797,0.836009,0.837873,0.836691,0.835931
10,0.0443,0.816761,0.824541,0.830131,0.825724,0.824114


[I 2025-03-23 04:17:52,778] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 0.0021566679981372796, 'weight_decay': 0.007, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3048,0.463463,0.827982,0.834245,0.829229,0.827502
2,0.1489,0.431008,0.84633,0.848421,0.845605,0.845863
3,0.0907,0.414302,0.850917,0.85111,0.851162,0.850917
4,0.0568,0.650425,0.821101,0.823216,0.820325,0.820511
5,0.0368,0.931933,0.834862,0.835369,0.835228,0.834859
6,0.025,0.975109,0.84633,0.847729,0.845731,0.845973
7,0.0165,1.111109,0.834862,0.834806,0.834891,0.834831
8,0.0107,1.242798,0.829128,0.829202,0.828924,0.829009
9,0.0095,1.229084,0.833716,0.83494,0.833133,0.833347
10,0.0063,1.600548,0.825688,0.827604,0.824956,0.825158


[I 2025-03-23 04:19:54,957] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.001162100397100965, 'weight_decay': 0.004, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3244,0.482133,0.821101,0.833413,0.822851,0.819941
2,0.1684,0.442301,0.83945,0.84043,0.838932,0.839144
3,0.1087,0.416054,0.84633,0.846639,0.846026,0.846172
4,0.074,0.582137,0.84633,0.84646,0.84611,0.846214
5,0.0515,0.768186,0.838303,0.838546,0.838017,0.838148
6,0.0365,0.779785,0.831422,0.834033,0.830586,0.830797
7,0.0251,0.944727,0.833716,0.833654,0.833723,0.833679
8,0.0171,1.131793,0.831422,0.831402,0.831302,0.831342
9,0.0131,1.111714,0.830275,0.83178,0.829629,0.829842
10,0.0094,1.306969,0.826835,0.828644,0.826124,0.82633


[I 2025-03-23 04:23:01,533] Trial 79 finished with value: 0.8209417302437252 and parameters: {'learning_rate': 0.001162100397100965, 'weight_decay': 0.004, 'warmup_steps': 39}. Best is trial 12 with value: 0.8439702128779307.


Trial 80 with params: {'learning_rate': 5.799396100371127e-05, 'weight_decay': 0.003, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4936,0.452629,0.78555,0.785477,0.785499,0.785487
2,0.3452,0.459986,0.797018,0.800645,0.795961,0.795953
3,0.3097,0.44424,0.805046,0.807249,0.804222,0.80435
4,0.2852,0.463011,0.806193,0.809653,0.80518,0.80524
5,0.2641,0.424679,0.81422,0.814162,0.814242,0.814185


[I 2025-03-23 04:23:53,892] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.004670404000237616, 'weight_decay': 0.006, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2883,0.440663,0.836009,0.838644,0.836817,0.835874
2,0.131,0.464404,0.831422,0.831501,0.831597,0.831416
3,0.0741,0.513398,0.840596,0.842516,0.83989,0.840132
4,0.0456,0.749897,0.830275,0.83055,0.829965,0.8301
5,0.0294,0.900136,0.840596,0.840552,0.840522,0.840536
6,0.0171,1.031434,0.838303,0.838546,0.838017,0.838148
7,0.0117,1.329012,0.841743,0.841867,0.841521,0.841623
8,0.0086,1.359961,0.823394,0.823354,0.823293,0.823319
9,0.0065,1.417772,0.833716,0.834047,0.834017,0.833715
10,0.0056,1.624669,0.830275,0.832477,0.829502,0.829715


[I 2025-03-23 04:27:35,611] Trial 81 finished with value: 0.8275425605974305 and parameters: {'learning_rate': 0.004670404000237616, 'weight_decay': 0.006, 'warmup_steps': 22}. Best is trial 12 with value: 0.8439702128779307.


Trial 82 with params: {'learning_rate': 0.0016845585869486408, 'weight_decay': 0.008, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3065,0.469807,0.821101,0.832228,0.822767,0.82007
2,0.1548,0.431206,0.844037,0.845223,0.843479,0.843708
3,0.0967,0.399387,0.848624,0.848564,0.848615,0.848585
4,0.0632,0.659925,0.831422,0.834617,0.830502,0.830699
5,0.042,0.909566,0.822248,0.82227,0.822377,0.822236


[I 2025-03-23 04:28:40,975] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 0.0016078242323572562, 'weight_decay': 0.006, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3107,0.456025,0.823394,0.83459,0.825061,0.822377
2,0.1564,0.447349,0.848624,0.849319,0.848194,0.848393
3,0.0985,0.403286,0.854358,0.854352,0.854246,0.854289
4,0.0641,0.658607,0.834862,0.836856,0.834133,0.83436
5,0.0426,0.934453,0.836009,0.836088,0.836185,0.836004
6,0.0295,0.95892,0.834862,0.837653,0.834007,0.834227
7,0.0206,0.984061,0.829128,0.829834,0.829555,0.829117
8,0.0142,1.166124,0.832569,0.832758,0.832807,0.832568
9,0.0108,1.23811,0.829128,0.829834,0.829555,0.829117
10,0.0073,1.323493,0.821101,0.821197,0.820872,0.820965


[I 2025-03-23 04:30:35,542] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 0.00017339698016545857, 'weight_decay': 0.003, 'warmup_steps': 16}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4092,0.414638,0.807339,0.811778,0.808411,0.806973
2,0.2753,0.454545,0.819954,0.822442,0.819115,0.819287
3,0.2225,0.500455,0.818807,0.82666,0.817357,0.817191
4,0.1897,0.58563,0.81422,0.822914,0.812684,0.812395
5,0.1622,0.494615,0.827982,0.827917,0.827966,0.827937


[I 2025-03-23 04:31:32,951] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.003049766856709297, 'weight_decay': 0.0, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2881,0.488223,0.827982,0.830176,0.828724,0.827872
2,0.1383,0.469208,0.838303,0.840451,0.837554,0.837791
3,0.0812,0.475404,0.836009,0.836732,0.835554,0.835745
4,0.049,0.805698,0.831422,0.831987,0.831007,0.83118
5,0.0318,0.930115,0.826835,0.827275,0.827176,0.826833
6,0.0212,1.083987,0.825688,0.827604,0.824956,0.825158
7,0.014,1.214622,0.824541,0.824494,0.824587,0.824513
8,0.0096,1.257999,0.826835,0.826816,0.826924,0.826816
9,0.0062,1.544994,0.829128,0.830326,0.828545,0.82875
10,0.0045,1.752709,0.822248,0.823229,0.821704,0.821891


[I 2025-03-23 04:33:54,461] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 0.0002597113179487162, 'weight_decay': 0.01, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3829,0.410945,0.813073,0.820633,0.814463,0.81238
2,0.2479,0.457391,0.819954,0.825828,0.818694,0.818683
3,0.1904,0.502098,0.825688,0.832341,0.824366,0.824354
4,0.1545,0.594166,0.816514,0.826331,0.814894,0.814538
5,0.1273,0.525095,0.829128,0.829081,0.829176,0.829101
6,0.1057,0.632014,0.831422,0.833042,0.830755,0.830972
7,0.0897,0.639135,0.849771,0.850547,0.84932,0.849528
8,0.0779,0.639851,0.832569,0.835323,0.833396,0.83242
9,0.067,0.721556,0.831422,0.835591,0.832439,0.83115
10,0.0585,0.753931,0.823394,0.827657,0.82443,0.823093


[I 2025-03-23 04:36:06,325] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.0012729789412885524, 'weight_decay': 0.005, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3229,0.48076,0.827982,0.842461,0.82986,0.826665
2,0.1659,0.439478,0.845183,0.845661,0.844816,0.844988
3,0.1067,0.415433,0.850917,0.851236,0.850615,0.850764
4,0.0716,0.571562,0.840596,0.841671,0.840059,0.840277
5,0.0496,0.752318,0.83945,0.839505,0.839269,0.839347
6,0.0343,0.870274,0.832569,0.835334,0.831713,0.831924
7,0.0241,0.924088,0.836009,0.836163,0.835765,0.835874
8,0.0166,1.098856,0.836009,0.836248,0.835722,0.835852
9,0.012,1.284357,0.833716,0.833668,0.833639,0.833652
10,0.0088,1.419313,0.834862,0.836191,0.834259,0.834478


[I 2025-03-23 04:39:00,473] Trial 87 finished with value: 0.8300168065455964 and parameters: {'learning_rate': 0.0012729789412885524, 'weight_decay': 0.005, 'warmup_steps': 43}. Best is trial 12 with value: 0.8439702128779307.


Trial 88 with params: {'learning_rate': 0.0032107604541368197, 'weight_decay': 0.005, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2959,0.40396,0.827982,0.828921,0.828471,0.827959
2,0.1398,0.458992,0.840596,0.842765,0.839848,0.840091
3,0.0814,0.47322,0.847477,0.849452,0.846773,0.847033
4,0.0502,0.746821,0.834862,0.835508,0.834428,0.834611
5,0.0313,0.846099,0.836009,0.836248,0.835722,0.835852
6,0.0196,1.158675,0.832569,0.833884,0.831965,0.832179
7,0.0142,1.098768,0.834862,0.835374,0.83447,0.83464
8,0.0089,0.918848,0.827982,0.827925,0.828008,0.827949
9,0.0081,0.975676,0.815367,0.815303,0.815368,0.815326
10,0.0057,1.208464,0.837156,0.837352,0.836891,0.837011


[I 2025-03-23 04:42:34,398] Trial 88 finished with value: 0.8381697354029369 and parameters: {'learning_rate': 0.0032107604541368197, 'weight_decay': 0.005, 'warmup_steps': 28}. Best is trial 12 with value: 0.8439702128779307.


Trial 89 with params: {'learning_rate': 0.004590947025829507, 'weight_decay': 0.006, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2925,0.411653,0.826835,0.8289,0.827555,0.826734
2,0.134,0.475132,0.844037,0.845864,0.843353,0.843601
3,0.0767,0.469634,0.840596,0.840619,0.840732,0.840586
4,0.0469,0.742489,0.837156,0.838126,0.836638,0.836846
5,0.0304,1.000853,0.816514,0.818835,0.815694,0.815859


[I 2025-03-23 04:43:35,619] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 0.0003897553986094342, 'weight_decay': 0.002, 'warmup_steps': 6}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3618,0.414793,0.825688,0.830333,0.826766,0.825356
2,0.2231,0.47579,0.825688,0.830313,0.824577,0.824684
3,0.1629,0.469606,0.832569,0.834089,0.831923,0.832141
4,0.1273,0.582168,0.827982,0.833828,0.826745,0.826801
5,0.1003,0.615084,0.831422,0.831987,0.831007,0.83118
6,0.0817,0.707168,0.845183,0.847393,0.844437,0.844693
7,0.0668,0.752521,0.836009,0.836732,0.835554,0.835745
8,0.0555,0.745036,0.825688,0.828401,0.826513,0.825533
9,0.0458,0.785861,0.829128,0.832004,0.829976,0.828964
10,0.0374,0.833486,0.824541,0.828643,0.825556,0.824258


[I 2025-03-23 04:45:30,415] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.0034212455928428567, 'weight_decay': 0.005, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2929,0.478222,0.830275,0.831393,0.830807,0.830243
2,0.1362,0.482788,0.836009,0.838396,0.835217,0.835446
3,0.0786,0.477302,0.84289,0.84364,0.842437,0.842636
4,0.048,0.668772,0.834862,0.835055,0.834596,0.834715
5,0.0312,0.874783,0.825688,0.826464,0.826135,0.825673
6,0.0205,1.048733,0.836009,0.836723,0.836438,0.835999
7,0.0144,1.04315,0.829128,0.829361,0.829387,0.829128
8,0.01,1.148094,0.830275,0.831582,0.83085,0.830231
9,0.0071,1.308208,0.823394,0.824142,0.822914,0.823093
10,0.0049,1.569009,0.813073,0.815487,0.812232,0.81238


[I 2025-03-23 04:48:00,852] Trial 91 pruned. 


Trial 92 with params: {'learning_rate': 0.0012507050689732177, 'weight_decay': 0.004, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3167,0.515724,0.817661,0.834853,0.819725,0.815911
2,0.166,0.433728,0.844037,0.843976,0.844026,0.843996
3,0.1067,0.405899,0.844037,0.843984,0.843984,0.843984
4,0.0724,0.646432,0.831422,0.832633,0.830839,0.831049
5,0.0501,0.798598,0.829128,0.830326,0.828545,0.82875
6,0.035,0.877006,0.829128,0.836559,0.827745,0.827714
7,0.0244,0.911911,0.834862,0.834828,0.834765,0.834792
8,0.0168,1.213652,0.833716,0.83558,0.833007,0.833231
9,0.0122,1.278065,0.837156,0.838305,0.836596,0.836813
10,0.0097,1.503846,0.827982,0.829917,0.82725,0.827459


[I 2025-03-23 04:51:44,731] Trial 92 finished with value: 0.8323430257058575 and parameters: {'learning_rate': 0.0012507050689732177, 'weight_decay': 0.004, 'warmup_steps': 28}. Best is trial 12 with value: 0.8439702128779307.


Trial 93 with params: {'learning_rate': 0.0032575818659135573, 'weight_decay': 0.009000000000000001, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2932,0.42603,0.83945,0.840988,0.840069,0.839395
2,0.1385,0.458504,0.841743,0.844324,0.840932,0.841178
3,0.0807,0.466084,0.84289,0.843054,0.842648,0.842761
4,0.0492,0.686954,0.841743,0.841867,0.841521,0.841623
5,0.0315,0.932503,0.84289,0.842871,0.842984,0.842873
6,0.0209,1.008145,0.832569,0.833073,0.832176,0.832343
7,0.0148,1.323698,0.826835,0.827523,0.826377,0.826555
8,0.0107,0.972325,0.829128,0.831715,0.828292,0.828495
9,0.0074,1.466252,0.824541,0.829712,0.823367,0.823436
10,0.0066,1.40377,0.827982,0.828748,0.827503,0.827688


[I 2025-03-23 04:55:28,109] Trial 93 finished with value: 0.8207954808928609 and parameters: {'learning_rate': 0.0032575818659135573, 'weight_decay': 0.009000000000000001, 'warmup_steps': 23}. Best is trial 12 with value: 0.8439702128779307.


Trial 94 with params: {'learning_rate': 0.0037441660370267876, 'weight_decay': 0.004, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2968,0.454325,0.826835,0.828663,0.827513,0.826753
2,0.1368,0.495951,0.827982,0.829686,0.827292,0.827502
3,0.0797,0.446898,0.848624,0.848757,0.848404,0.848509
4,0.0477,0.737228,0.832569,0.832757,0.832302,0.83242
5,0.0307,0.946745,0.826835,0.827674,0.826335,0.826523


[I 2025-03-23 04:56:38,072] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 0.0019077893320827623, 'weight_decay': 0.006, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3053,0.451298,0.826835,0.835158,0.828271,0.826143
2,0.152,0.421916,0.841743,0.843326,0.8411,0.841339
3,0.0938,0.419055,0.848624,0.848597,0.848531,0.848559
4,0.0601,0.663405,0.823394,0.827279,0.822367,0.822497
5,0.0388,0.934593,0.829128,0.829458,0.829429,0.829128
6,0.0266,1.028103,0.831422,0.834932,0.83046,0.830647
7,0.0179,1.159498,0.832569,0.832757,0.832302,0.83242
8,0.0121,1.227953,0.841743,0.841749,0.841606,0.84166
9,0.0092,1.256391,0.832569,0.833884,0.831965,0.832179
10,0.0068,1.547837,0.827982,0.829269,0.827376,0.827582


[I 2025-03-23 04:59:46,839] Trial 95 finished with value: 0.8347154433019002 and parameters: {'learning_rate': 0.0019077893320827623, 'weight_decay': 0.006, 'warmup_steps': 26}. Best is trial 12 with value: 0.8439702128779307.


Trial 96 with params: {'learning_rate': 0.004307136811674229, 'weight_decay': 0.006, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2874,0.496783,0.832569,0.835614,0.833439,0.832396
2,0.1325,0.440738,0.837156,0.838305,0.836596,0.836813
3,0.0756,0.534411,0.840596,0.843028,0.839806,0.840049
4,0.0447,0.711891,0.840596,0.840945,0.840269,0.84042
5,0.029,0.907491,0.836009,0.836455,0.836354,0.836007
6,0.0186,1.047013,0.825688,0.825629,0.825629,0.825629
7,0.0142,1.244145,0.829128,0.829151,0.82926,0.829117
8,0.0089,1.325612,0.825688,0.825623,0.825671,0.825643
9,0.007,1.305523,0.821101,0.82148,0.82142,0.8211
10,0.0049,1.631137,0.819954,0.822183,0.819157,0.819336


[I 2025-03-23 05:01:47,605] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.0025412509720062694, 'weight_decay': 0.005, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3007,0.403888,0.834862,0.835369,0.835228,0.834859
2,0.1437,0.449301,0.83945,0.842292,0.838596,0.838831
3,0.0858,0.433803,0.847477,0.8475,0.847615,0.847467
4,0.0533,0.661964,0.836009,0.83659,0.835596,0.835774
5,0.0333,0.894767,0.841743,0.841856,0.841942,0.84174
6,0.0233,1.131532,0.849771,0.850712,0.849278,0.8495
7,0.0168,1.03388,0.832569,0.832513,0.832597,0.832537
8,0.0107,1.165592,0.829128,0.829067,0.829134,0.82909
9,0.0075,1.237961,0.824541,0.825394,0.825008,0.824523
10,0.0055,1.56126,0.825688,0.825682,0.825545,0.825596


[I 2025-03-23 05:03:38,578] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.0043626410694573466, 'weight_decay': 0.0, 'warmup_steps': 8}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2868,0.437852,0.84289,0.84682,0.843868,0.842665
2,0.1322,0.474791,0.837156,0.838499,0.836554,0.836777
3,0.0741,0.54717,0.836009,0.837447,0.835386,0.835609
4,0.045,0.750943,0.841743,0.842151,0.841395,0.841556
5,0.0286,0.958723,0.838303,0.838452,0.838522,0.838301
6,0.0176,0.970105,0.827982,0.828472,0.827587,0.82775
7,0.0118,1.370756,0.827982,0.828603,0.827545,0.82772
8,0.0094,1.389521,0.829128,0.829978,0.828629,0.82882
9,0.0062,1.47728,0.821101,0.82115,0.821251,0.821092
10,0.0045,1.753312,0.815367,0.818965,0.814358,0.814459


[I 2025-03-23 05:06:10,172] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.00024393231274402199, 'weight_decay': 0.002, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3924,0.41558,0.808486,0.817391,0.81,0.807605
2,0.2518,0.448026,0.824541,0.82933,0.823409,0.823499
3,0.1946,0.495927,0.831422,0.837135,0.830208,0.830297
4,0.159,0.61749,0.81422,0.827463,0.812347,0.811643
5,0.1316,0.537272,0.823394,0.823354,0.823293,0.823319


[I 2025-03-23 05:07:12,211] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.0019477517063680384, 'weight_decay': 0.005, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3067,0.469974,0.825688,0.835274,0.827229,0.824859
2,0.1509,0.433075,0.849771,0.850547,0.84932,0.849528
3,0.0929,0.404753,0.854358,0.854352,0.854246,0.854289
4,0.0594,0.648335,0.829128,0.832606,0.828166,0.828343
5,0.0392,0.919082,0.838303,0.838242,0.838311,0.838267
6,0.0271,0.972493,0.826835,0.831667,0.825703,0.825806
7,0.0189,0.936221,0.838303,0.83846,0.838059,0.83817
8,0.0127,1.210063,0.847477,0.847961,0.84711,0.847284
9,0.01,1.212533,0.833716,0.834756,0.833175,0.833382
10,0.0078,1.378375,0.834862,0.838925,0.833839,0.834023


[I 2025-03-23 05:10:13,690] Trial 100 finished with value: 0.8357133916984513 and parameters: {'learning_rate': 0.0019477517063680384, 'weight_decay': 0.005, 'warmup_steps': 36}. Best is trial 12 with value: 0.8439702128779307.


Trial 101 with params: {'learning_rate': 0.0013438279917829018, 'weight_decay': 0.005, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3162,0.48423,0.821101,0.83403,0.822893,0.819873
2,0.1634,0.430641,0.840596,0.840582,0.840479,0.840521
3,0.1041,0.411871,0.841743,0.841867,0.841521,0.841623
4,0.0698,0.640084,0.83945,0.842007,0.838638,0.838877
5,0.0484,0.786166,0.836009,0.836091,0.835807,0.835895
6,0.0332,0.853762,0.829128,0.833275,0.828082,0.828232
7,0.0232,0.973491,0.836009,0.836348,0.83568,0.835828
8,0.0169,1.265847,0.834862,0.834864,0.834723,0.834775
9,0.0122,1.183796,0.84289,0.842831,0.842858,0.842843
10,0.0092,1.498073,0.829128,0.830522,0.828503,0.828712


[I 2025-03-23 05:13:59,682] Trial 101 finished with value: 0.8218914349493913 and parameters: {'learning_rate': 0.0013438279917829018, 'weight_decay': 0.005, 'warmup_steps': 33}. Best is trial 12 with value: 0.8439702128779307.


Trial 102 with params: {'learning_rate': 6.939136979950922e-05, 'weight_decay': 0.0, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4765,0.448271,0.784404,0.784582,0.784078,0.784181
2,0.3348,0.451709,0.801606,0.804151,0.800718,0.800813
3,0.2973,0.451693,0.801606,0.804997,0.800591,0.80063
4,0.2709,0.473385,0.811927,0.815017,0.810979,0.811091
5,0.2482,0.428857,0.818807,0.818807,0.818915,0.818792


[I 2025-03-23 05:15:29,243] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 0.0014379446347447989, 'weight_decay': 0.006, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3162,0.468581,0.824541,0.837277,0.826313,0.82337
2,0.161,0.431201,0.838303,0.838546,0.838017,0.838148
3,0.1019,0.40734,0.844037,0.844037,0.844153,0.844024
4,0.0676,0.60806,0.840596,0.843028,0.839806,0.840049
5,0.0459,0.838391,0.833716,0.834162,0.833344,0.833505
6,0.0309,0.88468,0.826835,0.832051,0.825661,0.825744
7,0.0217,1.022405,0.831422,0.832282,0.830923,0.831118
8,0.0158,1.044869,0.833716,0.833795,0.833891,0.83371
9,0.0112,1.214091,0.838303,0.838891,0.83789,0.838071
10,0.0083,1.300429,0.826835,0.826785,0.826755,0.826769


[I 2025-03-23 05:18:45,753] Trial 103 finished with value: 0.825577049611791 and parameters: {'learning_rate': 0.0014379446347447989, 'weight_decay': 0.006, 'warmup_steps': 40}. Best is trial 12 with value: 0.8439702128779307.


Trial 104 with params: {'learning_rate': 0.002152992166006981, 'weight_decay': 0.005, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3054,0.43736,0.826835,0.83206,0.827976,0.826451
2,0.1484,0.425995,0.848624,0.849476,0.848152,0.848365
3,0.0896,0.423004,0.853211,0.853157,0.853246,0.853183
4,0.0566,0.669829,0.838303,0.840204,0.837596,0.837832
5,0.0368,0.875802,0.837156,0.837121,0.837227,0.837135
6,0.025,0.917582,0.833716,0.835823,0.832965,0.833189
7,0.0169,0.997465,0.837156,0.837352,0.836891,0.837011
8,0.0108,1.296632,0.84289,0.842913,0.843026,0.84288
9,0.0088,1.217808,0.831422,0.831862,0.83105,0.831209
10,0.0078,1.352833,0.823394,0.824,0.822956,0.823126


[I 2025-03-23 05:20:52,711] Trial 104 pruned. 


Trial 105 with params: {'learning_rate': 0.00453521965890269, 'weight_decay': 0.002, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2924,0.431323,0.83945,0.840244,0.839901,0.839436
2,0.1326,0.47785,0.838303,0.841586,0.837385,0.837609
3,0.0768,0.505273,0.832569,0.837318,0.83146,0.831604
4,0.0457,0.820266,0.840596,0.840945,0.840269,0.84042
5,0.0292,0.845444,0.834862,0.834828,0.834765,0.834792
6,0.0199,0.900341,0.847477,0.847436,0.847405,0.847419
7,0.0124,1.174679,0.833716,0.834427,0.834144,0.833705
8,0.008,1.201553,0.832569,0.832506,0.832555,0.832526
9,0.0055,1.41589,0.834862,0.835052,0.835101,0.834862
10,0.0038,1.671368,0.831422,0.83136,0.831428,0.831385


[I 2025-03-23 05:25:00,981] Trial 105 finished with value: 0.8371251183836683 and parameters: {'learning_rate': 0.00453521965890269, 'weight_decay': 0.002, 'warmup_steps': 17}. Best is trial 12 with value: 0.8439702128779307.


Trial 106 with params: {'learning_rate': 0.003782961442765772, 'weight_decay': 0.002, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2865,0.458467,0.819954,0.822496,0.820757,0.819806
2,0.134,0.470925,0.833716,0.83494,0.833133,0.833347
3,0.0773,0.52954,0.837156,0.838708,0.836512,0.83674
4,0.0458,0.765341,0.827982,0.828615,0.828387,0.827974
5,0.0305,0.798528,0.824541,0.824516,0.824419,0.824458


[I 2025-03-23 05:26:04,355] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 0.0049926890191183656, 'weight_decay': 0.003, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2995,0.393417,0.848624,0.851189,0.849415,0.848509
2,0.1319,0.43537,0.84289,0.845344,0.8421,0.842351
3,0.074,0.521139,0.848624,0.851879,0.847731,0.847997
4,0.0446,0.779707,0.836009,0.83867,0.835175,0.835401
5,0.0286,0.787115,0.834862,0.834806,0.834891,0.834831
6,0.0187,1.027923,0.83945,0.839418,0.839353,0.839381
7,0.0133,1.060334,0.833716,0.833739,0.833554,0.833619
8,0.0079,1.28157,0.832569,0.832569,0.832681,0.832555
9,0.006,1.397298,0.845183,0.846681,0.844563,0.844806
10,0.005,1.451578,0.836009,0.838396,0.835217,0.835446


[I 2025-03-23 05:29:07,201] Trial 107 finished with value: 0.8391754315705162 and parameters: {'learning_rate': 0.0049926890191183656, 'weight_decay': 0.003, 'warmup_steps': 18}. Best is trial 12 with value: 0.8439702128779307.


Trial 108 with params: {'learning_rate': 0.0036347802646680472, 'weight_decay': 0.002, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2904,0.421995,0.830275,0.831058,0.830723,0.830261
2,0.1353,0.484425,0.829128,0.831448,0.828334,0.828542
3,0.0776,0.566659,0.819954,0.827135,0.818567,0.818464
4,0.048,0.712037,0.826835,0.829971,0.825914,0.826092
5,0.0301,0.913087,0.827982,0.827944,0.827882,0.827908
6,0.0183,0.964066,0.825688,0.826775,0.825124,0.825321
7,0.013,1.221935,0.827982,0.828031,0.828134,0.827974
8,0.008,1.296028,0.830275,0.830275,0.830386,0.830261
9,0.0077,1.309813,0.827982,0.829932,0.828681,0.827891
10,0.006,1.409969,0.817661,0.817786,0.81741,0.817511


[I 2025-03-23 05:31:05,176] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.004732165586574243, 'weight_decay': 0.002, 'warmup_steps': 16}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2968,0.432924,0.830275,0.830275,0.830386,0.830261
2,0.1326,0.473743,0.831422,0.834617,0.830502,0.830699
3,0.0752,0.566782,0.840596,0.842283,0.839932,0.840171
4,0.0465,0.639447,0.838303,0.838287,0.838185,0.838226
5,0.0294,0.875289,0.824541,0.825394,0.825008,0.824523


[I 2025-03-23 05:32:06,503] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.004745195168820434, 'weight_decay': 0.003, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2918,0.422895,0.840596,0.840746,0.840816,0.840594
2,0.1319,0.459835,0.826835,0.828213,0.826208,0.826413
3,0.0766,0.509263,0.826835,0.829971,0.825914,0.826092
4,0.045,0.678284,0.827982,0.828025,0.827797,0.827872
5,0.0282,0.968431,0.824541,0.824853,0.824209,0.824347


[I 2025-03-23 05:33:32,821] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 0.0005149934199426565, 'weight_decay': 0.003, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3522,0.46607,0.816514,0.82391,0.817883,0.815859
2,0.2062,0.469017,0.823394,0.828737,0.822198,0.822249
3,0.1455,0.448314,0.832569,0.832757,0.832302,0.83242
4,0.1106,0.539212,0.84289,0.845079,0.842142,0.842392
5,0.0853,0.616883,0.829128,0.831997,0.82825,0.828446
6,0.0676,0.686824,0.845183,0.846282,0.844647,0.844873
7,0.0532,0.816947,0.831422,0.831652,0.831134,0.83126
8,0.0423,0.803076,0.83945,0.83945,0.839564,0.839436
9,0.0332,0.846046,0.83945,0.841447,0.840153,0.839365
10,0.0261,0.947128,0.832569,0.832758,0.832807,0.832568


[I 2025-03-23 05:37:00,201] Trial 111 finished with value: 0.8302180685358256 and parameters: {'learning_rate': 0.0005149934199426565, 'weight_decay': 0.003, 'warmup_steps': 22}. Best is trial 12 with value: 0.8439702128779307.


Trial 112 with params: {'learning_rate': 0.002238848015846485, 'weight_decay': 0.003, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.296,0.443711,0.829128,0.833275,0.830144,0.828853
2,0.1446,0.447353,0.836009,0.838396,0.835217,0.835446
3,0.0872,0.425767,0.834862,0.835055,0.834596,0.834715
4,0.0547,0.684155,0.825688,0.826961,0.825082,0.825283
5,0.0346,0.951828,0.823394,0.823506,0.823588,0.823391


[I 2025-03-23 05:38:09,289] Trial 112 pruned. 


Trial 113 with params: {'learning_rate': 0.004997859509791104, 'weight_decay': 0.0, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2892,0.502532,0.833716,0.834751,0.834228,0.833689
2,0.1315,0.446557,0.837156,0.837209,0.836975,0.837052
3,0.075,0.529002,0.844037,0.847235,0.843142,0.843391
4,0.0474,0.785292,0.840596,0.840844,0.840311,0.840443
5,0.0276,0.953883,0.827982,0.829095,0.828513,0.827949
6,0.0201,1.003322,0.83945,0.840408,0.839943,0.839428
7,0.0136,1.298486,0.834862,0.835052,0.835101,0.834862
8,0.0078,1.406371,0.841743,0.842134,0.842069,0.841742
9,0.0055,1.538277,0.832569,0.832569,0.832681,0.832555
10,0.0041,1.855291,0.827982,0.827917,0.827966,0.827937


[I 2025-03-23 05:41:25,884] Trial 113 finished with value: 0.8313046568095228 and parameters: {'learning_rate': 0.004997859509791104, 'weight_decay': 0.0, 'warmup_steps': 17}. Best is trial 12 with value: 0.8439702128779307.


Trial 114 with params: {'learning_rate': 0.004113846679640469, 'weight_decay': 0.003, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2923,0.456299,0.838303,0.840677,0.839069,0.83819
2,0.1336,0.485165,0.832569,0.833517,0.832049,0.83225
3,0.0765,0.484115,0.840596,0.841497,0.840101,0.840309
4,0.0463,0.780949,0.829128,0.832606,0.828166,0.828343
5,0.0292,0.918941,0.83945,0.83965,0.839185,0.839307
6,0.0195,1.105464,0.831422,0.831501,0.831597,0.831416
7,0.013,1.158022,0.819954,0.821708,0.819241,0.81943
8,0.0089,1.237859,0.822248,0.822183,0.822209,0.822195
9,0.0055,1.558012,0.815367,0.815514,0.815578,0.815365
10,0.0045,1.817555,0.815367,0.815301,0.815326,0.815312


[I 2025-03-23 05:43:38,414] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.0019890773207780956, 'weight_decay': 0.0, 'warmup_steps': 13}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3003,0.42577,0.832569,0.837671,0.833691,0.832216
2,0.1503,0.448514,0.840596,0.841497,0.840101,0.840309
3,0.0928,0.425481,0.84289,0.843054,0.842648,0.842761
4,0.0584,0.645819,0.832569,0.832954,0.832218,0.83237
5,0.0369,1.063778,0.827982,0.828261,0.828261,0.827982
6,0.0257,1.06559,0.822248,0.826631,0.821156,0.821255
7,0.0184,1.052928,0.827982,0.828472,0.827587,0.82775
8,0.0121,0.967363,0.837156,0.83781,0.836722,0.836908
9,0.0094,1.217298,0.833716,0.834756,0.833175,0.833382
10,0.0067,1.41499,0.830275,0.832477,0.829502,0.829715


[I 2025-03-23 05:46:51,442] Trial 115 finished with value: 0.8323138767274125 and parameters: {'learning_rate': 0.0019890773207780956, 'weight_decay': 0.0, 'warmup_steps': 13}. Best is trial 12 with value: 0.8439702128779307.


Trial 116 with params: {'learning_rate': 0.00044737416842702846, 'weight_decay': 0.004, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.36,0.454403,0.811927,0.822782,0.813589,0.810843
2,0.2134,0.474124,0.816514,0.822515,0.815231,0.815183
3,0.1527,0.460716,0.827982,0.829081,0.827419,0.827619
4,0.1177,0.543302,0.836009,0.840267,0.834965,0.835149
5,0.0915,0.63212,0.823394,0.826638,0.822451,0.82261


[I 2025-03-23 05:47:58,013] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 0.004247524311451758, 'weight_decay': 0.002, 'warmup_steps': 13}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2871,0.486788,0.832569,0.834785,0.833312,0.832462
2,0.1335,0.500976,0.831422,0.834033,0.830586,0.830797
3,0.0751,0.55985,0.837156,0.839972,0.836301,0.836529
4,0.0466,0.772405,0.827982,0.830421,0.827166,0.827368
5,0.0299,1.022784,0.819954,0.819889,0.819915,0.819901
6,0.02,1.09113,0.831422,0.833268,0.830713,0.830931
7,0.0121,1.214493,0.819954,0.820616,0.819494,0.819664
8,0.0088,1.225809,0.827982,0.829081,0.827419,0.827619
9,0.0059,1.36441,0.829128,0.829067,0.829134,0.82909
10,0.0059,1.33884,0.815367,0.815301,0.815326,0.815312


[I 2025-03-23 05:49:59,352] Trial 117 pruned. 


Trial 118 with params: {'learning_rate': 0.0009752586079423723, 'weight_decay': 0.007, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3318,0.554601,0.81078,0.831445,0.813053,0.808507
2,0.1764,0.448826,0.84289,0.844828,0.842184,0.842432
3,0.1163,0.414517,0.841743,0.841801,0.841564,0.841642
4,0.0814,0.568496,0.841743,0.843114,0.841143,0.841375
5,0.0581,0.693665,0.834862,0.836398,0.834217,0.834441
6,0.0422,0.894483,0.833716,0.837589,0.832712,0.832898
7,0.0303,0.959534,0.838303,0.838287,0.838185,0.838226
8,0.0214,1.100089,0.836009,0.835948,0.835975,0.835961
9,0.0156,1.123473,0.827982,0.828472,0.827587,0.82775
10,0.0112,1.399631,0.818807,0.81875,0.818831,0.818773


[I 2025-03-23 05:52:25,276] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.0029749272741748083, 'weight_decay': 0.005, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3003,0.416866,0.834862,0.83599,0.835396,0.834831
2,0.1409,0.455915,0.840596,0.843909,0.83968,0.839912
3,0.0822,0.440591,0.845183,0.845172,0.845068,0.84511
4,0.0506,0.658822,0.822248,0.824499,0.821451,0.821638
5,0.0318,0.917955,0.824541,0.824869,0.82484,0.824541


[I 2025-03-23 05:53:39,988] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 0.003645360633206452, 'weight_decay': 0.008, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3004,0.404924,0.834862,0.835503,0.83527,0.834855
2,0.1378,0.478338,0.830275,0.834983,0.829166,0.829297
3,0.0803,0.449541,0.848624,0.849835,0.848068,0.848305
4,0.048,0.849893,0.825688,0.827604,0.824956,0.825158
5,0.0323,0.998664,0.827982,0.828031,0.828134,0.827974
6,0.0224,1.098842,0.825688,0.829608,0.824661,0.824802
7,0.0153,1.078295,0.831422,0.831374,0.831344,0.831358
8,0.0092,1.324609,0.825688,0.827375,0.824998,0.825202
9,0.0063,1.374416,0.829128,0.829066,0.829092,0.829078
10,0.0045,1.806957,0.819954,0.821288,0.819325,0.819515


[I 2025-03-23 05:55:59,790] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 0.004677094793655873, 'weight_decay': 0.004, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2869,0.483194,0.827982,0.829701,0.828639,0.827908
2,0.1303,0.469637,0.837156,0.839169,0.836428,0.836661
3,0.0755,0.52619,0.847477,0.848781,0.846899,0.847139
4,0.0461,0.77343,0.834862,0.835147,0.834554,0.834692
5,0.0288,0.882089,0.818807,0.818918,0.818999,0.818804


[I 2025-03-23 05:57:02,166] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 0.0017297498247800434, 'weight_decay': 0.005, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3099,0.458823,0.823394,0.829171,0.824598,0.822944
2,0.154,0.444326,0.841743,0.842567,0.841269,0.841473
3,0.096,0.402305,0.856651,0.856605,0.856708,0.856629
4,0.0619,0.665803,0.836009,0.837892,0.835301,0.835531
5,0.0409,0.882666,0.832569,0.832849,0.832849,0.832569
6,0.0272,0.949435,0.84289,0.844373,0.842269,0.842507
7,0.0187,0.925239,0.836009,0.836088,0.836185,0.836004
8,0.0142,1.185649,0.832569,0.833207,0.832134,0.832314
9,0.0098,1.251516,0.836009,0.837447,0.835386,0.835609
10,0.0067,1.510525,0.829128,0.831997,0.82825,0.828446


[I 2025-03-23 06:00:25,750] Trial 122 finished with value: 0.8311802395643327 and parameters: {'learning_rate': 0.0017297498247800434, 'weight_decay': 0.005, 'warmup_steps': 30}. Best is trial 12 with value: 0.8439702128779307.


Trial 123 with params: {'learning_rate': 0.0007194363668797047, 'weight_decay': 0.0, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3404,0.540316,0.799312,0.82089,0.801665,0.796691
2,0.1893,0.431184,0.834862,0.835999,0.834302,0.834514
3,0.1281,0.424872,0.838303,0.838891,0.83789,0.838071
4,0.0939,0.557869,0.848624,0.850734,0.847899,0.848164
5,0.0703,0.664992,0.826835,0.833296,0.825535,0.825544


[I 2025-03-23 06:01:33,200] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.0005295196802477818, 'weight_decay': 0.005, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3459,0.461919,0.817661,0.827848,0.819262,0.816704
2,0.2048,0.449473,0.825688,0.829608,0.824661,0.824802
3,0.145,0.460011,0.831422,0.832633,0.830839,0.831049
4,0.1098,0.551017,0.834862,0.838584,0.833881,0.834077
5,0.0843,0.643065,0.822248,0.826994,0.821114,0.821192
6,0.0668,0.709339,0.838303,0.839366,0.837764,0.837979
7,0.0521,0.781283,0.832569,0.83268,0.832344,0.832442
8,0.0413,0.819601,0.836009,0.836243,0.83627,0.836009
9,0.0323,0.841723,0.827982,0.830176,0.828724,0.827872
10,0.0249,0.880937,0.832569,0.832758,0.832807,0.832568


[I 2025-03-23 06:04:41,604] Trial 124 finished with value: 0.8244191325976666 and parameters: {'learning_rate': 0.0005295196802477818, 'weight_decay': 0.005, 'warmup_steps': 10}. Best is trial 12 with value: 0.8439702128779307.


Trial 125 with params: {'learning_rate': 0.00024634111538182466, 'weight_decay': 0.004, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3922,0.415893,0.808486,0.8179,0.810043,0.807544
2,0.2511,0.447275,0.823394,0.82835,0.82224,0.822314
3,0.1937,0.494768,0.831422,0.837994,0.830123,0.830166
4,0.1578,0.615815,0.81422,0.827463,0.812347,0.811643
5,0.1304,0.53722,0.823394,0.823387,0.823251,0.823302


[I 2025-03-23 06:05:45,922] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.0013550745741247334, 'weight_decay': 0.003, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3181,0.487067,0.822248,0.837517,0.824188,0.820777
2,0.1633,0.428448,0.841743,0.841749,0.841606,0.84166
3,0.1041,0.400308,0.845183,0.845206,0.845321,0.845174
4,0.0697,0.596388,0.840596,0.841671,0.840059,0.840277
5,0.0475,0.810201,0.832569,0.833207,0.832134,0.832314
6,0.0325,0.892946,0.823394,0.827279,0.822367,0.822497
7,0.023,0.878002,0.841743,0.842391,0.842153,0.841736
8,0.0157,1.225146,0.83945,0.839499,0.839606,0.839442
9,0.0113,1.263604,0.841743,0.841867,0.841521,0.841623
10,0.0081,1.377636,0.830275,0.83024,0.830344,0.830253


[I 2025-03-23 06:09:02,006] Trial 126 finished with value: 0.8346109469092289 and parameters: {'learning_rate': 0.0013550745741247334, 'weight_decay': 0.003, 'warmup_steps': 39}. Best is trial 12 with value: 0.8439702128779307.


Trial 127 with params: {'learning_rate': 0.0025126146734439857, 'weight_decay': 0.005, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2951,0.462668,0.831422,0.837957,0.832691,0.830931
2,0.1436,0.436087,0.84633,0.847343,0.845815,0.846038
3,0.0854,0.443172,0.844037,0.844451,0.843689,0.843852
4,0.0534,0.632009,0.831422,0.83245,0.830881,0.831084
5,0.034,0.767911,0.84289,0.84478,0.843574,0.842815
6,0.0214,1.069605,0.832569,0.833354,0.832091,0.832283
7,0.0158,1.176326,0.834862,0.835658,0.834386,0.83458
8,0.0112,1.246791,0.833716,0.834289,0.833302,0.833477
9,0.0076,1.120921,0.838303,0.838326,0.838438,0.838292
10,0.0061,1.510758,0.830275,0.832738,0.82946,0.82967


[I 2025-03-23 06:12:39,784] Trial 127 finished with value: 0.826668685126801 and parameters: {'learning_rate': 0.0025126146734439857, 'weight_decay': 0.005, 'warmup_steps': 17}. Best is trial 12 with value: 0.8439702128779307.


Trial 128 with params: {'learning_rate': 0.00024684882053097187, 'weight_decay': 0.002, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3954,0.411001,0.81078,0.818294,0.812168,0.810078
2,0.2514,0.451557,0.817661,0.823482,0.816399,0.816374
3,0.1938,0.493402,0.830275,0.83576,0.829081,0.829174
4,0.1584,0.615588,0.81422,0.826837,0.812389,0.811744
5,0.1308,0.537883,0.826835,0.826852,0.826671,0.826734


[I 2025-03-23 06:13:32,219] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.004590033179347377, 'weight_decay': 0.004, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.293,0.429381,0.837156,0.838289,0.83769,0.837125
2,0.1316,0.483044,0.83945,0.839505,0.839269,0.839347
3,0.0746,0.523429,0.833716,0.836639,0.832839,0.833051
4,0.0456,0.780629,0.826835,0.827523,0.826377,0.826555
5,0.0293,0.876849,0.827982,0.828162,0.827713,0.827829
6,0.0182,1.03069,0.841743,0.84169,0.84169,0.84169
7,0.0124,1.286557,0.832569,0.832534,0.832639,0.832547
8,0.0072,1.101607,0.818807,0.820958,0.819546,0.818692
9,0.0059,1.51902,0.819954,0.820362,0.819578,0.819726
10,0.004,1.826712,0.813073,0.813578,0.812653,0.812805


[I 2025-03-23 06:15:27,761] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.0008027687508333916, 'weight_decay': 0.01, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3347,0.535294,0.800459,0.820906,0.802749,0.798011
2,0.1842,0.414707,0.84633,0.847529,0.845773,0.846006
3,0.1236,0.410366,0.840596,0.840582,0.840479,0.840521
4,0.0889,0.565334,0.845183,0.84766,0.844395,0.844652
5,0.0661,0.650076,0.834862,0.839281,0.833796,0.833968
6,0.0495,0.768824,0.830275,0.833931,0.829292,0.829468
7,0.0366,0.822855,0.827982,0.828252,0.827671,0.827804
8,0.0264,0.926808,0.832569,0.832506,0.832555,0.832526
9,0.0194,1.06115,0.823394,0.823335,0.823335,0.823335
10,0.0142,1.129881,0.819954,0.819889,0.819915,0.819901


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 06:17:57,648] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.00148761649249654, 'weight_decay': 0.007, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3126,0.463392,0.826835,0.837273,0.828439,0.825926
2,0.1585,0.424175,0.847477,0.847738,0.847194,0.847331
3,0.1008,0.417536,0.848624,0.848635,0.848489,0.848544
4,0.0664,0.666603,0.827982,0.832285,0.826913,0.82705
5,0.0454,0.845947,0.833716,0.833696,0.833807,0.833698
6,0.0311,0.849784,0.829128,0.831997,0.82825,0.828446
7,0.0219,0.979258,0.823394,0.823493,0.823167,0.823261
8,0.0149,1.148203,0.825688,0.826961,0.825082,0.825283
9,0.0113,1.360717,0.830275,0.830464,0.830513,0.830274
10,0.0079,1.331283,0.821101,0.822753,0.820409,0.820602


[I 2025-03-23 06:19:52,840] Trial 131 pruned. 


Trial 132 with params: {'learning_rate': 0.0021805872602140793, 'weight_decay': 0.007, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3026,0.439126,0.826835,0.835663,0.828313,0.826092
2,0.1481,0.429421,0.850917,0.851779,0.850446,0.850663
3,0.0901,0.405304,0.848624,0.848673,0.848783,0.848617
4,0.0567,0.713183,0.826835,0.830281,0.825871,0.826038
5,0.0361,0.98824,0.830275,0.831058,0.830723,0.830261
6,0.0261,1.011199,0.840596,0.840844,0.840311,0.840443
7,0.0177,1.217946,0.834862,0.8348,0.834849,0.83482
8,0.0123,1.219348,0.832569,0.832569,0.832681,0.832555
9,0.0085,1.286368,0.827982,0.828031,0.828134,0.827974
10,0.0061,1.404556,0.834862,0.834807,0.834807,0.834807


[I 2025-03-23 06:23:46,392] Trial 132 finished with value: 0.8346666947559132 and parameters: {'learning_rate': 0.0021805872602140793, 'weight_decay': 0.007, 'warmup_steps': 29}. Best is trial 12 with value: 0.8439702128779307.


Trial 133 with params: {'learning_rate': 0.003529113577834755, 'weight_decay': 0.006, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.29,0.401511,0.829128,0.829277,0.829345,0.829126
2,0.1355,0.469524,0.83945,0.842292,0.838596,0.838831
3,0.0786,0.45989,0.84633,0.847529,0.845773,0.846006
4,0.0467,0.765529,0.84289,0.842921,0.842732,0.842799
5,0.0305,0.761589,0.83945,0.839505,0.839269,0.839347
6,0.0201,1.030944,0.827982,0.829081,0.827419,0.827619
7,0.0139,1.203632,0.836009,0.836723,0.836438,0.835999
8,0.0099,1.152681,0.827982,0.827925,0.828008,0.827949
9,0.0061,1.45786,0.817661,0.819624,0.816905,0.817083
10,0.005,1.463216,0.831422,0.831862,0.83105,0.831209


[I 2025-03-23 06:26:38,809] Trial 133 finished with value: 0.8312833411647641 and parameters: {'learning_rate': 0.003529113577834755, 'weight_decay': 0.006, 'warmup_steps': 27}. Best is trial 12 with value: 0.8439702128779307.


Trial 134 with params: {'learning_rate': 0.0017468192229391668, 'weight_decay': 0.005, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3065,0.459536,0.823394,0.830924,0.824766,0.822764
2,0.1524,0.432279,0.850917,0.851477,0.85053,0.850716
3,0.0957,0.400427,0.853211,0.853281,0.853035,0.853118
4,0.0618,0.62312,0.831422,0.832633,0.830839,0.831049
5,0.0414,0.897831,0.831422,0.83136,0.831428,0.831385
6,0.028,0.972699,0.838303,0.841586,0.837385,0.837609
7,0.0198,1.007172,0.844037,0.844007,0.843942,0.84397
8,0.0138,1.151716,0.840596,0.840536,0.840606,0.840561
9,0.0092,1.145036,0.834862,0.837107,0.834091,0.834318
10,0.0064,1.489747,0.832569,0.834089,0.831923,0.832141


[I 2025-03-23 06:30:57,937] Trial 134 finished with value: 0.8367773061611874 and parameters: {'learning_rate': 0.0017468192229391668, 'weight_decay': 0.005, 'warmup_steps': 20}. Best is trial 12 with value: 0.8439702128779307.


Trial 135 with params: {'learning_rate': 0.0006070964815669602, 'weight_decay': 0.006, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3435,0.51636,0.808486,0.825241,0.810548,0.806649
2,0.1974,0.443813,0.826835,0.829971,0.825914,0.826092
3,0.1366,0.440508,0.830275,0.830905,0.829839,0.830017
4,0.1018,0.536539,0.841743,0.842414,0.841311,0.841502
5,0.0775,0.648014,0.826835,0.831298,0.825745,0.825867


[I 2025-03-23 06:32:02,284] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.0048079002310287364, 'weight_decay': 0.007, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2875,0.445083,0.829128,0.831723,0.829934,0.828988
2,0.131,0.457469,0.83945,0.84043,0.838932,0.839144
3,0.0755,0.488669,0.833716,0.83694,0.832797,0.833002
4,0.046,0.682742,0.84633,0.84633,0.846447,0.846317
5,0.03,0.81328,0.822248,0.822395,0.822461,0.822246
6,0.0185,0.975441,0.833716,0.833865,0.833933,0.833714
7,0.0135,1.125826,0.833716,0.834289,0.833302,0.833477
8,0.0094,1.204839,0.829128,0.829066,0.829092,0.829078
9,0.0067,1.268059,0.824541,0.82462,0.824714,0.824536
10,0.0048,1.454214,0.827982,0.828603,0.827545,0.82772


[I 2025-03-23 06:35:47,122] Trial 136 finished with value: 0.8266157332606465 and parameters: {'learning_rate': 0.0048079002310287364, 'weight_decay': 0.007, 'warmup_steps': 15}. Best is trial 12 with value: 0.8439702128779307.


Trial 137 with params: {'learning_rate': 0.0016237812391404374, 'weight_decay': 0.003, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3025,0.486954,0.824541,0.833317,0.826019,0.823788
2,0.1548,0.426295,0.840596,0.840844,0.840311,0.840443
3,0.0976,0.420543,0.849771,0.849713,0.849741,0.849726
4,0.0637,0.650471,0.830275,0.831998,0.829587,0.829802
5,0.0422,0.828492,0.833716,0.833696,0.833807,0.833698
6,0.0291,1.003049,0.819954,0.823003,0.81903,0.819182
7,0.0193,1.245423,0.834862,0.836191,0.834259,0.834478
8,0.0137,1.234752,0.827982,0.828162,0.827713,0.827829
9,0.0099,1.25462,0.826835,0.826772,0.826798,0.826784
10,0.0077,1.440695,0.829128,0.829271,0.828881,0.828988


[I 2025-03-23 06:38:40,195] Trial 137 finished with value: 0.8245133591928057 and parameters: {'learning_rate': 0.0016237812391404374, 'weight_decay': 0.003, 'warmup_steps': 10}. Best is trial 12 with value: 0.8439702128779307.


Trial 138 with params: {'learning_rate': 0.0021376983326530396, 'weight_decay': 0.005, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3008,0.421848,0.833716,0.836921,0.834607,0.833531
2,0.1471,0.435964,0.844037,0.846108,0.843311,0.843563
3,0.0894,0.40651,0.853211,0.853186,0.853119,0.853148
4,0.0565,0.671884,0.840596,0.84186,0.840016,0.840243
5,0.0372,0.972198,0.829128,0.829207,0.829302,0.829123
6,0.0244,1.038253,0.834862,0.837372,0.834049,0.834273
7,0.0174,1.126402,0.836009,0.835962,0.836059,0.835983
8,0.0124,1.23567,0.836009,0.835992,0.835891,0.835931
9,0.0085,1.431354,0.833716,0.833739,0.833554,0.833619
10,0.0057,1.68454,0.829128,0.829686,0.828713,0.828883


[I 2025-03-23 06:42:09,003] Trial 138 finished with value: 0.8335788059108218 and parameters: {'learning_rate': 0.0021376983326530396, 'weight_decay': 0.005, 'warmup_steps': 22}. Best is trial 12 with value: 0.8439702128779307.


Trial 139 with params: {'learning_rate': 0.00030973211048442874, 'weight_decay': 0.002, 'warmup_steps': 16}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3749,0.407159,0.81422,0.820689,0.815505,0.813656
2,0.2369,0.485589,0.813073,0.8225,0.811474,0.811106
3,0.1777,0.490731,0.824541,0.828277,0.823535,0.823678
4,0.1416,0.592143,0.816514,0.824783,0.815021,0.814795
5,0.1144,0.554533,0.834862,0.834828,0.834765,0.834792
6,0.094,0.629539,0.838303,0.839366,0.837764,0.837979
7,0.0781,0.667039,0.841743,0.842275,0.841353,0.84153
8,0.067,0.665486,0.829128,0.831204,0.82985,0.829029
9,0.0564,0.742377,0.830275,0.833305,0.831144,0.8301
10,0.0476,0.817214,0.821101,0.826446,0.822262,0.820685


[I 2025-03-23 06:44:15,897] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.001572695033260391, 'weight_decay': 0.003, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3112,0.462724,0.825688,0.834237,0.827145,0.824966
2,0.1572,0.442119,0.84633,0.846542,0.846068,0.846194
3,0.0994,0.411426,0.844037,0.844097,0.843858,0.843937
4,0.0652,0.630457,0.829128,0.830732,0.82846,0.828672
5,0.0429,0.830678,0.834862,0.835503,0.83527,0.834855
6,0.0303,0.993326,0.825688,0.830688,0.824535,0.824622
7,0.0209,1.025397,0.829128,0.829081,0.829176,0.829101
8,0.0135,1.264525,0.831422,0.83136,0.831428,0.831385
9,0.0107,1.325968,0.833716,0.833795,0.833891,0.83371
10,0.0079,1.440378,0.831422,0.83283,0.830797,0.831011


[I 2025-03-23 06:47:47,138] Trial 140 finished with value: 0.834757204895381 and parameters: {'learning_rate': 0.001572695033260391, 'weight_decay': 0.003, 'warmup_steps': 27}. Best is trial 12 with value: 0.8439702128779307.


Trial 141 with params: {'learning_rate': 0.0014233218148361653, 'weight_decay': 0.003, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3133,0.470366,0.823394,0.839766,0.825398,0.821819
2,0.1607,0.439242,0.84633,0.847173,0.845858,0.846068
3,0.102,0.401054,0.841743,0.841682,0.841732,0.841702
4,0.0679,0.69261,0.840596,0.843028,0.839806,0.840049
5,0.0462,0.830946,0.840596,0.840684,0.840395,0.840485
6,0.0318,0.981834,0.833716,0.836639,0.832839,0.833051
7,0.022,1.187559,0.827982,0.828908,0.827461,0.827654
8,0.0152,1.126602,0.832569,0.835055,0.831755,0.831971
9,0.0114,1.244941,0.832569,0.833517,0.832049,0.83225
10,0.0077,1.45308,0.838303,0.839366,0.837764,0.837979


[I 2025-03-23 06:51:08,790] Trial 141 finished with value: 0.8391440693598967 and parameters: {'learning_rate': 0.0014233218148361653, 'weight_decay': 0.003, 'warmup_steps': 31}. Best is trial 12 with value: 0.8439702128779307.


Trial 142 with params: {'learning_rate': 0.0009942738534226394, 'weight_decay': 0.002, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3246,0.522607,0.808486,0.829015,0.810758,0.806186
2,0.1749,0.429951,0.841743,0.841947,0.841479,0.841602
3,0.1145,0.418269,0.841743,0.841801,0.841564,0.841642
4,0.08,0.579146,0.844037,0.844576,0.843647,0.843826
5,0.0575,0.737435,0.836009,0.838396,0.835217,0.835446
6,0.0418,0.780439,0.831422,0.833508,0.830671,0.830888
7,0.0304,0.931283,0.837156,0.837156,0.83727,0.837142
8,0.0209,1.027616,0.826835,0.826773,0.82684,0.826796
9,0.0152,1.053099,0.830275,0.830273,0.830134,0.830186
10,0.0112,1.219769,0.816514,0.817551,0.815947,0.816127


[I 2025-03-23 06:53:14,223] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 0.0014286107751462758, 'weight_decay': 0.004, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3142,0.474738,0.826835,0.839648,0.828608,0.825679
2,0.1612,0.429903,0.848624,0.848635,0.848489,0.848544
3,0.1024,0.405944,0.84633,0.84627,0.846321,0.846291
4,0.0683,0.605241,0.847477,0.848408,0.846984,0.847202
5,0.0463,0.824626,0.834862,0.834977,0.834638,0.834737
6,0.0323,0.910417,0.825688,0.828962,0.824745,0.824913
7,0.0223,1.019979,0.840596,0.840626,0.840437,0.840504
8,0.0159,1.136977,0.832569,0.832513,0.832597,0.832537
9,0.0122,1.149183,0.833716,0.833795,0.833512,0.8336
10,0.0086,1.28371,0.824541,0.826566,0.823788,0.823985


[I 2025-03-23 06:55:14,129] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 0.004209686152793855, 'weight_decay': 0.001, 'warmup_steps': 11}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2892,0.500946,0.834862,0.838231,0.835775,0.834667
2,0.1328,0.46388,0.827982,0.82947,0.827334,0.827543
3,0.0759,0.501258,0.84289,0.84364,0.842437,0.842636
4,0.0458,0.848627,0.833716,0.833865,0.83347,0.833579
5,0.029,0.840061,0.837156,0.837094,0.837143,0.837114
6,0.0195,1.03315,0.832569,0.832618,0.832723,0.832561
7,0.0125,1.234273,0.829128,0.829686,0.828713,0.828883
8,0.0084,1.126678,0.832569,0.832681,0.832765,0.832565
9,0.0052,1.592503,0.832569,0.832534,0.832639,0.832547
10,0.0044,1.508916,0.823394,0.823329,0.823377,0.823349


[I 2025-03-23 06:57:04,571] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 0.003908994973245309, 'weight_decay': 0.002, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.288,0.466799,0.831422,0.83461,0.832312,0.831235
2,0.1344,0.459011,0.83945,0.840807,0.838848,0.839076
3,0.0773,0.523137,0.848624,0.849177,0.848236,0.84842
4,0.0472,0.902859,0.824541,0.825713,0.823956,0.824153
5,0.0302,1.021416,0.832569,0.833207,0.832134,0.832314
6,0.0196,1.066399,0.833716,0.833949,0.833975,0.833715
7,0.0125,1.286802,0.834862,0.834806,0.834891,0.834831
8,0.0088,1.483808,0.841743,0.841749,0.841606,0.84166
9,0.007,1.482085,0.830275,0.830387,0.830471,0.830272
10,0.005,1.714401,0.824541,0.824478,0.824503,0.824489


[I 2025-03-23 06:59:32,684] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.0017477286814156078, 'weight_decay': 0.005, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.309,0.462647,0.827982,0.839313,0.82965,0.826991
2,0.1538,0.429111,0.84633,0.847529,0.845773,0.846006
3,0.0958,0.394694,0.84633,0.84627,0.846321,0.846291
4,0.0619,0.638074,0.834862,0.835999,0.834302,0.834514
5,0.0409,0.909871,0.832569,0.833207,0.832975,0.832561
6,0.0276,0.899434,0.833716,0.83494,0.833133,0.833347
7,0.0204,0.86918,0.837156,0.837273,0.836933,0.837033
8,0.013,1.133121,0.834862,0.834827,0.834933,0.834841
9,0.0096,1.421107,0.84289,0.842913,0.843026,0.84288
10,0.0081,1.302723,0.823394,0.825787,0.822577,0.822764


[I 2025-03-23 07:01:34,182] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 0.004057154390862344, 'weight_decay': 0.002, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2971,0.427049,0.831422,0.832285,0.831891,0.831404
2,0.1351,0.463628,0.836009,0.836348,0.83568,0.835828
3,0.0786,0.508233,0.845183,0.847943,0.844353,0.844609
4,0.0484,0.725339,0.836009,0.835963,0.835933,0.835947
5,0.0305,0.872331,0.830275,0.830219,0.830302,0.830243
6,0.0204,0.952129,0.829128,0.829148,0.828966,0.829029
7,0.0129,1.283517,0.816514,0.81647,0.81641,0.816436
8,0.0084,1.372944,0.829128,0.829148,0.828966,0.829029
9,0.0055,1.645547,0.831422,0.831445,0.831555,0.831411
10,0.0044,1.735432,0.816514,0.817385,0.815989,0.816165


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-23 07:03:31,826] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.0034625319192573842, 'weight_decay': 0.005, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2966,0.427054,0.834862,0.837631,0.835691,0.834715
2,0.138,0.492721,0.833716,0.835139,0.833091,0.83331
3,0.0804,0.455078,0.838303,0.838287,0.838185,0.838226
4,0.0487,0.697592,0.840596,0.840844,0.840311,0.840443
5,0.0323,0.904633,0.830275,0.830275,0.830386,0.830261
6,0.0216,1.046377,0.824541,0.826332,0.82383,0.82403
7,0.0158,1.165881,0.831422,0.831862,0.83105,0.831209
8,0.0093,1.210838,0.830275,0.83046,0.830008,0.830124
9,0.0066,1.427148,0.829128,0.829451,0.828797,0.828939
10,0.0043,1.802318,0.825688,0.825629,0.825629,0.825629


[I 2025-03-23 07:06:07,054] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.0012924272883759233, 'weight_decay': 0.005, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3141,0.523554,0.808486,0.826698,0.810632,0.80647
2,0.1645,0.435669,0.840596,0.840582,0.840479,0.840521
3,0.1062,0.405227,0.856651,0.856632,0.85675,0.856636
4,0.0719,0.602238,0.841743,0.843326,0.8411,0.841339
5,0.0499,0.770074,0.830275,0.830321,0.830092,0.830167
6,0.0346,0.831368,0.834862,0.836856,0.834133,0.83436
7,0.0245,0.922542,0.832569,0.832848,0.83226,0.832396
8,0.0161,1.222348,0.837156,0.837156,0.83727,0.837142
9,0.0126,1.178339,0.832569,0.832568,0.832428,0.832481
10,0.0094,1.475344,0.825688,0.826775,0.825124,0.825321


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 07:08:23,826] Trial 149 pruned. 


In [25]:
# Show the best run (a BestRun record: run id, objective, hyperparameters)
# selected by the preceding hyperparameter search.
print(best_trial)

BestRun(run_id='12', objective=0.8439702128779307, hyperparameters={'learning_rate': 0.002163019453168294, 'weight_decay': 0.006, 'warmup_steps': 41}, run_summary=None)


In [26]:
# Call the seed-reset helper from the local `base` module before configuring
# the next search (presumably re-seeds the RNGs so the second study does not
# depend on state left by the first one — confirm in `base`).
base.reset_seed()

In [27]:
# Training arguments used by every trial of the distillation search.
# Results and logs go to a run-specific folder under the dataset name.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-embedd_hp-search",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-embedd_hp-search",
    epochs=num_epochs,
    batch_size=batch_size,
    # Keep dataset columns the model's forward signature does not name
    # (presumably the stored teacher logits — TODO confirm in base.DistilTrainer).
    remove_unused_columns=False,
)

In [28]:
def hp_space(trial):
    """Draw one hyperparameter configuration for the distillation search.

    Args:
        trial: Optuna-style trial object exposing ``suggest_float``,
            ``suggest_int`` and ``number``.

    Returns:
        dict with the sampled values, in this key order:
        ``learning_rate`` (5e-5 to 5e-3, sampled on a log scale),
        ``weight_decay`` (0 to 1e-2 in steps of 1e-3),
        ``warmup_steps`` (integer from 0 up to the notebook-level ``warm_up``),
        ``lambda_param`` (0 to 1 in steps of 0.1) and
        ``temperature`` (2 to 7 in steps of 0.5).

    Side effects:
        Prints the trial number together with the sampled values.

    NOTE(review): ``lambda_param`` and ``temperature`` are presumably consumed
    by ``base.DistilTrainer`` — confirm there. This definition may also shadow
    an earlier ``hp_space`` from the first search; verify when re-running.
    """
    # The suggestions are requested in a fixed order so that a seeded sampler
    # reproduces the same sequence of configurations.
    learning_rate = trial.suggest_float("learning_rate", 5e-5, 5e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, warm_up)
    # Stepped floats can carry binary rounding noise (e.g. 0.6000000000000001
    # shows up in the trial logs); the values are passed through untouched.
    lambda_param = trial.suggest_float("lambda_param", 0, 1, step=.1)
    temperature = trial.suggest_float("temperature", 2, 7, step=.5)

    params = dict(
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        lambda_param=lambda_param,
        temperature=temperature,
    )
    print(f"Trial {trial.number} with params: {params}")
    return params

In [29]:
# Fresh pruner and sampler objects for the second study.
# Hyperband prunes trials between `min_r` and `max_r` units of resource
# (both defined earlier in the notebook), halving at each rung.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)
# Seeded multivariate TPE so the sequence of suggested configurations is repeatable.
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)



In [30]:
# Trainer for the distillation search. `model_init` is a zero-argument factory
# so a new BiLSTM is built through `get_BiLSTM()` each time the trainer asks
# for a model (presumably once per trial — confirm in base.DistilTrainer).
trainer = base.DistilTrainer(
    model_init=lambda: get_BiLSTM(),
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)
  

In [31]:
# Run the Optuna-backed search for the distillation setup: 150 trials,
# maximizing the `eval_f1` entry of the evaluation metrics. The pruner,
# sampler and study name are passed along as extra keyword arguments.
best_trial2 = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=lambda eval_metrics: eval_metrics["eval_f1"],
    n_trials=150,
    direction="maximize",
    backend="optuna",
    study_name="Distill-embedd",
    sampler=sampler,
    pruner=pruner,
)

[I 2025-03-23 07:08:24,142] A new study created in memory with name: Distill-embedd


Trial 0 with params: {'learning_rate': 0.0002805758207667253, 'weight_decay': 0.01, 'warmup_steps': 32, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7204,1.466995,0.823394,0.825325,0.824093,0.823302
2,0.924,1.401038,0.831422,0.83283,0.830797,0.831011
3,0.6483,1.526375,0.829128,0.837538,0.827661,0.827566
4,0.4929,1.503149,0.822248,0.823406,0.821662,0.821854
5,0.3825,1.508689,0.841743,0.841801,0.841564,0.841642


[I 2025-03-23 07:09:22,825] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.00010255552094216992, 'weight_decay': 0.0, 'warmup_steps': 38, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1704,1.556597,0.793578,0.794176,0.793972,0.793568
2,1.3026,1.481213,0.808486,0.808598,0.808232,0.808329
3,1.0972,1.426977,0.823394,0.823394,0.823503,0.82338
4,0.9546,1.505648,0.819954,0.823621,0.818946,0.819069
5,0.827,1.415868,0.826835,0.826858,0.826966,0.826824
6,0.7356,1.558581,0.817661,0.823973,0.818925,0.817129
7,0.6604,1.512005,0.831422,0.832832,0.832018,0.831372
8,0.6021,1.522306,0.822248,0.823597,0.82162,0.821814
9,0.5563,1.559684,0.824541,0.824962,0.824166,0.824319
10,0.518,1.645762,0.819954,0.823362,0.820883,0.819726


[I 2025-03-23 07:11:21,706] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 5.497167787383099e-05, 'weight_decay': 0.01, 'warmup_steps': 36, 'lambda_param': 0.2, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5042,1.673235,0.784404,0.786391,0.785131,0.784266
2,1.5477,1.554937,0.788991,0.788932,0.788878,0.788901
3,1.3454,1.495983,0.805046,0.804987,0.805064,0.805009
4,1.224,1.513532,0.81422,0.816039,0.813484,0.813656
5,1.1238,1.450882,0.811927,0.811891,0.81199,0.811902


[I 2025-03-23 07:12:32,545] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.00011635338541918901, 'weight_decay': 0.003, 'warmup_steps': 23, 'lambda_param': 0.4, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0801,1.529365,0.801606,0.804042,0.802402,0.801442
2,1.2703,1.435461,0.817661,0.817631,0.817536,0.817574
3,1.0421,1.412928,0.819954,0.82028,0.820251,0.819954
4,0.8969,1.607578,0.819954,0.825423,0.818736,0.818753
5,0.7665,1.408516,0.823394,0.823892,0.823756,0.823391
6,0.672,1.371974,0.827982,0.828093,0.828176,0.827978
7,0.6009,1.526793,0.821101,0.824068,0.821967,0.820916
8,0.5374,1.520734,0.832569,0.832849,0.832849,0.832569
9,0.4896,1.562374,0.826835,0.826858,0.826966,0.826824
10,0.4555,1.603018,0.821101,0.822583,0.821714,0.821041


[I 2025-03-23 07:14:41,781] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.0008369042894376068, 'weight_decay': 0.001, 'warmup_steps': 12, 'lambda_param': 0.4, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3414,1.656729,0.816514,0.82486,0.817968,0.815754
2,0.5833,1.262644,0.838303,0.838646,0.837975,0.838124
3,0.3498,1.128302,0.855505,0.856079,0.855119,0.85531
4,0.2343,1.215646,0.864679,0.86619,0.864086,0.864364
5,0.1706,1.339102,0.849771,0.852021,0.849025,0.849295
6,0.1299,1.198609,0.853211,0.854652,0.852614,0.85287
7,0.1034,1.241643,0.864679,0.864729,0.864844,0.864672
8,0.0826,1.193724,0.858945,0.858942,0.858834,0.858878
9,0.0699,1.214023,0.862385,0.862787,0.862718,0.862385
10,0.0598,1.211695,0.858945,0.858968,0.859087,0.858936


[I 2025-03-23 07:16:42,020] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.0018591820902866042, 'weight_decay': 0.002, 'warmup_steps': 22, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1858,1.293941,0.838303,0.841859,0.839238,0.838098
2,0.4454,1.086542,0.864679,0.865598,0.864212,0.864448
3,0.2435,1.091242,0.850917,0.852345,0.85032,0.850571
4,0.1591,1.200505,0.858945,0.859927,0.858455,0.858691
5,0.1126,1.338112,0.84633,0.84646,0.84611,0.846214
6,0.0849,1.254436,0.854358,0.855152,0.853909,0.854123
7,0.0654,1.239336,0.849771,0.849869,0.849573,0.849666
8,0.0528,1.242543,0.854358,0.854539,0.854119,0.854238
9,0.0448,1.196896,0.860092,0.860429,0.859792,0.859947
10,0.0391,1.248263,0.862385,0.862537,0.862171,0.862281


[I 2025-03-23 07:19:49,214] Trial 5 finished with value: 0.8575816487273915 and parameters: {'learning_rate': 0.0018591820902866042, 'weight_decay': 0.002, 'warmup_steps': 22, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}. Best is trial 5 with value: 0.8575816487273915.


Trial 6 with params: {'learning_rate': 0.0008204643365323959, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3413,1.695942,0.816514,0.828082,0.81822,0.815391
2,0.5909,1.219729,0.840596,0.840844,0.840311,0.840443
3,0.357,1.178524,0.860092,0.86024,0.859876,0.859986
4,0.2386,1.250844,0.844037,0.848609,0.842974,0.843192
5,0.1744,1.290923,0.855505,0.855732,0.855245,0.855376
6,0.1333,1.21941,0.855505,0.856079,0.855119,0.85531
7,0.106,1.204414,0.861239,0.861203,0.861171,0.861186
8,0.0848,1.192161,0.863532,0.863532,0.863423,0.863467
9,0.0721,1.199767,0.863532,0.863773,0.863802,0.863532
10,0.0607,1.236348,0.861239,0.861319,0.861423,0.861234


[I 2025-03-23 07:21:48,772] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 0.0020690200562805084, 'weight_decay': 0.003, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1454,1.204479,0.83945,0.841965,0.840237,0.839328
2,0.4251,1.108997,0.865826,0.866505,0.865423,0.865633
3,0.2298,1.175163,0.852064,0.853016,0.851572,0.851797
4,0.1461,1.295906,0.847477,0.848408,0.846984,0.847202
5,0.1056,1.284936,0.852064,0.852057,0.851951,0.851994
6,0.0791,1.201819,0.849771,0.851088,0.849194,0.849438
7,0.0617,1.23484,0.849771,0.849713,0.849741,0.849726
8,0.0487,1.227978,0.847477,0.847842,0.847152,0.847308
9,0.0427,1.24913,0.84633,0.846542,0.846068,0.846194
10,0.0374,1.248777,0.84633,0.846302,0.846236,0.846265


[I 2025-03-23 07:25:06,653] Trial 7 finished with value: 0.8484197218710493 and parameters: {'learning_rate': 0.0020690200562805084, 'weight_decay': 0.003, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}. Best is trial 5 with value: 0.8575816487273915.


Trial 8 with params: {'learning_rate': 8.770946743725407e-05, 'weight_decay': 0.005, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2141,1.611492,0.790138,0.792267,0.790888,0.789992
2,1.3759,1.52418,0.805046,0.805585,0.804601,0.804749
3,1.1663,1.449557,0.817661,0.818094,0.817999,0.817658
4,1.0349,1.507953,0.815367,0.818357,0.814442,0.814575
5,0.9098,1.409543,0.825688,0.825729,0.825503,0.825577
6,0.8215,1.439035,0.829128,0.8323,0.830018,0.828939
7,0.7546,1.413005,0.831422,0.831403,0.831513,0.831404
8,0.689,1.422467,0.834862,0.834827,0.834933,0.834841
9,0.6421,1.472034,0.830275,0.830239,0.830176,0.830203
10,0.5979,1.555597,0.815367,0.818738,0.816294,0.815133


[I 2025-03-23 07:27:54,023] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.0010568529720322872, 'weight_decay': 0.003, 'warmup_steps': 22, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3121,1.585586,0.827982,0.832652,0.82906,0.827654
2,0.5418,1.164149,0.856651,0.856614,0.856582,0.856597
3,0.3103,1.100168,0.857798,0.85803,0.85754,0.857672
4,0.2029,1.152255,0.863532,0.865385,0.862876,0.863168
5,0.1472,1.348176,0.847477,0.851557,0.846478,0.846727


[I 2025-03-23 07:29:25,998] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.003553256925699131, 'weight_decay': 0.003, 'warmup_steps': 26, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1223,1.310164,0.855505,0.858399,0.85634,0.855376
2,0.387,1.177439,0.853211,0.853778,0.852825,0.853013
3,0.2062,1.215435,0.849771,0.85258,0.848941,0.849214
4,0.1288,1.253868,0.855505,0.855554,0.855666,0.855498
5,0.088,1.23094,0.857798,0.857944,0.857582,0.85769
6,0.0653,1.246242,0.862385,0.862464,0.862213,0.862298
7,0.0529,1.28455,0.856651,0.856836,0.856414,0.856533
8,0.0423,1.235547,0.857798,0.857744,0.857834,0.857771
9,0.0365,1.224162,0.856651,0.856993,0.856961,0.856651
10,0.0331,1.216101,0.858945,0.859025,0.859129,0.85894


[I 2025-03-23 07:34:05,770] Trial 10 finished with value: 0.8588913227951769 and parameters: {'learning_rate': 0.003553256925699131, 'weight_decay': 0.003, 'warmup_steps': 26, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 11 with params: {'learning_rate': 0.0036979694616670403, 'weight_decay': 0.006, 'warmup_steps': 37, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1346,1.126188,0.865826,0.86834,0.866601,0.865732
2,0.392,1.163341,0.862385,0.862328,0.862381,0.86235
3,0.2088,1.16516,0.854358,0.856144,0.853698,0.853969
4,0.1319,1.190722,0.856651,0.856614,0.856582,0.856597
5,0.0907,1.289912,0.84289,0.842871,0.842984,0.842873


[I 2025-03-23 07:35:27,968] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.0044803639948611095, 'weight_decay': 0.001, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0842,1.213853,0.860092,0.861086,0.860592,0.860073
2,0.3732,1.227322,0.840596,0.843028,0.839806,0.840049
3,0.2006,1.226586,0.854358,0.854462,0.854161,0.854256
4,0.1294,1.275028,0.855505,0.855577,0.85533,0.855413
5,0.0882,1.314433,0.848624,0.848564,0.848615,0.848585
6,0.0656,1.302237,0.849771,0.849945,0.849531,0.849647
7,0.051,1.327103,0.854358,0.854302,0.85433,0.854315
8,0.043,1.320046,0.848624,0.848597,0.848531,0.848559
9,0.0376,1.33072,0.853211,0.853281,0.853035,0.853118
10,0.0334,1.338194,0.853211,0.853923,0.852783,0.852988


[I 2025-03-23 07:40:15,549] Trial 12 finished with value: 0.853059505002633 and parameters: {'learning_rate': 0.0044803639948611095, 'weight_decay': 0.001, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 13 with params: {'learning_rate': 0.002518208951412107, 'weight_decay': 0.0, 'warmup_steps': 14, 'lambda_param': 0.5, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1353,1.16012,0.84633,0.847889,0.846952,0.846279
2,0.4033,1.181269,0.853211,0.854082,0.852741,0.85296
3,0.214,1.245919,0.84633,0.847343,0.845815,0.846038
4,0.1409,1.269796,0.848624,0.84905,0.848278,0.848444
5,0.0985,1.272851,0.844037,0.844002,0.84411,0.844016


[I 2025-03-23 07:41:47,856] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.0035985903311758468, 'weight_decay': 0.007, 'warmup_steps': 15, 'lambda_param': 0.4, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1001,1.190646,0.844037,0.847161,0.84491,0.843876
2,0.3835,1.162091,0.861239,0.863075,0.860581,0.860868
3,0.2046,1.261904,0.854358,0.855152,0.853909,0.854123
4,0.1292,1.244255,0.849771,0.849713,0.849741,0.849726
5,0.0883,1.321573,0.848624,0.848573,0.848573,0.848573
6,0.0666,1.266318,0.847477,0.847648,0.847236,0.847352
7,0.0533,1.323049,0.849771,0.850712,0.849278,0.8495
8,0.0437,1.262531,0.856651,0.856929,0.856372,0.856514
9,0.0372,1.253078,0.856651,0.856929,0.856372,0.856514
10,0.0324,1.286234,0.857798,0.85803,0.85754,0.857672


[I 2025-03-23 07:46:27,973] Trial 14 finished with value: 0.8506904381819427 and parameters: {'learning_rate': 0.0035985903311758468, 'weight_decay': 0.007, 'warmup_steps': 15, 'lambda_param': 0.4, 'temperature': 2.5}. Best is trial 10 with value: 0.8588913227951769.


Trial 15 with params: {'learning_rate': 0.002356648803391792, 'weight_decay': 0.003, 'warmup_steps': 36, 'lambda_param': 0.5, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1908,1.329009,0.841743,0.845493,0.8427,0.84153
2,0.4152,1.156078,0.852064,0.854077,0.851362,0.851633
3,0.222,1.217029,0.850917,0.851236,0.850615,0.850764
4,0.1417,1.282282,0.848624,0.849648,0.84811,0.848336
5,0.1002,1.335825,0.84289,0.842913,0.843026,0.84288


[I 2025-03-23 07:48:02,827] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.0025370794193732664, 'weight_decay': 0.005, 'warmup_steps': 11, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1143,1.158891,0.847477,0.850178,0.848289,0.847352
2,0.405,1.139027,0.856651,0.856605,0.856708,0.856629
3,0.2204,1.173192,0.853211,0.85536,0.852488,0.852765
4,0.139,1.18152,0.855505,0.855456,0.855456,0.855456
5,0.0989,1.254022,0.858945,0.858991,0.858792,0.858863
6,0.0733,1.184414,0.854358,0.854539,0.854119,0.854238
7,0.0583,1.218799,0.853211,0.853281,0.853035,0.853118
8,0.0473,1.219139,0.852064,0.852104,0.851909,0.851978
9,0.0406,1.225531,0.860092,0.860092,0.860213,0.86008
10,0.0356,1.207763,0.855505,0.85547,0.855582,0.855486


[I 2025-03-23 07:52:49,269] Trial 16 finished with value: 0.8587887716692801 and parameters: {'learning_rate': 0.0025370794193732664, 'weight_decay': 0.005, 'warmup_steps': 11, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 17 with params: {'learning_rate': 0.0013436998429840659, 'weight_decay': 0.003, 'warmup_steps': 5, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2291,1.659138,0.821101,0.83166,0.822725,0.820132
2,0.4947,1.166872,0.856651,0.856891,0.856919,0.856651
3,0.2795,1.12942,0.853211,0.853649,0.852867,0.853037
4,0.1812,1.193712,0.856651,0.858701,0.855951,0.856234
5,0.13,1.360463,0.840596,0.842516,0.83989,0.840132


[I 2025-03-23 07:54:20,529] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.00018235753455668626, 'weight_decay': 0.006, 'warmup_steps': 9, 'lambda_param': 0.1, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8654,1.643195,0.78211,0.793681,0.783889,0.780614
2,1.1036,1.404334,0.827982,0.827946,0.82805,0.827959
3,0.8412,1.430626,0.829128,0.831997,0.82825,0.828446
4,0.6818,1.539716,0.823394,0.826638,0.822451,0.82261
5,0.5596,1.504901,0.831422,0.831501,0.831597,0.831416


[I 2025-03-23 07:55:58,824] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.003959562123806803, 'weight_decay': 0.006, 'warmup_steps': 17, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0922,1.216713,0.853211,0.855801,0.854004,0.8531
2,0.3807,1.155475,0.855505,0.855948,0.855161,0.855333
3,0.2025,1.164805,0.860092,0.860045,0.860045,0.860045
4,0.1312,1.254271,0.853211,0.853186,0.853119,0.853148
5,0.0918,1.249395,0.862385,0.862351,0.862465,0.862367
6,0.0661,1.262174,0.855505,0.855446,0.855498,0.855467
7,0.052,1.283097,0.853211,0.853186,0.853119,0.853148
8,0.0438,1.319267,0.850917,0.851137,0.850657,0.850785
9,0.0376,1.278252,0.857798,0.857817,0.857666,0.857723
10,0.033,1.249306,0.857798,0.857744,0.857834,0.857771


[I 2025-03-23 08:01:32,599] Trial 19 finished with value: 0.8588095911960034 and parameters: {'learning_rate': 0.003959562123806803, 'weight_decay': 0.006, 'warmup_steps': 17, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 20 with params: {'learning_rate': 0.00014766637242423952, 'weight_decay': 0.008, 'warmup_steps': 30, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9904,1.51194,0.792431,0.795337,0.793308,0.792201
2,1.1779,1.400817,0.822248,0.822478,0.822504,0.822247
3,0.9278,1.378358,0.825688,0.826603,0.825166,0.825356
4,0.7681,1.798867,0.805046,0.819106,0.803086,0.802122
5,0.6469,1.435509,0.830275,0.830321,0.830092,0.830167


[I 2025-03-23 08:02:58,810] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.0009880924655514415, 'weight_decay': 0.008, 'warmup_steps': 23, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3258,1.55999,0.815367,0.825492,0.816968,0.814398
2,0.5518,1.173956,0.853211,0.853923,0.852783,0.852988
3,0.3247,1.085034,0.856651,0.857454,0.856203,0.85642
4,0.2154,1.259623,0.850917,0.855605,0.849857,0.85011
5,0.1561,1.362766,0.848624,0.852901,0.847605,0.847855
6,0.1177,1.191028,0.862385,0.864613,0.861665,0.861967
7,0.0934,1.187148,0.861239,0.861525,0.86096,0.861105
8,0.0734,1.185152,0.860092,0.860547,0.85975,0.859926
9,0.0616,1.217729,0.857798,0.858247,0.857456,0.85763
10,0.0516,1.264669,0.856651,0.856614,0.856582,0.856597


[I 2025-03-23 08:07:27,547] Trial 21 finished with value: 0.8565138121910605 and parameters: {'learning_rate': 0.0009880924655514415, 'weight_decay': 0.008, 'warmup_steps': 23, 'lambda_param': 0.2, 'temperature': 2.5}. Best is trial 10 with value: 0.8588913227951769.


Trial 22 with params: {'learning_rate': 0.0025799268433007493, 'weight_decay': 0.006, 'warmup_steps': 14, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1392,1.216661,0.841743,0.843512,0.842405,0.841676
2,0.4077,1.13481,0.860092,0.861801,0.859455,0.859735
3,0.2204,1.193014,0.849771,0.851765,0.849068,0.849333
4,0.1386,1.265046,0.84289,0.843977,0.842353,0.842575
5,0.0991,1.215926,0.849771,0.849808,0.849615,0.849683
6,0.0732,1.183252,0.849771,0.849945,0.849531,0.849647
7,0.0564,1.260515,0.847477,0.847419,0.847447,0.847432
8,0.0457,1.259254,0.850917,0.851236,0.850615,0.850764
9,0.0399,1.225143,0.850917,0.851054,0.850699,0.850804
10,0.0344,1.23942,0.847477,0.847573,0.847278,0.847371


[I 2025-03-23 08:10:30,587] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.0029292232326761126, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.107,1.20235,0.852064,0.857204,0.853172,0.851768
2,0.3898,1.291624,0.84633,0.847017,0.8459,0.846096
3,0.2146,1.257784,0.840596,0.841337,0.840143,0.840339
4,0.1363,1.300483,0.849771,0.851088,0.849194,0.849438
5,0.0939,1.337006,0.848624,0.850487,0.847941,0.848201
6,0.0697,1.331185,0.83945,0.843945,0.838385,0.83858
7,0.056,1.27708,0.850917,0.852345,0.85032,0.850571
8,0.0456,1.326486,0.854358,0.855915,0.85374,0.854003
9,0.0401,1.290226,0.850917,0.851621,0.850488,0.85069
10,0.0348,1.242684,0.854358,0.85532,0.853867,0.854095


[I 2025-03-23 08:13:52,882] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.0011061598255246836, 'weight_decay': 0.005, 'warmup_steps': 30, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3188,1.600623,0.822248,0.829991,0.82364,0.821589
2,0.53,1.149382,0.855505,0.856224,0.855077,0.855285
3,0.3062,1.246419,0.84633,0.851729,0.845184,0.84539
4,0.2027,1.301929,0.848624,0.853273,0.847563,0.847804
5,0.1456,1.3749,0.850917,0.853312,0.850152,0.850426
6,0.1107,1.234812,0.858945,0.860315,0.858371,0.858632
7,0.086,1.214225,0.856651,0.857038,0.856329,0.856493
8,0.0683,1.243844,0.855505,0.856561,0.854993,0.85523
9,0.0572,1.255478,0.854358,0.854999,0.853951,0.854149
10,0.0478,1.271204,0.855505,0.855521,0.855372,0.855429


[I 2025-03-23 08:17:01,795] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.0017086411811106689, 'weight_decay': 0.003, 'warmup_steps': 32, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2334,1.372722,0.829128,0.838009,0.830607,0.828395
2,0.4637,1.117345,0.858945,0.859461,0.858582,0.858766
3,0.2517,1.091624,0.860092,0.860429,0.859792,0.859947
4,0.1594,1.235695,0.854358,0.856649,0.853614,0.853896
5,0.1155,1.403519,0.841743,0.842414,0.841311,0.841502
6,0.085,1.249028,0.847477,0.848587,0.846942,0.847171
7,0.0657,1.281118,0.850917,0.851952,0.850404,0.850634
8,0.0532,1.293098,0.855505,0.855948,0.855161,0.855333
9,0.0454,1.258321,0.852064,0.852242,0.851825,0.851943
10,0.0393,1.281403,0.849771,0.850036,0.849489,0.849626


[I 2025-03-23 08:20:14,832] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.0025762288174464368, 'weight_decay': 0.007, 'warmup_steps': 14, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1307,1.203398,0.850917,0.852491,0.851541,0.850867
2,0.4009,1.166292,0.858945,0.859461,0.858582,0.858766
3,0.2181,1.112515,0.853211,0.853649,0.852867,0.853037
4,0.1384,1.112009,0.855505,0.855456,0.855456,0.855456
5,0.0972,1.219087,0.844037,0.844321,0.844321,0.844037
6,0.071,1.228306,0.852064,0.853016,0.851572,0.851797
7,0.0559,1.192952,0.849771,0.849713,0.849741,0.849726
8,0.0459,1.217264,0.84633,0.846295,0.846405,0.84631
9,0.0398,1.186,0.853211,0.853157,0.853246,0.853183
10,0.0355,1.22023,0.848624,0.848569,0.848657,0.848595


[I 2025-03-23 08:23:10,153] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.004456983089178604, 'weight_decay': 0.004, 'warmup_steps': 13, 'lambda_param': 0.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0907,1.223238,0.844037,0.844321,0.844321,0.844037
2,0.3783,1.247773,0.838303,0.841908,0.837343,0.837559
3,0.2037,1.135981,0.869266,0.869213,0.869306,0.869241
4,0.1302,1.269475,0.849771,0.850036,0.849489,0.849626
5,0.0875,1.20968,0.862385,0.8625,0.862592,0.862382
6,0.0645,1.211968,0.864679,0.86476,0.864507,0.864593
7,0.051,1.278599,0.858945,0.859227,0.858666,0.85881
8,0.0417,1.243005,0.863532,0.863582,0.863381,0.863453
9,0.0367,1.251719,0.858945,0.858942,0.858834,0.858878
10,0.0325,1.270232,0.857798,0.857817,0.857666,0.857723


[I 2025-03-23 08:27:44,124] Trial 27 finished with value: 0.8553760128891628 and parameters: {'learning_rate': 0.004456983089178604, 'weight_decay': 0.004, 'warmup_steps': 13, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 28 with params: {'learning_rate': 0.0038987418546817293, 'weight_decay': 0.004, 'warmup_steps': 24, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0986,1.10377,0.858945,0.859185,0.859213,0.858945
2,0.3776,1.149727,0.852064,0.854898,0.851236,0.851516
3,0.2064,1.202835,0.858945,0.859227,0.858666,0.85881
4,0.1332,1.223673,0.856651,0.856731,0.856835,0.856647
5,0.0894,1.263131,0.862385,0.862787,0.862718,0.862385
6,0.066,1.206064,0.860092,0.862042,0.859413,0.859701
7,0.0522,1.16265,0.856651,0.856647,0.85654,0.856583
8,0.0433,1.181653,0.850917,0.851621,0.850488,0.85069
9,0.0371,1.175436,0.857798,0.857944,0.857582,0.85769
10,0.0329,1.166067,0.853211,0.853176,0.853288,0.853192


[I 2025-03-23 08:30:35,242] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 0.0037867653604961434, 'weight_decay': 0.008, 'warmup_steps': 28, 'lambda_param': 0.4, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1242,1.19344,0.854358,0.856814,0.85513,0.854256
2,0.3816,1.172604,0.854358,0.854739,0.854035,0.854197
3,0.2049,1.150996,0.858945,0.860315,0.858371,0.858632
4,0.1318,1.250189,0.858945,0.859133,0.858708,0.858829
5,0.0904,1.275151,0.852064,0.852242,0.851825,0.851943
6,0.0666,1.217622,0.865826,0.865792,0.865759,0.865775
7,0.054,1.325665,0.850917,0.852079,0.851457,0.850889
8,0.0444,1.283359,0.856651,0.856605,0.856708,0.856629
9,0.0378,1.28472,0.864679,0.865026,0.864381,0.864539
10,0.0342,1.247896,0.857798,0.85775,0.85775,0.85775


[I 2025-03-23 08:35:02,677] Trial 29 finished with value: 0.8542732810223925 and parameters: {'learning_rate': 0.0037867653604961434, 'weight_decay': 0.008, 'warmup_steps': 28, 'lambda_param': 0.4, 'temperature': 5.5}. Best is trial 10 with value: 0.8588913227951769.


Trial 30 with params: {'learning_rate': 0.0033311455861838265, 'weight_decay': 0.009000000000000001, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1255,1.177209,0.848624,0.849598,0.84912,0.848604
2,0.3892,1.218902,0.853211,0.854873,0.852572,0.852836
3,0.2086,1.144689,0.853211,0.854257,0.852698,0.852932
4,0.1341,1.169392,0.855505,0.85545,0.85554,0.855477
5,0.0922,1.252638,0.849771,0.849945,0.849531,0.849647
6,0.0702,1.20698,0.853211,0.853162,0.853162,0.853162
7,0.0532,1.231143,0.848624,0.848689,0.848447,0.848527
8,0.0442,1.17745,0.852064,0.852104,0.851909,0.851978
9,0.038,1.199488,0.849771,0.84973,0.849699,0.849714
10,0.0337,1.189343,0.849771,0.849762,0.849657,0.849699


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-23 08:38:06,411] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.0019406507386717218, 'weight_decay': 0.002, 'warmup_steps': 27, 'lambda_param': 0.7000000000000001, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.193,1.325188,0.847477,0.853385,0.848668,0.847105
2,0.4442,1.129668,0.847477,0.850878,0.846563,0.846823
3,0.2407,1.100276,0.853211,0.853778,0.852825,0.853013
4,0.1545,1.236962,0.852064,0.853197,0.85153,0.851768
5,0.1096,1.316231,0.83945,0.839388,0.839438,0.839408
6,0.0795,1.261182,0.84289,0.843801,0.842395,0.842607
7,0.0631,1.274644,0.852064,0.852104,0.851909,0.851978
8,0.0521,1.247701,0.84633,0.846639,0.846026,0.846172
9,0.0443,1.263134,0.850917,0.851054,0.850699,0.850804
10,0.0378,1.270417,0.850917,0.851054,0.850699,0.850804


[I 2025-03-23 08:40:43,319] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.002601722558278549, 'weight_decay': 0.003, 'warmup_steps': 23, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.157,1.237568,0.848624,0.849978,0.849204,0.848585
2,0.4099,1.102458,0.861239,0.861181,0.861255,0.861208
3,0.2247,1.167241,0.853211,0.854257,0.852698,0.852932
4,0.1445,1.203853,0.853211,0.853649,0.852867,0.853037
5,0.0977,1.309957,0.848624,0.848689,0.848447,0.848527
6,0.0741,1.248778,0.852064,0.853016,0.851572,0.851797
7,0.0585,1.330077,0.848624,0.84884,0.848362,0.848489
8,0.048,1.28113,0.849771,0.849869,0.849573,0.849666
9,0.0416,1.287093,0.847477,0.848095,0.847068,0.847258
10,0.036,1.235453,0.849771,0.849945,0.849531,0.849647


[I 2025-03-23 08:43:28,030] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 0.0023743307676572525, 'weight_decay': 0.002, 'warmup_steps': 25, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1582,1.178989,0.853211,0.854791,0.853835,0.853162
2,0.4143,1.131779,0.862385,0.863475,0.861876,0.862123
3,0.2251,1.134851,0.855505,0.85742,0.854824,0.855101
4,0.1421,1.22034,0.84633,0.846639,0.846026,0.846172
5,0.1027,1.227965,0.852064,0.853016,0.851572,0.851797
6,0.0754,1.245177,0.849771,0.850547,0.84932,0.849528
7,0.0596,1.235344,0.845183,0.845543,0.844858,0.845012
8,0.0485,1.236025,0.850917,0.850931,0.850783,0.850839
9,0.0413,1.236525,0.849771,0.850712,0.849278,0.8495
10,0.0357,1.194853,0.850917,0.851137,0.850657,0.850785


[I 2025-03-23 08:47:01,898] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.0007129036305893922, 'weight_decay': 0.007, 'warmup_steps': 11, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3773,1.619035,0.817661,0.823155,0.818841,0.817216
2,0.6327,1.293682,0.838303,0.839973,0.837638,0.837871
3,0.3867,1.276644,0.845183,0.846903,0.844521,0.84477
4,0.2627,1.29804,0.848624,0.852544,0.847647,0.847904
5,0.1937,1.371747,0.848624,0.850487,0.847941,0.848201
6,0.148,1.20775,0.860092,0.86068,0.859708,0.859903
7,0.1172,1.269859,0.860092,0.860071,0.860003,0.860032
8,0.0973,1.213215,0.862385,0.862537,0.862171,0.862281
9,0.0809,1.240756,0.860092,0.860141,0.860255,0.860085
10,0.0672,1.269122,0.858945,0.858887,0.858961,0.858914


[I 2025-03-23 08:51:43,846] Trial 34 finished with value: 0.8542564041823769 and parameters: {'learning_rate': 0.0007129036305893922, 'weight_decay': 0.007, 'warmup_steps': 11, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 35 with params: {'learning_rate': 5.817102176211476e-05, 'weight_decay': 0.0, 'warmup_steps': 14, 'lambda_param': 0.8, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4567,1.677524,0.78555,0.788144,0.786383,0.785345
2,1.5343,1.554652,0.787844,0.787826,0.787667,0.787721
3,1.3255,1.486753,0.797018,0.796999,0.797097,0.796997
4,1.204,1.571057,0.811927,0.817033,0.810727,0.810707
5,1.0946,1.429336,0.817661,0.817631,0.817536,0.817574
6,1.0133,1.456496,0.81422,0.816599,0.815,0.814079
7,0.945,1.434617,0.822248,0.822478,0.822504,0.822247
8,0.8863,1.436462,0.825688,0.825688,0.825798,0.825673
9,0.8434,1.443991,0.818807,0.818746,0.818746,0.818746
10,0.8034,1.463854,0.827982,0.831298,0.828892,0.827778


[I 2025-03-23 08:54:49,140] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.001860030226418261, 'weight_decay': 0.005, 'warmup_steps': 15, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1834,1.258543,0.852064,0.856076,0.853046,0.851852
2,0.4376,1.090496,0.856651,0.858008,0.856077,0.856334
3,0.242,1.137642,0.853211,0.853281,0.853035,0.853118
4,0.1574,1.320505,0.840596,0.845299,0.839511,0.839706
5,0.1151,1.343454,0.841743,0.843114,0.841143,0.841375


[I 2025-03-23 08:56:25,544] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.0003114584293983801, 'weight_decay': 0.002, 'warmup_steps': 9, 'lambda_param': 0.8, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6482,1.484932,0.817661,0.820749,0.818546,0.817459
2,0.8938,1.348865,0.831422,0.833268,0.830713,0.830931
3,0.6178,1.374107,0.840596,0.843909,0.83968,0.839912
4,0.4641,1.484744,0.838303,0.840988,0.837469,0.837703
5,0.358,1.501119,0.837156,0.837101,0.837101,0.837101
6,0.288,1.391845,0.845183,0.845172,0.845068,0.84511
7,0.2395,1.421734,0.844037,0.844097,0.843858,0.843937
8,0.201,1.488744,0.84633,0.847482,0.846868,0.846301
9,0.1747,1.486342,0.847477,0.848728,0.848036,0.847443
10,0.1512,1.516123,0.84633,0.846983,0.846742,0.846323


[I 2025-03-23 09:00:02,808] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.00014198795619548116, 'weight_decay': 0.005, 'warmup_steps': 28, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9838,1.514088,0.799312,0.801735,0.800107,0.799147
2,1.1955,1.409691,0.823394,0.824168,0.82384,0.82338
3,0.9497,1.384223,0.822248,0.822185,0.822251,0.822208
4,0.7977,1.772647,0.809633,0.820852,0.807885,0.807297
5,0.6739,1.433906,0.822248,0.822575,0.822546,0.822247
6,0.5813,1.476405,0.821101,0.821288,0.821335,0.8211
7,0.5145,1.514843,0.826835,0.827164,0.827134,0.826835
8,0.4556,1.621107,0.817661,0.818502,0.818125,0.817641
9,0.409,1.624586,0.822248,0.822326,0.822419,0.822242
10,0.3769,1.657761,0.815367,0.816935,0.815999,0.815297


[I 2025-03-23 09:03:11,241] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.001395039612162253, 'weight_decay': 0.001, 'warmup_steps': 21, 'lambda_param': 0.2, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2605,1.565378,0.824541,0.830965,0.825808,0.82403
2,0.4853,1.119411,0.853211,0.853211,0.85333,0.853199
3,0.2731,1.132196,0.855505,0.855481,0.855414,0.855443
4,0.1776,1.218809,0.847477,0.850878,0.846563,0.846823
5,0.129,1.332315,0.84633,0.847529,0.845773,0.846006
6,0.0962,1.216446,0.850917,0.851236,0.850615,0.850764
7,0.0751,1.28137,0.852064,0.852087,0.852204,0.852055
8,0.059,1.289442,0.855505,0.855832,0.855203,0.855355
9,0.0503,1.248817,0.854358,0.854861,0.853993,0.854173
10,0.0441,1.271452,0.855505,0.855446,0.855498,0.855467


[I 2025-03-23 09:07:35,299] Trial 39 finished with value: 0.8541965366016144 and parameters: {'learning_rate': 0.001395039612162253, 'weight_decay': 0.001, 'warmup_steps': 21, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 40 with params: {'learning_rate': 0.002301313995834585, 'weight_decay': 0.007, 'warmup_steps': 17, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1599,1.306943,0.84289,0.84682,0.843868,0.842665
2,0.4181,1.104205,0.862385,0.863475,0.861876,0.862123
3,0.226,1.125571,0.863532,0.863582,0.863381,0.863453
4,0.1408,1.337537,0.844037,0.849391,0.84289,0.843082
5,0.1002,1.27631,0.852064,0.852087,0.852204,0.852055
6,0.0759,1.215997,0.854358,0.854739,0.854035,0.854197
7,0.0594,1.220696,0.861239,0.861184,0.861213,0.861197
8,0.0491,1.203876,0.860092,0.860429,0.859792,0.859947
9,0.0421,1.210712,0.856651,0.856929,0.856372,0.856514
10,0.0366,1.202406,0.862385,0.862464,0.862213,0.862298


[I 2025-03-23 09:12:11,678] Trial 40 finished with value: 0.8517364502315135 and parameters: {'learning_rate': 0.002301313995834585, 'weight_decay': 0.007, 'warmup_steps': 17, 'lambda_param': 1.0, 'temperature': 4.5}. Best is trial 10 with value: 0.8588913227951769.


Trial 41 with params: {'learning_rate': 0.0006612908305141491, 'weight_decay': 0.01, 'warmup_steps': 19, 'lambda_param': 0.5, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4373,1.542556,0.821101,0.825001,0.822093,0.820829
2,0.6529,1.274232,0.847477,0.849452,0.846773,0.847033
3,0.401,1.255954,0.854358,0.854861,0.853993,0.854173
4,0.2705,1.238005,0.854358,0.855915,0.85374,0.854003
5,0.1995,1.332984,0.861239,0.862621,0.860666,0.860931
6,0.1556,1.23488,0.856651,0.856929,0.856372,0.856514
7,0.1248,1.320666,0.861239,0.861219,0.861339,0.861224
8,0.1017,1.263387,0.865826,0.865943,0.865633,0.865732
9,0.0862,1.270815,0.863532,0.863479,0.863507,0.863492
10,0.0727,1.320751,0.855505,0.855698,0.855751,0.855504


[I 2025-03-23 09:16:37,710] Trial 41 finished with value: 0.8565515789196623 and parameters: {'learning_rate': 0.0006612908305141491, 'weight_decay': 0.01, 'warmup_steps': 19, 'lambda_param': 0.5, 'temperature': 3.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 42 with params: {'learning_rate': 0.0013626432701280583, 'weight_decay': 0.009000000000000001, 'warmup_steps': 14, 'lambda_param': 0.8, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2347,1.517687,0.825688,0.833262,0.827061,0.825066
2,0.4863,1.125127,0.858945,0.859133,0.858708,0.858829
3,0.2741,1.071959,0.856651,0.856605,0.856708,0.856629
4,0.1801,1.201276,0.854358,0.856649,0.853614,0.853896
5,0.1315,1.266873,0.848624,0.850997,0.847857,0.848125
6,0.0965,1.198726,0.855505,0.856753,0.854951,0.8552
7,0.0752,1.21461,0.858945,0.859756,0.858498,0.858717
8,0.061,1.302012,0.852064,0.854077,0.851362,0.851633
9,0.0513,1.224807,0.850917,0.851779,0.850446,0.850663
10,0.0441,1.219622,0.854358,0.854352,0.854246,0.854289


[I 2025-03-23 09:21:10,569] Trial 42 finished with value: 0.8540658854101465 and parameters: {'learning_rate': 0.0013626432701280583, 'weight_decay': 0.009000000000000001, 'warmup_steps': 14, 'lambda_param': 0.8, 'temperature': 3.0}. Best is trial 10 with value: 0.8588913227951769.


Trial 43 with params: {'learning_rate': 0.0004298671642354885, 'weight_decay': 0.008, 'warmup_steps': 26, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5578,1.577133,0.817661,0.823973,0.818925,0.817129
2,0.7846,1.400715,0.827982,0.831604,0.826998,0.827163
3,0.5109,1.380577,0.84633,0.84925,0.845479,0.845739
4,0.364,1.41895,0.84633,0.84988,0.845394,0.845647
5,0.2748,1.473588,0.841743,0.841947,0.841479,0.841602


[I 2025-03-23 09:22:31,016] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.0003238339946107003, 'weight_decay': 0.01, 'warmup_steps': 17, 'lambda_param': 0.8, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6435,1.485633,0.821101,0.823263,0.821841,0.820987
2,0.8781,1.389578,0.823394,0.827978,0.822283,0.822377
3,0.6087,1.406091,0.84633,0.84988,0.845394,0.845647
4,0.4489,1.571575,0.826835,0.836211,0.825282,0.825094
5,0.3432,1.500967,0.841743,0.842042,0.841437,0.84158
6,0.2734,1.398909,0.855505,0.856079,0.855119,0.85531
7,0.2277,1.473781,0.84633,0.846876,0.845942,0.846123
8,0.1922,1.46184,0.849771,0.850225,0.85012,0.849769
9,0.1644,1.581832,0.83945,0.841699,0.840195,0.839347
10,0.1433,1.50749,0.855505,0.855698,0.855751,0.855504


[I 2025-03-23 09:27:16,167] Trial 44 finished with value: 0.8542886202128093 and parameters: {'learning_rate': 0.0003238339946107003, 'weight_decay': 0.01, 'warmup_steps': 17, 'lambda_param': 0.8, 'temperature': 2.5}. Best is trial 10 with value: 0.8588913227951769.


Trial 45 with params: {'learning_rate': 0.0007642176933495712, 'weight_decay': 0.009000000000000001, 'warmup_steps': 21, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3943,1.778552,0.81078,0.824278,0.812632,0.80937
2,0.6149,1.306078,0.840596,0.842064,0.839974,0.840208
3,0.3684,1.299374,0.84633,0.847729,0.845731,0.845973
4,0.2478,1.253736,0.850917,0.854528,0.849983,0.850255
5,0.1825,1.340692,0.850917,0.852798,0.850236,0.850501
6,0.1424,1.25386,0.858945,0.859756,0.858498,0.858717
7,0.1131,1.289827,0.858945,0.859054,0.85875,0.858847
8,0.0909,1.199372,0.862385,0.862625,0.862128,0.862263
9,0.0766,1.220151,0.860092,0.86024,0.859876,0.859986
10,0.0646,1.266077,0.857798,0.85774,0.857792,0.857762


[I 2025-03-23 09:31:45,963] Trial 45 finished with value: 0.8600181056443295 and parameters: {'learning_rate': 0.0007642176933495712, 'weight_decay': 0.009000000000000001, 'warmup_steps': 21, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}. Best is trial 45 with value: 0.8600181056443295.


Trial 46 with params: {'learning_rate': 0.0011757468541955139, 'weight_decay': 0.008, 'warmup_steps': 8, 'lambda_param': 0.5, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2734,1.619818,0.819954,0.831927,0.821683,0.81882
2,0.5144,1.178362,0.853211,0.853649,0.852867,0.853037
3,0.2964,1.122532,0.862385,0.863671,0.861834,0.862095
4,0.1934,1.165655,0.862385,0.86411,0.86175,0.862034
5,0.1403,1.397181,0.84633,0.851729,0.845184,0.84539
6,0.1068,1.218371,0.849771,0.849945,0.849531,0.849647
7,0.0827,1.196426,0.857798,0.857873,0.857624,0.857708
8,0.0665,1.270229,0.854358,0.854999,0.853951,0.854149
9,0.0565,1.270735,0.849771,0.850141,0.849446,0.849604
10,0.048,1.287916,0.852064,0.852104,0.851909,0.851978


[I 2025-03-23 09:34:45,250] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.00039294592429744307, 'weight_decay': 0.01, 'warmup_steps': 26, 'lambda_param': 0.5, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5951,1.546639,0.822248,0.826327,0.823261,0.821961
2,0.8146,1.442982,0.827982,0.831937,0.826955,0.827108
3,0.5393,1.440233,0.838303,0.842965,0.837217,0.837399
4,0.3874,1.535271,0.834862,0.843191,0.833418,0.833389
5,0.2958,1.48671,0.84289,0.842847,0.842816,0.84283


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 09:36:43,022] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.0027511979602444763, 'weight_decay': 0.005, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.093,1.265892,0.840596,0.841473,0.841069,0.840579
2,0.3981,1.174209,0.853211,0.853923,0.852783,0.852988
3,0.2141,1.173655,0.845183,0.845543,0.844858,0.845012
4,0.1344,1.328956,0.841743,0.842151,0.841395,0.841556
5,0.0968,1.282244,0.844037,0.844045,0.8439,0.843955


[I 2025-03-23 09:38:09,953] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.0009472559228590378, 'weight_decay': 0.002, 'warmup_steps': 19, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3362,1.592621,0.824541,0.83233,0.825935,0.823891
2,0.5557,1.196101,0.848624,0.848589,0.848699,0.848604
3,0.3279,1.095194,0.860092,0.860141,0.860255,0.860085
4,0.2168,1.257083,0.857798,0.858131,0.857498,0.857651
5,0.159,1.334725,0.847477,0.850261,0.846647,0.846912
6,0.12,1.21314,0.849771,0.851298,0.849152,0.849404
7,0.093,1.217386,0.862385,0.862366,0.862297,0.862327
8,0.0754,1.226088,0.858945,0.858942,0.858834,0.858878
9,0.0635,1.22152,0.860092,0.860112,0.85996,0.860018
10,0.0533,1.248356,0.860092,0.860381,0.860381,0.860092


[I 2025-03-23 09:43:40,516] Trial 49 finished with value: 0.8542564041823769 and parameters: {'learning_rate': 0.0009472559228590378, 'weight_decay': 0.002, 'warmup_steps': 19, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}. Best is trial 45 with value: 0.8600181056443295.


Trial 50 with params: {'learning_rate': 0.00011155354646039437, 'weight_decay': 0.004, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0801,1.546261,0.798165,0.799578,0.798771,0.798097
2,1.2912,1.502939,0.805046,0.806377,0.804391,0.804548
3,1.0627,1.394507,0.822248,0.822261,0.822083,0.822145
4,0.9181,1.542964,0.818807,0.824452,0.817568,0.817564
5,0.7928,1.40706,0.819954,0.820513,0.820336,0.819948
6,0.697,1.441571,0.821101,0.8222,0.82163,0.821067
7,0.6251,1.473757,0.825688,0.82607,0.826008,0.825687
8,0.5597,1.475103,0.829128,0.829202,0.828924,0.829009
9,0.5146,1.529742,0.826835,0.826811,0.826713,0.826753
10,0.4819,1.637555,0.821101,0.825001,0.822093,0.820829


[I 2025-03-23 09:46:42,780] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.0016981526509688345, 'weight_decay': 0.01, 'warmup_steps': 19, 'lambda_param': 0.4, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2235,1.399225,0.840596,0.844855,0.841616,0.840339
2,0.4537,1.069424,0.855505,0.855456,0.855456,0.855456
3,0.2496,1.139068,0.855505,0.856561,0.854993,0.85523
4,0.1611,1.284117,0.845183,0.84766,0.844395,0.844652
5,0.1151,1.432217,0.834862,0.837653,0.834007,0.834227


[I 2025-03-23 09:48:14,429] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 0.0005200025057894818, 'weight_decay': 0.01, 'warmup_steps': 26, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5036,1.735267,0.822248,0.826675,0.823303,0.821927
2,0.7134,1.324334,0.838303,0.84128,0.837427,0.837657
3,0.454,1.518629,0.827982,0.830162,0.827208,0.827414
4,0.3205,1.383067,0.845183,0.852088,0.843889,0.84403
5,0.2424,1.437715,0.848624,0.849476,0.848152,0.848365
6,0.1894,1.251523,0.863532,0.864361,0.863086,0.863312
7,0.153,1.287791,0.861239,0.862059,0.860792,0.861015
8,0.1271,1.296798,0.865826,0.865769,0.865844,0.865796
9,0.106,1.310274,0.857798,0.857764,0.857877,0.857779
10,0.0902,1.394319,0.853211,0.854577,0.853793,0.853173


[I 2025-03-23 09:51:44,338] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.0003559971385023291, 'weight_decay': 0.007, 'warmup_steps': 18, 'lambda_param': 0.8, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6256,1.543435,0.822248,0.826675,0.823303,0.821927
2,0.8456,1.376561,0.832569,0.834089,0.831923,0.832141
3,0.5638,1.36253,0.83945,0.841482,0.838722,0.838962
4,0.4104,1.517387,0.837156,0.84027,0.836259,0.836482
5,0.3162,1.506981,0.844037,0.844451,0.843689,0.843852


[I 2025-03-23 09:53:16,672] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 0.0002382552536664986, 'weight_decay': 0.01, 'warmup_steps': 11, 'lambda_param': 0.4, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7474,1.543905,0.802752,0.810796,0.804201,0.801935
2,0.989,1.39151,0.829128,0.829106,0.829008,0.829047
3,0.7216,1.426756,0.829128,0.833275,0.828082,0.828232
4,0.5676,1.504584,0.826835,0.830605,0.825829,0.825983
5,0.4528,1.502838,0.832569,0.832617,0.832386,0.832462
6,0.3691,1.437088,0.850917,0.850985,0.850741,0.850822
7,0.3115,1.489488,0.831422,0.831655,0.831681,0.831422
8,0.2647,1.628016,0.834862,0.838231,0.835775,0.834667
9,0.234,1.60074,0.84289,0.84478,0.843574,0.842815
10,0.2062,1.578923,0.838303,0.838283,0.838396,0.838286


[I 2025-03-23 09:56:20,140] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 0.0007308638909598288, 'weight_decay': 0.009000000000000001, 'warmup_steps': 17, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4014,1.726066,0.818807,0.825793,0.820136,0.81821
2,0.6214,1.256723,0.84633,0.847173,0.845858,0.846068
3,0.3788,1.207692,0.849771,0.850397,0.849362,0.849555
4,0.2586,1.222333,0.857798,0.859492,0.857161,0.857435
5,0.1883,1.329947,0.857798,0.859986,0.857077,0.857366
6,0.145,1.198443,0.858945,0.860532,0.858329,0.858601
7,0.1166,1.245178,0.869266,0.869427,0.869054,0.869167
8,0.0951,1.194963,0.864679,0.86466,0.864591,0.864621
9,0.0788,1.208269,0.864679,0.864834,0.864465,0.864576
10,0.0664,1.244869,0.855505,0.85547,0.855582,0.855486


[I 2025-03-23 09:59:26,982] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 0.0001413812546509425, 'weight_decay': 0.003, 'warmup_steps': 41, 'lambda_param': 0.8, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0101,1.49665,0.800459,0.800642,0.800686,0.800458
2,1.1898,1.407411,0.822248,0.822314,0.82204,0.822124
3,0.9484,1.392093,0.823394,0.824653,0.822788,0.822984
4,0.7968,1.610074,0.817661,0.824332,0.816315,0.816228
5,0.6705,1.445785,0.823394,0.824168,0.82384,0.82338
6,0.5821,1.445469,0.823394,0.823672,0.823672,0.823394
7,0.5188,1.499769,0.819954,0.819906,0.819999,0.819925
8,0.4599,1.589839,0.816514,0.817006,0.816873,0.81651
9,0.4144,1.554664,0.821101,0.821092,0.820956,0.821007
10,0.3822,1.593985,0.819954,0.820101,0.820167,0.819952


[I 2025-03-23 10:02:37,741] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 0.004329803696971102, 'weight_decay': 0.005, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0639,1.156528,0.856651,0.856674,0.856793,0.856642
2,0.3743,1.212471,0.849771,0.85258,0.848941,0.849214
3,0.2048,1.191973,0.847477,0.847418,0.847489,0.847443
4,0.1296,1.231849,0.84633,0.84634,0.846194,0.846249
5,0.0908,1.26185,0.849771,0.849724,0.849825,0.849747
6,0.0661,1.217875,0.861239,0.861636,0.860918,0.861085
7,0.0527,1.219359,0.860092,0.860092,0.860213,0.86008
8,0.0442,1.190328,0.856651,0.856647,0.85654,0.856583
9,0.0372,1.194923,0.861239,0.861193,0.861297,0.861216
10,0.0338,1.192411,0.858945,0.859227,0.858666,0.85881


[I 2025-03-23 10:08:05,561] Trial 57 finished with value: 0.8576716317321919 and parameters: {'learning_rate': 0.004329803696971102, 'weight_decay': 0.005, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 45 with value: 0.8600181056443295.


Trial 58 with params: {'learning_rate': 0.00451964694389276, 'weight_decay': 0.005, 'warmup_steps': 7, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0626,1.151109,0.856651,0.856891,0.856919,0.856651
2,0.373,1.169468,0.84633,0.847529,0.845773,0.846006
3,0.2068,1.222831,0.848624,0.850254,0.847983,0.848237
4,0.1293,1.212921,0.854358,0.854999,0.853951,0.854149
5,0.0887,1.230626,0.853211,0.853186,0.853119,0.853148
6,0.0658,1.213823,0.855505,0.855456,0.855456,0.855456
7,0.0523,1.21195,0.860092,0.860112,0.85996,0.860018
8,0.0437,1.247225,0.848624,0.849319,0.848194,0.848393
9,0.038,1.234208,0.850917,0.850931,0.850783,0.850839
10,0.0353,1.18064,0.855505,0.855481,0.855414,0.855443


[I 2025-03-23 10:11:09,429] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.0038521036086771464, 'weight_decay': 0.003, 'warmup_steps': 30, 'lambda_param': 0.2, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1382,1.148432,0.845183,0.848164,0.846036,0.845035
2,0.3804,1.209156,0.850917,0.852564,0.850278,0.850537
3,0.2019,1.142341,0.860092,0.860828,0.859666,0.859879
4,0.1264,1.114278,0.868119,0.868074,0.86818,0.868098
5,0.0867,1.17101,0.862385,0.862728,0.862086,0.862243
6,0.0642,1.187816,0.857798,0.857776,0.857708,0.857738
7,0.0527,1.229223,0.861239,0.861761,0.860876,0.861063
8,0.043,1.289913,0.849771,0.851298,0.849152,0.849404
9,0.0372,1.275275,0.853211,0.854652,0.852614,0.85287
10,0.0328,1.228639,0.857798,0.858131,0.857498,0.857651


[I 2025-03-23 10:16:28,351] Trial 59 finished with value: 0.8541734975106101 and parameters: {'learning_rate': 0.0038521036086771464, 'weight_decay': 0.003, 'warmup_steps': 30, 'lambda_param': 0.2, 'temperature': 3.5}. Best is trial 45 with value: 0.8600181056443295.


Trial 60 with params: {'learning_rate': 0.0003020939879565185, 'weight_decay': 0.005, 'warmup_steps': 43, 'lambda_param': 0.5, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7122,1.495133,0.819954,0.823679,0.820925,0.819696
2,0.9004,1.39797,0.831422,0.83283,0.830797,0.831011
3,0.6247,1.441441,0.836009,0.84182,0.834796,0.834915
4,0.4684,1.561004,0.829128,0.835211,0.827871,0.827922
5,0.3642,1.515206,0.83945,0.839744,0.839143,0.839284


[I 2025-03-23 10:18:24,621] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.0025231992939589733, 'weight_decay': 0.005, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1015,1.186062,0.83945,0.840095,0.839859,0.839442
2,0.4059,1.157372,0.858945,0.860532,0.858329,0.858601
3,0.217,1.137638,0.853211,0.853923,0.852783,0.852988
4,0.14,1.256359,0.858945,0.859133,0.858708,0.858829
5,0.0984,1.331329,0.845183,0.845351,0.844942,0.845056
6,0.0732,1.205474,0.857798,0.858866,0.857287,0.857528
7,0.0581,1.291889,0.84633,0.84633,0.846447,0.846317
8,0.0467,1.324018,0.847477,0.847573,0.847278,0.847371
9,0.0411,1.299985,0.844037,0.844097,0.843858,0.843937
10,0.0352,1.294495,0.84633,0.84675,0.845984,0.846148


[I 2025-03-23 10:21:53,982] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.004641167038388093, 'weight_decay': 0.001, 'warmup_steps': 43, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1494,1.085718,0.856651,0.856593,0.856666,0.85662
2,0.3853,1.190781,0.849771,0.850712,0.849278,0.8495
3,0.2047,1.302934,0.850917,0.852564,0.850278,0.850537
4,0.1311,1.255827,0.852064,0.852087,0.852204,0.852055
5,0.0909,1.213956,0.856651,0.856674,0.856793,0.856642
6,0.066,1.210201,0.857798,0.858379,0.857413,0.857606
7,0.0537,1.242857,0.852064,0.852025,0.851993,0.852008
8,0.0432,1.310351,0.850917,0.850858,0.850909,0.850879
9,0.0376,1.275137,0.850917,0.850867,0.850867,0.850867
10,0.033,1.283403,0.849771,0.84973,0.849699,0.849714


[I 2025-03-23 10:24:49,481] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 0.0026253869325269276, 'weight_decay': 0.004, 'warmup_steps': 12, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1245,1.114779,0.847477,0.84743,0.847531,0.847453
2,0.4023,1.189207,0.845183,0.845942,0.844731,0.844934
3,0.2156,1.223558,0.84289,0.843801,0.842395,0.842607
4,0.1352,1.312396,0.854358,0.857523,0.853488,0.853776
5,0.098,1.297209,0.848624,0.849177,0.848236,0.84842
6,0.0718,1.247851,0.84633,0.848175,0.845647,0.845901
7,0.0568,1.277228,0.852064,0.853394,0.851488,0.851736
8,0.0471,1.274803,0.84633,0.848175,0.845647,0.845901
9,0.0407,1.233777,0.853211,0.853923,0.852783,0.852988
10,0.0352,1.248286,0.852064,0.852561,0.851699,0.851877


[I 2025-03-23 10:27:53,349] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 0.00013405290551132384, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9903,1.528963,0.800459,0.803858,0.801402,0.80019
2,1.2132,1.424478,0.816514,0.816759,0.816199,0.816324
3,0.9843,1.392594,0.823394,0.823871,0.822998,0.823156
4,0.8289,1.546201,0.821101,0.825643,0.819988,0.82007
5,0.7108,1.480806,0.817661,0.819032,0.818252,0.817607


[I 2025-03-23 10:29:49,567] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.004880842217572153, 'weight_decay': 0.0, 'warmup_steps': 25, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.119,1.172359,0.854358,0.855256,0.854835,0.854342
2,0.38,1.143452,0.857798,0.858247,0.857456,0.85763
3,0.2067,1.267019,0.848624,0.84905,0.848278,0.848444
4,0.1305,1.287282,0.847477,0.847436,0.847405,0.847419
5,0.092,1.302396,0.847477,0.847648,0.847236,0.847352
6,0.0661,1.320091,0.849771,0.850397,0.849362,0.849555
7,0.0527,1.356756,0.83945,0.83957,0.839227,0.839328
8,0.0424,1.356694,0.849771,0.850892,0.849236,0.84947
9,0.0364,1.353585,0.844037,0.844097,0.843858,0.843937
10,0.0326,1.316573,0.850917,0.850858,0.850909,0.850879


[I 2025-03-23 10:32:38,253] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 0.0036463150157132313, 'weight_decay': 0.006, 'warmup_steps': 15, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1079,1.206218,0.849771,0.852484,0.850583,0.849647
2,0.3912,1.233153,0.849771,0.850547,0.84932,0.849528
3,0.2086,1.222542,0.848624,0.850734,0.847899,0.848164
4,0.1316,1.227785,0.852064,0.852045,0.852162,0.852048
5,0.0913,1.261935,0.84633,0.846275,0.846363,0.846301
6,0.0683,1.17768,0.853211,0.853176,0.853288,0.853192
7,0.0534,1.262797,0.847477,0.847573,0.847278,0.847371
8,0.0435,1.221271,0.849771,0.849945,0.849531,0.849647
9,0.0373,1.184257,0.854358,0.854299,0.854372,0.854325
10,0.0336,1.190149,0.848624,0.848689,0.848447,0.848527


[I 2025-03-23 10:35:57,511] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.0007786760087666213, 'weight_decay': 0.01, 'warmup_steps': 23, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.38,1.659798,0.818807,0.823741,0.819925,0.818425
2,0.6041,1.315615,0.84633,0.84925,0.845479,0.845739
3,0.3666,1.244214,0.847477,0.848095,0.847068,0.847258
4,0.245,1.233037,0.863532,0.865149,0.862918,0.863199
5,0.181,1.423218,0.84289,0.846901,0.84189,0.842117
6,0.1386,1.304325,0.855505,0.855732,0.855245,0.855376
7,0.1116,1.289446,0.855505,0.856561,0.854993,0.85523
8,0.088,1.241655,0.858945,0.859337,0.858624,0.858789
9,0.074,1.213436,0.863532,0.863773,0.863802,0.863532
10,0.0633,1.272692,0.854358,0.854339,0.854456,0.854342


[I 2025-03-23 10:40:39,384] Trial 67 finished with value: 0.856533453371398 and parameters: {'learning_rate': 0.0007786760087666213, 'weight_decay': 0.01, 'warmup_steps': 23, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}. Best is trial 45 with value: 0.8600181056443295.


Trial 68 with params: {'learning_rate': 0.0011525159271653389, 'weight_decay': 0.008, 'warmup_steps': 37, 'lambda_param': 0.8, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3261,1.606032,0.823394,0.836404,0.825187,0.822182
2,0.5264,1.175179,0.856651,0.856593,0.856666,0.85662
3,0.3026,1.100321,0.862385,0.862537,0.862171,0.862281
4,0.1994,1.241223,0.847477,0.853533,0.846268,0.846459
5,0.1433,1.284193,0.855505,0.856753,0.854951,0.8552
6,0.1089,1.20347,0.858945,0.859337,0.858624,0.858789
7,0.085,1.218365,0.862385,0.862351,0.862465,0.862367
8,0.0681,1.254084,0.858945,0.859927,0.858455,0.858691
9,0.0571,1.202322,0.861239,0.861203,0.861171,0.861186
10,0.0484,1.293101,0.855505,0.855554,0.855666,0.855498


[I 2025-03-23 10:45:26,365] Trial 68 finished with value: 0.8496660547078061 and parameters: {'learning_rate': 0.0011525159271653389, 'weight_decay': 0.008, 'warmup_steps': 37, 'lambda_param': 0.8, 'temperature': 2.5}. Best is trial 45 with value: 0.8600181056443295.


Trial 69 with params: {'learning_rate': 0.0003183848757718585, 'weight_decay': 0.01, 'warmup_steps': 17, 'lambda_param': 0.4, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.652,1.458353,0.827982,0.829701,0.828639,0.827908
2,0.8786,1.37732,0.832569,0.835055,0.831755,0.831971
3,0.6054,1.407303,0.832569,0.837702,0.831418,0.831544
4,0.4489,1.49711,0.827982,0.831937,0.826955,0.827108
5,0.3499,1.523779,0.836009,0.837247,0.835428,0.835646


[I 2025-03-23 10:47:11,917] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.0005071657640141345, 'weight_decay': 0.003, 'warmup_steps': 37, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5296,1.667639,0.825688,0.830706,0.826808,0.825321
2,0.7228,1.339526,0.832569,0.835935,0.831628,0.831825
3,0.4531,1.473654,0.831422,0.833508,0.830671,0.830888
4,0.3198,1.380207,0.841743,0.846277,0.840679,0.840886
5,0.2409,1.381562,0.841743,0.841947,0.841479,0.841602


[I 2025-03-23 10:48:42,692] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.0011098108211473024, 'weight_decay': 0.009000000000000001, 'warmup_steps': 32, 'lambda_param': 0.5, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3057,1.52965,0.818807,0.826719,0.82022,0.81811
2,0.5277,1.132695,0.853211,0.853534,0.852909,0.85306
3,0.3082,1.161799,0.853211,0.855627,0.852446,0.852727
4,0.1994,1.17859,0.855505,0.858227,0.854698,0.854989
5,0.1462,1.317333,0.852064,0.854898,0.851236,0.851516
6,0.1103,1.180559,0.854358,0.85532,0.853867,0.854095
7,0.0863,1.187994,0.858945,0.859097,0.859171,0.858943
8,0.0687,1.254673,0.847477,0.848244,0.847026,0.847231
9,0.0583,1.213764,0.853211,0.853778,0.852825,0.853013
10,0.0492,1.244989,0.848624,0.848564,0.848615,0.848585


[I 2025-03-23 10:51:30,841] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.000838754836564938, 'weight_decay': 0.01, 'warmup_steps': 25, 'lambda_param': 0.8, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3653,1.681248,0.817661,0.828973,0.819346,0.816578
2,0.5922,1.246342,0.845183,0.846681,0.844563,0.844806
3,0.3521,1.224491,0.853211,0.853404,0.853456,0.85321
4,0.236,1.221753,0.856651,0.860869,0.855656,0.855946
5,0.1728,1.378356,0.848624,0.852544,0.847647,0.847904
6,0.1322,1.272117,0.849771,0.852021,0.849025,0.849295
7,0.1066,1.249549,0.860092,0.860206,0.860297,0.860089
8,0.0842,1.169033,0.862385,0.862407,0.862255,0.862313
9,0.0714,1.205404,0.856651,0.856731,0.856835,0.856647
10,0.0598,1.261335,0.855505,0.855619,0.855709,0.855502


[I 2025-03-23 10:56:09,490] Trial 72 finished with value: 0.8588289181174557 and parameters: {'learning_rate': 0.000838754836564938, 'weight_decay': 0.01, 'warmup_steps': 25, 'lambda_param': 0.8, 'temperature': 3.5}. Best is trial 45 with value: 0.8600181056443295.


Trial 73 with params: {'learning_rate': 0.004469957370895107, 'weight_decay': 0.003, 'warmup_steps': 16, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.085,1.216874,0.84633,0.846295,0.846405,0.84631
2,0.3745,1.262273,0.838303,0.840712,0.837512,0.837748
3,0.2018,1.263113,0.858945,0.861013,0.858245,0.858534
4,0.1261,1.229496,0.863532,0.863935,0.863213,0.863381
5,0.0873,1.308543,0.858945,0.858899,0.859003,0.858923
6,0.0648,1.246,0.860092,0.860112,0.85996,0.860018
7,0.0514,1.261384,0.856651,0.856632,0.85675,0.856636
8,0.0425,1.279318,0.857798,0.85803,0.85754,0.857672
9,0.0365,1.297077,0.854358,0.854352,0.854246,0.854289
10,0.0328,1.303957,0.855505,0.855521,0.855372,0.855429


[I 2025-03-23 11:00:52,771] Trial 73 finished with value: 0.8530803940461336 and parameters: {'learning_rate': 0.004469957370895107, 'weight_decay': 0.003, 'warmup_steps': 16, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 45 with value: 0.8600181056443295.


Trial 74 with params: {'learning_rate': 0.0005648543556671768, 'weight_decay': 0.008, 'warmup_steps': 26, 'lambda_param': 0.9, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.485,1.700998,0.816514,0.823458,0.817841,0.815909
2,0.6929,1.325635,0.845183,0.846104,0.844689,0.844904
3,0.4329,1.538127,0.825688,0.827847,0.824914,0.825113
4,0.2996,1.329589,0.841743,0.844052,0.840974,0.841221
5,0.2231,1.377094,0.844037,0.844715,0.843605,0.843799


[I 2025-03-23 11:02:26,273] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 0.0005517683477217503, 'weight_decay': 0.01, 'warmup_steps': 35, 'lambda_param': 0.8, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4901,1.664114,0.808486,0.816897,0.809958,0.807664
2,0.7,1.291976,0.841743,0.844611,0.84089,0.841134
3,0.4407,1.407089,0.849771,0.853201,0.848857,0.849126
4,0.3066,1.321205,0.850917,0.854528,0.849983,0.850255
5,0.2287,1.377442,0.845183,0.845351,0.844942,0.845056
6,0.1767,1.243773,0.862385,0.862846,0.862044,0.862222
7,0.1452,1.306408,0.863532,0.863498,0.863465,0.86348
8,0.1201,1.271728,0.869266,0.869623,0.868969,0.869131
9,0.0996,1.278477,0.861239,0.861834,0.861634,0.861234
10,0.0844,1.340244,0.861239,0.861219,0.861339,0.861224


[I 2025-03-23 11:07:15,616] Trial 75 finished with value: 0.8646076759510726 and parameters: {'learning_rate': 0.0005517683477217503, 'weight_decay': 0.01, 'warmup_steps': 35, 'lambda_param': 0.8, 'temperature': 3.5}. Best is trial 75 with value: 0.8646076759510726.


Trial 76 with params: {'learning_rate': 0.00028794463081255984, 'weight_decay': 0.01, 'warmup_steps': 29, 'lambda_param': 0.9, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7024,1.51682,0.813073,0.815576,0.813873,0.81292
2,0.9256,1.351861,0.834862,0.835374,0.83447,0.83464
3,0.6543,1.433599,0.831422,0.837135,0.830208,0.830297
4,0.4942,1.497248,0.837156,0.841987,0.836049,0.836218
5,0.3835,1.495469,0.848624,0.849177,0.848236,0.84842
6,0.3073,1.468317,0.850917,0.850985,0.850741,0.850822
7,0.2557,1.469592,0.833716,0.833696,0.833807,0.833698
8,0.2149,1.461161,0.847477,0.847815,0.847784,0.847477
9,0.1872,1.542551,0.845183,0.847083,0.845868,0.84511
10,0.1651,1.548989,0.84633,0.846723,0.846657,0.846329


[I 2025-03-23 11:10:28,340] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.0012656943296273765, 'weight_decay': 0.009000000000000001, 'warmup_steps': 30, 'lambda_param': 0.8, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2898,1.451549,0.829128,0.836532,0.830481,0.828542
2,0.4978,1.112084,0.862385,0.86298,0.862002,0.8622
3,0.281,1.080589,0.860092,0.86068,0.859708,0.859903
4,0.1863,1.183951,0.858945,0.862492,0.858034,0.85834
5,0.133,1.375371,0.847477,0.852299,0.846394,0.846625
6,0.1011,1.214978,0.855505,0.85742,0.854824,0.855101
7,0.0767,1.209793,0.852064,0.852561,0.851699,0.851877
8,0.0623,1.302346,0.848624,0.850997,0.847857,0.848125
9,0.0525,1.312655,0.847477,0.849976,0.846689,0.846954
10,0.0451,1.315584,0.849771,0.850712,0.849278,0.8495


[I 2025-03-23 11:13:39,544] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 0.0008498019888753552, 'weight_decay': 0.01, 'warmup_steps': 19, 'lambda_param': 1.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3491,1.685936,0.823394,0.831892,0.824851,0.822663
2,0.5793,1.218347,0.844037,0.844164,0.843816,0.843918
3,0.3463,1.17748,0.852064,0.852087,0.852204,0.852055
4,0.2353,1.202154,0.860092,0.862299,0.859371,0.859667
5,0.1709,1.345122,0.855505,0.858227,0.854698,0.854989
6,0.1296,1.179524,0.861239,0.861636,0.860918,0.861085
7,0.104,1.237729,0.862385,0.863054,0.862802,0.862379
8,0.0846,1.225951,0.864679,0.865026,0.864381,0.864539
9,0.0694,1.247964,0.858945,0.858942,0.858834,0.858878
10,0.0584,1.257772,0.858945,0.858968,0.859087,0.858936


[I 2025-03-23 11:18:13,800] Trial 78 finished with value: 0.8577502736381242 and parameters: {'learning_rate': 0.0008498019888753552, 'weight_decay': 0.01, 'warmup_steps': 19, 'lambda_param': 1.0, 'temperature': 3.5}. Best is trial 75 with value: 0.8646076759510726.


Trial 79 with params: {'learning_rate': 0.0006135759056628725, 'weight_decay': 0.01, 'warmup_steps': 41, 'lambda_param': 0.8, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.481,1.598326,0.822248,0.825369,0.823135,0.822051
2,0.6661,1.366078,0.83945,0.843584,0.838427,0.838634
3,0.4133,1.404204,0.837156,0.842377,0.836007,0.83616
4,0.2832,1.266466,0.847477,0.849213,0.846815,0.84707
5,0.2116,1.373532,0.84289,0.844167,0.842311,0.842542


[I 2025-03-23 11:19:45,737] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 0.0006819636125122306, 'weight_decay': 0.01, 'warmup_steps': 30, 'lambda_param': 0.9, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4332,1.734936,0.81422,0.823011,0.815715,0.813395
2,0.6488,1.364719,0.845183,0.84714,0.844479,0.844732
3,0.3961,1.346351,0.841743,0.843795,0.841016,0.841262
4,0.2694,1.302989,0.841743,0.844324,0.840932,0.841178
5,0.1996,1.338953,0.852064,0.853606,0.851446,0.851704
6,0.1534,1.192179,0.856651,0.856758,0.856456,0.856552
7,0.1217,1.286603,0.862385,0.862913,0.86276,0.862382
8,0.1003,1.272683,0.863532,0.863727,0.863297,0.86342
9,0.0821,1.242863,0.862385,0.862385,0.862507,0.862374
10,0.0693,1.309746,0.860092,0.860038,0.860129,0.860065


[I 2025-03-23 11:24:26,752] Trial 80 finished with value: 0.8646333249136988 and parameters: {'learning_rate': 0.0006819636125122306, 'weight_decay': 0.01, 'warmup_steps': 30, 'lambda_param': 0.9, 'temperature': 3.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 81 with params: {'learning_rate': 0.00034840794552941227, 'weight_decay': 0.01, 'warmup_steps': 10, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6135,1.409819,0.826835,0.829151,0.827597,0.826714
2,0.8534,1.374988,0.830275,0.83223,0.829544,0.829759
3,0.5829,1.380344,0.840596,0.845685,0.839469,0.83965
4,0.4293,1.476902,0.834862,0.840442,0.83367,0.833791
5,0.3307,1.476248,0.83945,0.839395,0.839395,0.839395


[I 2025-03-23 11:25:58,646] Trial 81 pruned. 


Trial 82 with params: {'learning_rate': 0.0008832181752557159, 'weight_decay': 0.01, 'warmup_steps': 25, 'lambda_param': 0.9, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.365,1.888379,0.81078,0.824918,0.812674,0.809293
2,0.5833,1.269729,0.84289,0.843977,0.842353,0.842575
3,0.3449,1.178748,0.849771,0.851524,0.84911,0.849369
4,0.2292,1.227027,0.848624,0.853273,0.847563,0.847804
5,0.1659,1.381369,0.848624,0.852204,0.847689,0.847951
6,0.1277,1.228988,0.861239,0.863325,0.860539,0.860834
7,0.1008,1.253144,0.853211,0.853157,0.853246,0.853183
8,0.0799,1.203069,0.860092,0.860057,0.860171,0.860073
9,0.0677,1.233364,0.856651,0.856596,0.856624,0.856609
10,0.0562,1.266457,0.854358,0.854302,0.85433,0.854315


[I 2025-03-23 11:30:27,622] Trial 82 finished with value: 0.8508224766416634 and parameters: {'learning_rate': 0.0008832181752557159, 'weight_decay': 0.01, 'warmup_steps': 25, 'lambda_param': 0.9, 'temperature': 2.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 83 with params: {'learning_rate': 0.00021092917570554568, 'weight_decay': 0.009000000000000001, 'warmup_steps': 35, 'lambda_param': 0.8, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8314,1.582778,0.786697,0.795311,0.788225,0.785683
2,1.0479,1.38858,0.827982,0.827944,0.827882,0.827908
3,0.7729,1.381637,0.830275,0.83178,0.829629,0.829842
4,0.6101,1.480537,0.819954,0.821708,0.819241,0.81943
5,0.4913,1.45248,0.829128,0.829354,0.828839,0.828964


[I 2025-03-23 11:31:54,876] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 0.0009798327620635931, 'weight_decay': 0.01, 'warmup_steps': 28, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3472,1.514362,0.818807,0.827204,0.820262,0.818057
2,0.5544,1.225894,0.849771,0.849762,0.849657,0.849699
3,0.3259,1.16272,0.848624,0.852204,0.847689,0.847951
4,0.2148,1.228073,0.857798,0.860847,0.85695,0.857251
5,0.1555,1.386277,0.847477,0.850878,0.846563,0.846823
6,0.1201,1.203838,0.857798,0.859986,0.857077,0.857366
7,0.0925,1.211602,0.853211,0.853435,0.852951,0.85308
8,0.0743,1.241449,0.857798,0.858379,0.857413,0.857606
9,0.0617,1.271789,0.848624,0.848937,0.84832,0.848468
10,0.0523,1.268803,0.850917,0.850863,0.850951,0.850889


[I 2025-03-23 11:34:53,465] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.003856848670031493, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1051,1.226857,0.836009,0.836243,0.83627,0.836009
2,0.378,1.251242,0.84289,0.845079,0.842142,0.842392
3,0.2053,1.239741,0.854358,0.854631,0.854077,0.854218
4,0.1287,1.21077,0.858945,0.859601,0.85854,0.858743
5,0.0904,1.257487,0.858945,0.858942,0.858834,0.858878
6,0.0674,1.275175,0.84289,0.844167,0.842311,0.842542
7,0.0576,1.291406,0.853211,0.853435,0.852951,0.85308
8,0.0448,1.262984,0.854358,0.854631,0.854077,0.854218
9,0.0384,1.263574,0.854358,0.854352,0.854246,0.854289
10,0.0342,1.204639,0.857798,0.85775,0.85775,0.85775


[I 2025-03-23 11:39:31,139] Trial 85 finished with value: 0.856533453371398 and parameters: {'learning_rate': 0.003856848670031493, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 86 with params: {'learning_rate': 0.0009478467887748727, 'weight_decay': 0.01, 'warmup_steps': 17, 'lambda_param': 0.9, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3125,1.663887,0.81078,0.822457,0.812505,0.809588
2,0.5609,1.200273,0.847477,0.848408,0.846984,0.847202
3,0.3296,1.103454,0.868119,0.8682,0.868306,0.868115
4,0.2184,1.185669,0.854358,0.856925,0.853572,0.853858
5,0.1576,1.336886,0.854358,0.855701,0.853783,0.854035
6,0.1216,1.206993,0.856651,0.858008,0.856077,0.856334
7,0.0938,1.189178,0.863532,0.863486,0.863591,0.86351
8,0.0767,1.217968,0.858945,0.858909,0.858876,0.858891
9,0.065,1.19902,0.863532,0.863727,0.863297,0.86342
10,0.0532,1.237264,0.860092,0.860286,0.860339,0.860091


[I 2025-03-23 11:44:04,598] Trial 86 finished with value: 0.8576513954713008 and parameters: {'learning_rate': 0.0009478467887748727, 'weight_decay': 0.01, 'warmup_steps': 17, 'lambda_param': 0.9, 'temperature': 3.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 87 with params: {'learning_rate': 0.0036885182391836785, 'weight_decay': 0.01, 'warmup_steps': 22, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.102,1.195323,0.849771,0.851027,0.85033,0.849737
2,0.3818,1.257681,0.850917,0.851054,0.850699,0.850804
3,0.2086,1.208424,0.84289,0.843493,0.842479,0.842665
4,0.1314,1.207173,0.849771,0.849711,0.849783,0.849737
5,0.0912,1.348395,0.84289,0.842877,0.842774,0.842815


[I 2025-03-23 11:45:36,148] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 0.0014424694845455415, 'weight_decay': 0.004, 'warmup_steps': 18, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2278,1.387842,0.830275,0.833305,0.831144,0.8301
2,0.4826,1.037068,0.864679,0.865779,0.86417,0.864421
3,0.2703,1.109107,0.861239,0.862621,0.860666,0.860931
4,0.1789,1.216777,0.844037,0.848992,0.842932,0.843138
5,0.1287,1.280687,0.850917,0.851952,0.850404,0.850634
6,0.0954,1.237927,0.849771,0.851298,0.849152,0.849404
7,0.0741,1.254722,0.849771,0.851765,0.849068,0.849333
8,0.0597,1.247377,0.853211,0.855109,0.85253,0.852801
9,0.0496,1.253896,0.849771,0.850547,0.84932,0.849528
10,0.0429,1.259962,0.857798,0.857817,0.857666,0.857723


[I 2025-03-23 11:50:14,609] Trial 88 finished with value: 0.8541965366016144 and parameters: {'learning_rate': 0.0014424694845455415, 'weight_decay': 0.004, 'warmup_steps': 18, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 89 with params: {'learning_rate': 0.0004895958325056396, 'weight_decay': 0.01, 'warmup_steps': 30, 'lambda_param': 1.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.527,1.648512,0.813073,0.821103,0.814505,0.812327
2,0.7378,1.333082,0.838303,0.840204,0.837596,0.837832
3,0.47,1.432882,0.837156,0.838931,0.83647,0.836702
4,0.3332,1.483244,0.838303,0.84648,0.83688,0.836895
5,0.2526,1.41439,0.847477,0.848244,0.847026,0.847231
6,0.1982,1.276852,0.858945,0.859927,0.858455,0.858691
7,0.1618,1.326394,0.857798,0.858526,0.857371,0.857582
8,0.1322,1.347324,0.860092,0.860429,0.859792,0.859947
9,0.1117,1.354048,0.855505,0.857092,0.856129,0.855456
10,0.0953,1.410718,0.857798,0.858974,0.85834,0.857771


[I 2025-03-23 11:55:06,567] Trial 89 finished with value: 0.8611857728310277 and parameters: {'learning_rate': 0.0004895958325056396, 'weight_decay': 0.01, 'warmup_steps': 30, 'lambda_param': 1.0, 'temperature': 2.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 90 with params: {'learning_rate': 0.0007374269671147737, 'weight_decay': 0.009000000000000001, 'warmup_steps': 38, 'lambda_param': 1.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4327,1.553809,0.811927,0.81794,0.813168,0.811402
2,0.6175,1.284827,0.844037,0.846641,0.843226,0.84348
3,0.3719,1.280367,0.84633,0.847343,0.845815,0.846038
4,0.2517,1.272008,0.852064,0.853197,0.85153,0.851768
5,0.184,1.313745,0.848624,0.849648,0.84811,0.848336
6,0.141,1.252098,0.853211,0.854652,0.852614,0.85287
7,0.1121,1.227434,0.858945,0.858887,0.858961,0.858914
8,0.0915,1.265151,0.848624,0.849319,0.848194,0.848393
9,0.0774,1.260782,0.853211,0.85335,0.852993,0.8531
10,0.0648,1.289208,0.855505,0.855505,0.855624,0.855492


[I 2025-03-23 11:59:39,797] Trial 90 finished with value: 0.8530997304582211 and parameters: {'learning_rate': 0.0007374269671147737, 'weight_decay': 0.009000000000000001, 'warmup_steps': 38, 'lambda_param': 1.0, 'temperature': 2.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 91 with params: {'learning_rate': 0.00041089292798918067, 'weight_decay': 0.009000000000000001, 'warmup_steps': 25, 'lambda_param': 0.9, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5727,1.554357,0.818807,0.823375,0.819883,0.818463
2,0.7889,1.376637,0.834862,0.837372,0.834049,0.834273
3,0.5133,1.611168,0.841743,0.848797,0.840427,0.840532
4,0.3717,1.477472,0.837156,0.843204,0.835922,0.836038
5,0.2835,1.45903,0.845183,0.846903,0.844521,0.84477
6,0.2235,1.370287,0.852064,0.852561,0.851699,0.851877
7,0.1874,1.411816,0.854358,0.855701,0.853783,0.854035
8,0.154,1.417266,0.858945,0.859097,0.859171,0.858943
9,0.131,1.354563,0.864679,0.86535,0.865096,0.864672
10,0.1121,1.471238,0.858945,0.860223,0.859508,0.858914


[I 2025-03-23 12:04:08,739] Trial 91 finished with value: 0.8623389744885073 and parameters: {'learning_rate': 0.00041089292798918067, 'weight_decay': 0.009000000000000001, 'warmup_steps': 25, 'lambda_param': 0.9, 'temperature': 2.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 92 with params: {'learning_rate': 0.0004073415843843474, 'weight_decay': 0.01, 'warmup_steps': 36, 'lambda_param': 1.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6155,1.599233,0.81422,0.820689,0.815505,0.813656
2,0.8019,1.372296,0.831422,0.834617,0.830502,0.830699
3,0.5263,1.450961,0.845183,0.848241,0.84431,0.844565
4,0.3787,1.48986,0.837156,0.843204,0.835922,0.836038
5,0.2879,1.513887,0.845183,0.845794,0.844774,0.844961
6,0.2275,1.300808,0.856651,0.856758,0.856456,0.856552
7,0.1865,1.39743,0.857798,0.85803,0.85754,0.857672
8,0.1556,1.379786,0.858945,0.859287,0.859255,0.858945
9,0.1313,1.496624,0.849771,0.851687,0.850457,0.849699
10,0.1132,1.467272,0.856651,0.856891,0.856919,0.856651


[I 2025-03-23 12:09:08,045] Trial 92 finished with value: 0.8543146894268172 and parameters: {'learning_rate': 0.0004073415843843474, 'weight_decay': 0.01, 'warmup_steps': 36, 'lambda_param': 1.0, 'temperature': 2.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 93 with params: {'learning_rate': 0.00025030760841176787, 'weight_decay': 0.01, 'warmup_steps': 23, 'lambda_param': 1.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7535,1.489842,0.821101,0.824675,0.822051,0.82086
2,0.9623,1.385978,0.825688,0.826603,0.825166,0.825356
3,0.6918,1.446321,0.826835,0.833742,0.825493,0.825474
4,0.5313,1.48938,0.825688,0.829608,0.824661,0.824802
5,0.4207,1.465269,0.840596,0.840945,0.840269,0.84042


[I 2025-03-23 12:10:35,080] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.00032932025868277016, 'weight_decay': 0.01, 'warmup_steps': 31, 'lambda_param': 0.8, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6664,1.522161,0.81422,0.817736,0.815168,0.81397
2,0.8774,1.373654,0.832569,0.833884,0.831965,0.832179
3,0.5932,1.395671,0.838303,0.841586,0.837385,0.837609
4,0.4374,1.509472,0.827982,0.831937,0.826955,0.827108
5,0.3352,1.497818,0.837156,0.837156,0.83727,0.837142
6,0.2665,1.396765,0.845183,0.845277,0.844984,0.845076
7,0.224,1.471936,0.848624,0.849648,0.84811,0.848336
8,0.1875,1.46291,0.850917,0.851728,0.851372,0.850905
9,0.1629,1.462061,0.849771,0.851027,0.85033,0.849737
10,0.1417,1.519119,0.855505,0.855554,0.855666,0.855498


[I 2025-03-23 12:15:30,314] Trial 94 finished with value: 0.8565832876110329 and parameters: {'learning_rate': 0.00032932025868277016, 'weight_decay': 0.01, 'warmup_steps': 31, 'lambda_param': 0.8, 'temperature': 3.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 95 with params: {'learning_rate': 0.0002684354404258217, 'weight_decay': 0.007, 'warmup_steps': 23, 'lambda_param': 0.8, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7352,1.486752,0.821101,0.824364,0.822009,0.820889
2,0.9417,1.368799,0.823394,0.824852,0.822746,0.822944
3,0.6672,1.508653,0.818807,0.827148,0.817315,0.81711
4,0.5129,1.481072,0.824541,0.825371,0.82404,0.824225
5,0.3996,1.502139,0.836009,0.836462,0.835638,0.835802


[I 2025-03-23 12:17:01,752] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.00015972356535382792, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9118,1.629998,0.783257,0.793007,0.784889,0.782046
2,1.1661,1.411596,0.823394,0.823506,0.823588,0.823391
3,0.9078,1.400191,0.834862,0.835821,0.834344,0.834548
4,0.7445,1.589311,0.818807,0.824056,0.81761,0.817632
5,0.6235,1.42217,0.827982,0.828093,0.828176,0.827978
6,0.5418,1.492697,0.834862,0.834828,0.834765,0.834792
7,0.4763,1.476619,0.833716,0.834289,0.833302,0.833477
8,0.4182,1.546961,0.826835,0.827066,0.827092,0.826835
9,0.3751,1.544071,0.833716,0.833738,0.833849,0.833705
10,0.3428,1.548834,0.825688,0.825966,0.825966,0.825688


[I 2025-03-23 12:20:19,934] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.0033127125232190924, 'weight_decay': 0.007, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1246,1.174187,0.849771,0.85278,0.850625,0.849626
2,0.3884,1.146356,0.856651,0.857623,0.856161,0.856393
3,0.2113,1.230632,0.844037,0.846108,0.843311,0.843563
4,0.1302,1.185913,0.856651,0.856605,0.856708,0.856629
5,0.0894,1.226326,0.856651,0.856758,0.856456,0.856552
6,0.0682,1.226535,0.848624,0.848757,0.848404,0.848509
7,0.0547,1.285793,0.853211,0.853176,0.853288,0.853192
8,0.0458,1.263506,0.853211,0.853186,0.853119,0.853148
9,0.0393,1.243076,0.852064,0.852018,0.85212,0.852041
10,0.0338,1.238599,0.847477,0.847573,0.847278,0.847371


[I 2025-03-23 12:23:30,367] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.0012555012636296784, 'weight_decay': 0.01, 'warmup_steps': 32, 'lambda_param': 0.8, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.295,1.546094,0.821101,0.832228,0.822767,0.82007
2,0.5024,1.129954,0.864679,0.865431,0.864254,0.864473
3,0.2857,1.126379,0.856651,0.857623,0.856161,0.856393
4,0.1878,1.277806,0.848624,0.854067,0.847478,0.847698
5,0.1357,1.304633,0.854358,0.854399,0.854204,0.854273
6,0.1013,1.168182,0.857798,0.858688,0.857329,0.857555
7,0.0786,1.199078,0.861239,0.861902,0.860834,0.86104
8,0.0645,1.242545,0.854358,0.855152,0.853909,0.854123
9,0.0528,1.256315,0.853211,0.854257,0.852698,0.852932
10,0.0449,1.252373,0.858945,0.858942,0.858834,0.858878


[I 2025-03-23 12:27:59,746] Trial 98 finished with value: 0.8565138121910605 and parameters: {'learning_rate': 0.0012555012636296784, 'weight_decay': 0.01, 'warmup_steps': 32, 'lambda_param': 0.8, 'temperature': 3.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 99 with params: {'learning_rate': 0.0017422533204379319, 'weight_decay': 0.0, 'warmup_steps': 5, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1624,1.377127,0.838303,0.842902,0.839364,0.838011
2,0.4504,1.076427,0.860092,0.860071,0.860003,0.860032
3,0.2505,1.044369,0.858945,0.859227,0.858666,0.85881
4,0.162,1.149259,0.860092,0.860429,0.859792,0.859947
5,0.1171,1.262408,0.852064,0.852104,0.851909,0.851978
6,0.0858,1.228246,0.849771,0.851088,0.849194,0.849438
7,0.0668,1.187534,0.857798,0.857944,0.857582,0.85769
8,0.053,1.242534,0.856651,0.8573,0.856245,0.856446
9,0.0463,1.270449,0.849771,0.850397,0.849362,0.849555
10,0.0401,1.265164,0.854358,0.854631,0.854077,0.854218


[I 2025-03-23 12:32:40,119] Trial 99 finished with value: 0.8553097345132743 and parameters: {'learning_rate': 0.0017422533204379319, 'weight_decay': 0.0, 'warmup_steps': 5, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 100 with params: {'learning_rate': 0.004463096479266976, 'weight_decay': 0.003, 'warmup_steps': 24, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0961,1.179696,0.852064,0.852651,0.852456,0.852059
2,0.3702,1.246166,0.850917,0.851952,0.850404,0.850634
3,0.2009,1.224383,0.850917,0.852798,0.850236,0.850501
4,0.1275,1.31916,0.850917,0.851952,0.850404,0.850634
5,0.0883,1.31313,0.83945,0.839505,0.839269,0.839347
6,0.0646,1.248191,0.848624,0.848564,0.848615,0.848585
7,0.0533,1.363142,0.84633,0.846542,0.846068,0.846194
8,0.0443,1.285303,0.854358,0.854352,0.854246,0.854289
9,0.0377,1.279987,0.850917,0.850931,0.850783,0.850839
10,0.0331,1.249489,0.850917,0.851054,0.850699,0.850804


[I 2025-03-23 12:35:50,948] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 0.0009843685674246845, 'weight_decay': 0.006, 'warmup_steps': 32, 'lambda_param': 0.4, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3569,1.699259,0.809633,0.825392,0.811632,0.807935
2,0.5612,1.141989,0.853211,0.853778,0.852825,0.853013
3,0.3277,1.160897,0.856651,0.858701,0.855951,0.856234
4,0.213,1.127049,0.863532,0.865149,0.862918,0.863199
5,0.1558,1.32737,0.849771,0.851765,0.849068,0.849333
6,0.1198,1.211159,0.854358,0.855915,0.85374,0.854003
7,0.093,1.187982,0.861239,0.861286,0.861087,0.861158
8,0.0755,1.254539,0.856651,0.856647,0.85654,0.856583
9,0.0631,1.18786,0.862385,0.862537,0.862171,0.862281
10,0.0531,1.245074,0.857798,0.857776,0.857708,0.857738


[I 2025-03-23 12:40:24,678] Trial 101 finished with value: 0.8473306961712883 and parameters: {'learning_rate': 0.0009843685674246845, 'weight_decay': 0.006, 'warmup_steps': 32, 'lambda_param': 0.4, 'temperature': 4.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 102 with params: {'learning_rate': 0.0005185195529955805, 'weight_decay': 0.01, 'warmup_steps': 30, 'lambda_param': 0.9, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.515,1.758596,0.81078,0.824278,0.812632,0.80937
2,0.7196,1.305492,0.84289,0.845344,0.8421,0.842351
3,0.4512,1.490398,0.832569,0.834089,0.831923,0.832141
4,0.3173,1.449056,0.829128,0.837538,0.827661,0.827566
5,0.2375,1.441828,0.845183,0.846903,0.844521,0.84477
6,0.1847,1.284161,0.861239,0.861636,0.860918,0.861085
7,0.1513,1.339136,0.860092,0.860327,0.859834,0.859967
8,0.1237,1.337478,0.863532,0.863486,0.863591,0.86351
9,0.1044,1.318912,0.858945,0.859405,0.859297,0.858943
10,0.0896,1.352754,0.861239,0.862958,0.861886,0.861186


[I 2025-03-23 12:45:13,357] Trial 102 finished with value: 0.8565832876110329 and parameters: {'learning_rate': 0.0005185195529955805, 'weight_decay': 0.01, 'warmup_steps': 30, 'lambda_param': 0.9, 'temperature': 3.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 103 with params: {'learning_rate': 0.00045416633942113523, 'weight_decay': 0.008, 'warmup_steps': 10, 'lambda_param': 1.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5262,1.760113,0.816514,0.823458,0.817841,0.815909
2,0.7695,1.350565,0.836009,0.83867,0.835175,0.835401
3,0.4986,1.482095,0.848624,0.852544,0.847647,0.847904
4,0.3565,1.435444,0.834862,0.841294,0.833586,0.833664
5,0.2738,1.529279,0.84633,0.848682,0.845563,0.845823
6,0.2156,1.375498,0.857798,0.858866,0.857287,0.857528
7,0.1748,1.412505,0.852064,0.853016,0.851572,0.851797
8,0.1441,1.375486,0.856651,0.856647,0.85654,0.856583
9,0.1231,1.374186,0.858945,0.859287,0.859255,0.858945
10,0.105,1.441847,0.849771,0.849922,0.849994,0.849769


[I 2025-03-23 12:48:27,990] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.0007980589386485602, 'weight_decay': 0.005, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.337,1.587378,0.816514,0.822191,0.817715,0.816045
2,0.6111,1.267738,0.845183,0.846104,0.844689,0.844904
3,0.3663,1.199362,0.857798,0.85803,0.85754,0.857672
4,0.2431,1.315364,0.841743,0.845231,0.840806,0.84104
5,0.18,1.353405,0.850917,0.852798,0.850236,0.850501
6,0.1353,1.209502,0.858945,0.860315,0.858371,0.858632
7,0.1078,1.283879,0.856651,0.856674,0.856793,0.856642
8,0.0862,1.228976,0.858945,0.858909,0.858876,0.858891
9,0.0731,1.27678,0.860092,0.860092,0.860213,0.86008
10,0.0606,1.299987,0.860092,0.860381,0.860381,0.860092


[I 2025-03-23 12:52:51,935] Trial 104 finished with value: 0.8531484356498171 and parameters: {'learning_rate': 0.0007980589386485602, 'weight_decay': 0.005, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 105 with params: {'learning_rate': 0.001394113520827695, 'weight_decay': 0.002, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2989,1.560289,0.827982,0.834681,0.829271,0.827459
2,0.4902,1.115852,0.860092,0.860092,0.860213,0.86008
3,0.2731,1.097327,0.866972,0.867445,0.866633,0.866815
4,0.1769,1.24384,0.854358,0.858541,0.853362,0.853642
5,0.1308,1.278695,0.849771,0.850892,0.849236,0.84947
6,0.0972,1.187963,0.860092,0.860991,0.859624,0.859853
7,0.0742,1.254013,0.858945,0.858942,0.858834,0.858878
8,0.0602,1.270849,0.858945,0.859756,0.858498,0.858717
9,0.0507,1.289563,0.856651,0.858223,0.856035,0.856302
10,0.0436,1.289853,0.861239,0.861181,0.861255,0.861208


[I 2025-03-23 12:57:21,025] Trial 105 finished with value: 0.8588289181174557 and parameters: {'learning_rate': 0.001394113520827695, 'weight_decay': 0.002, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 106 with params: {'learning_rate': 0.0005378088280556363, 'weight_decay': 0.01, 'warmup_steps': 21, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4835,1.6186,0.818807,0.823375,0.819883,0.818463
2,0.7174,1.361817,0.832569,0.836258,0.831586,0.831772
3,0.4523,1.315387,0.855505,0.855577,0.85533,0.855413
4,0.3113,1.28698,0.848624,0.850997,0.847857,0.848125
5,0.2347,1.43904,0.849771,0.851765,0.849068,0.849333
6,0.183,1.259383,0.856651,0.857161,0.856287,0.85647
7,0.1481,1.344093,0.857798,0.858688,0.857329,0.857555
8,0.1231,1.33722,0.858945,0.858899,0.859003,0.858923
9,0.1019,1.317235,0.860092,0.860141,0.860255,0.860085
10,0.088,1.374656,0.854358,0.854947,0.854751,0.854353


[I 2025-03-23 13:01:56,334] Trial 106 finished with value: 0.8622809973045822 and parameters: {'learning_rate': 0.0005378088280556363, 'weight_decay': 0.01, 'warmup_steps': 21, 'lambda_param': 1.0, 'temperature': 4.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 107 with params: {'learning_rate': 0.0024666082703259655, 'weight_decay': 0.002, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.203,1.264646,0.83945,0.840988,0.840069,0.839395
2,0.4182,1.100216,0.863532,0.865637,0.862834,0.863135
3,0.2238,1.184386,0.857798,0.859267,0.857203,0.857468
4,0.143,1.201015,0.852064,0.854077,0.851362,0.851633
5,0.0994,1.224454,0.855505,0.855732,0.855245,0.855376
6,0.0731,1.197204,0.852064,0.852698,0.851657,0.851852
7,0.0574,1.281862,0.837156,0.838931,0.83647,0.836702
8,0.0469,1.259599,0.845183,0.846282,0.844647,0.844873
9,0.0406,1.220822,0.849771,0.849808,0.849615,0.849683
10,0.0358,1.23111,0.852064,0.852008,0.852035,0.85202


[I 2025-03-23 13:05:07,883] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 0.0011776245713683707, 'weight_decay': 0.006, 'warmup_steps': 24, 'lambda_param': 0.9, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2995,1.61429,0.832569,0.839345,0.83386,0.83206
2,0.507,1.152695,0.861239,0.86284,0.860623,0.8609
3,0.2892,1.158168,0.852064,0.853606,0.851446,0.851704
4,0.1892,1.220152,0.845183,0.849589,0.844142,0.844371
5,0.1365,1.301276,0.852064,0.853834,0.851404,0.851669
6,0.1025,1.188539,0.861239,0.862059,0.860792,0.861015
7,0.0801,1.194277,0.860092,0.860038,0.860129,0.860065
8,0.0663,1.192524,0.856651,0.856836,0.856414,0.856533
9,0.0542,1.246245,0.860092,0.860991,0.859624,0.859853
10,0.0459,1.260266,0.855505,0.855521,0.855372,0.855429


[I 2025-03-23 13:08:15,556] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.0011264217291802228, 'weight_decay': 0.001, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3412,1.678678,0.81422,0.825144,0.815884,0.81315
2,0.5302,1.123889,0.857798,0.858688,0.857329,0.857555
3,0.3034,1.111016,0.858945,0.860532,0.858329,0.858601
4,0.1998,1.20595,0.860092,0.861365,0.859539,0.859797
5,0.1438,1.349972,0.847477,0.851209,0.846521,0.846776
6,0.1082,1.173909,0.860092,0.86068,0.859708,0.859903
7,0.0854,1.136428,0.864679,0.86466,0.864591,0.864621
8,0.0683,1.231318,0.854358,0.854299,0.854372,0.854325
9,0.058,1.21366,0.861239,0.861761,0.860876,0.861063
10,0.0491,1.256535,0.853211,0.853176,0.853288,0.853192


[I 2025-03-23 13:11:29,672] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.0022093215538851515, 'weight_decay': 0.006, 'warmup_steps': 37, 'lambda_param': 0.9, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.201,1.219087,0.847477,0.848932,0.848078,0.847432
2,0.433,1.074518,0.860092,0.861575,0.859497,0.859766
3,0.2332,1.050933,0.857798,0.858247,0.857456,0.85763
4,0.1499,1.184952,0.860092,0.861575,0.859497,0.859766
5,0.1048,1.304366,0.852064,0.852104,0.851909,0.851978
6,0.0787,1.200732,0.850917,0.851236,0.850615,0.850764
7,0.0614,1.237422,0.849771,0.850547,0.84932,0.849528
8,0.049,1.187772,0.854358,0.854631,0.854077,0.854218
9,0.0417,1.209312,0.858945,0.859461,0.858582,0.858766
10,0.0363,1.207077,0.856651,0.856695,0.856498,0.856568


[I 2025-03-23 13:16:02,017] Trial 110 finished with value: 0.8553097345132743 and parameters: {'learning_rate': 0.0022093215538851515, 'weight_decay': 0.006, 'warmup_steps': 37, 'lambda_param': 0.9, 'temperature': 6.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 111 with params: {'learning_rate': 0.0003383529604768503, 'weight_decay': 0.01, 'warmup_steps': 22, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6441,1.481384,0.817661,0.820749,0.818546,0.817459
2,0.8666,1.349266,0.838303,0.839973,0.837638,0.837871
3,0.5837,1.374756,0.840596,0.8436,0.839722,0.83996
4,0.4333,1.599011,0.825688,0.835842,0.824072,0.823811
5,0.331,1.499802,0.836009,0.835992,0.835891,0.835931


[I 2025-03-23 13:17:36,635] Trial 111 pruned. 


Trial 112 with params: {'learning_rate': 0.0009023462519898151, 'weight_decay': 0.008, 'warmup_steps': 25, 'lambda_param': 1.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.359,1.571407,0.819954,0.829136,0.821472,0.819126
2,0.5723,1.212992,0.840596,0.843307,0.839764,0.840005
3,0.335,1.201817,0.855505,0.858844,0.854614,0.854906
4,0.2257,1.194506,0.857798,0.858866,0.857287,0.857528
5,0.1636,1.365169,0.847477,0.851557,0.846478,0.846727
6,0.1267,1.162551,0.864679,0.866419,0.864044,0.864333
7,0.0993,1.173258,0.862385,0.862625,0.862128,0.862263
8,0.0792,1.169261,0.863532,0.863823,0.863255,0.863401
9,0.0662,1.217643,0.856651,0.856614,0.856582,0.856597
10,0.0562,1.210844,0.860092,0.860038,0.860129,0.860065


[I 2025-03-23 13:22:03,888] Trial 112 finished with value: 0.8473081840001158 and parameters: {'learning_rate': 0.0009023462519898151, 'weight_decay': 0.008, 'warmup_steps': 25, 'lambda_param': 1.0, 'temperature': 2.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 113 with params: {'learning_rate': 0.0007267417633115903, 'weight_decay': 0.009000000000000001, 'warmup_steps': 23, 'lambda_param': 0.8, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4164,1.61084,0.816514,0.821055,0.817589,0.816165
2,0.6226,1.284972,0.840596,0.841497,0.840101,0.840309
3,0.3764,1.231423,0.849771,0.850225,0.85012,0.849769
4,0.2552,1.227695,0.856651,0.858701,0.855951,0.856234
5,0.1871,1.387336,0.848624,0.852544,0.847647,0.847904
6,0.1479,1.201323,0.861239,0.861525,0.86096,0.861105
7,0.117,1.287894,0.860092,0.860547,0.85975,0.859926
8,0.0939,1.273111,0.856651,0.856929,0.856372,0.856514
9,0.0785,1.245866,0.856651,0.856605,0.856708,0.856629
10,0.0663,1.273471,0.860092,0.860141,0.860255,0.860085


[I 2025-03-23 13:27:01,592] Trial 113 finished with value: 0.8565968727593261 and parameters: {'learning_rate': 0.0007267417633115903, 'weight_decay': 0.009000000000000001, 'warmup_steps': 23, 'lambda_param': 0.8, 'temperature': 3.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 114 with params: {'learning_rate': 0.0016299742652784003, 'weight_decay': 0.0, 'warmup_steps': 37, 'lambda_param': 0.7000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2494,1.400686,0.833716,0.841205,0.83507,0.833145
2,0.4596,1.085035,0.861239,0.861525,0.86096,0.861105
3,0.2525,1.1309,0.858945,0.859227,0.858666,0.85881
4,0.1628,1.326661,0.838303,0.843748,0.837133,0.837284
5,0.1163,1.331263,0.848624,0.849835,0.848068,0.848305
6,0.0875,1.256242,0.853211,0.853649,0.852867,0.853037
7,0.0683,1.300707,0.84289,0.843801,0.842395,0.842607
8,0.0545,1.28132,0.852064,0.852849,0.851614,0.851826
9,0.0462,1.273222,0.854358,0.854539,0.854119,0.854238
10,0.0403,1.286452,0.849771,0.849762,0.849657,0.849699


[I 2025-03-23 13:30:10,523] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.0013785329004308625, 'weight_decay': 0.003, 'warmup_steps': 38, 'lambda_param': 0.9, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2885,1.518657,0.829128,0.837501,0.830565,0.828446
2,0.4902,1.137132,0.860092,0.860286,0.860339,0.860091
3,0.2727,1.202383,0.860092,0.861365,0.859539,0.859797
4,0.175,1.273714,0.84289,0.848848,0.841679,0.841841
5,0.1267,1.313845,0.847477,0.848408,0.846984,0.847202
6,0.0958,1.215336,0.847477,0.848587,0.846942,0.847171
7,0.073,1.242433,0.854358,0.855152,0.853909,0.854123
8,0.0599,1.26992,0.860092,0.86117,0.859582,0.859826
9,0.0495,1.251194,0.841743,0.842567,0.841269,0.841473
10,0.0427,1.299667,0.849771,0.849869,0.849573,0.849666


[I 2025-03-23 13:33:20,381] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.00011264504731179041, 'weight_decay': 0.007, 'warmup_steps': 28, 'lambda_param': 0.4, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0975,1.546618,0.790138,0.79109,0.790635,0.790104
2,1.2801,1.466676,0.811927,0.81208,0.811653,0.811759
3,1.0571,1.40375,0.819954,0.819901,0.819872,0.819886
4,0.9044,1.584199,0.816514,0.822108,0.815273,0.815254
5,0.7794,1.421572,0.819954,0.820649,0.820378,0.819943
6,0.6819,1.39289,0.830275,0.83066,0.830597,0.830274
7,0.6101,1.529362,0.825688,0.826984,0.826261,0.825643
8,0.5504,1.522393,0.830275,0.830219,0.830302,0.830243
9,0.509,1.552871,0.826835,0.827152,0.826503,0.826643
10,0.4702,1.679498,0.816514,0.821055,0.817589,0.816165


[I 2025-03-23 13:36:52,735] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 0.000387589386601397, 'weight_decay': 0.008, 'warmup_steps': 26, 'lambda_param': 1.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.594,1.609955,0.808486,0.815505,0.809832,0.807829
2,0.8186,1.410984,0.821101,0.824624,0.820115,0.82025
3,0.5447,1.505226,0.836009,0.843152,0.83467,0.83472
4,0.391,1.452923,0.832569,0.835935,0.831628,0.831825
5,0.2992,1.507322,0.832569,0.833073,0.832176,0.832343


[I 2025-03-23 13:38:40,872] Trial 117 pruned. 


Trial 118 with params: {'learning_rate': 0.0011718246136221842, 'weight_decay': 0.01, 'warmup_steps': 17, 'lambda_param': 0.8, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.263,1.527453,0.823394,0.830463,0.824724,0.822812
2,0.5175,1.172158,0.854358,0.854339,0.854456,0.854342
3,0.2983,1.156606,0.862385,0.862728,0.862086,0.862243
4,0.1948,1.158936,0.862385,0.863671,0.861834,0.862095
5,0.1409,1.222212,0.863532,0.863935,0.863213,0.863381
6,0.1069,1.205017,0.848624,0.850997,0.847857,0.848125
7,0.0831,1.18171,0.863532,0.863877,0.863844,0.863532
8,0.0672,1.184293,0.857798,0.857873,0.857624,0.857708
9,0.0568,1.196541,0.856651,0.856632,0.85675,0.856636
10,0.0479,1.263337,0.847477,0.847418,0.847489,0.847443


[I 2025-03-23 13:41:59,502] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.0014139976615434866, 'weight_decay': 0.005, 'warmup_steps': 28, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.266,1.425528,0.825688,0.831916,0.826934,0.825202
2,0.4831,1.077196,0.864679,0.864633,0.864633,0.864633
3,0.2649,1.103554,0.860092,0.861365,0.859539,0.859797
4,0.1731,1.182114,0.863532,0.865637,0.862834,0.863135
5,0.1255,1.425409,0.831422,0.83634,0.830292,0.830421
6,0.0937,1.238368,0.849771,0.851524,0.84911,0.849369
7,0.072,1.192592,0.856651,0.856836,0.856414,0.856533
8,0.0595,1.239651,0.858945,0.859227,0.858666,0.85881
9,0.0495,1.215299,0.854358,0.854539,0.854119,0.854238
10,0.0428,1.27307,0.857798,0.857944,0.857582,0.85769


[I 2025-03-23 13:46:31,359] Trial 119 finished with value: 0.8576513954713008 and parameters: {'learning_rate': 0.0014139976615434866, 'weight_decay': 0.005, 'warmup_steps': 28, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 120 with params: {'learning_rate': 0.004146399611805343, 'weight_decay': 0.004, 'warmup_steps': 25, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1093,1.231726,0.856651,0.859405,0.857466,0.856533
2,0.3801,1.207124,0.84289,0.844828,0.842184,0.842432
3,0.2052,1.16405,0.849771,0.849711,0.849783,0.849737
4,0.1313,1.243226,0.850917,0.851054,0.850699,0.850804
5,0.0892,1.331787,0.84633,0.846542,0.846068,0.846194
6,0.0665,1.296686,0.84633,0.846275,0.846363,0.846301
7,0.0531,1.350255,0.852064,0.852087,0.852204,0.852055
8,0.044,1.323705,0.848624,0.848589,0.848699,0.848604
9,0.0378,1.308589,0.852064,0.853532,0.852667,0.85202
10,0.0333,1.279262,0.854358,0.854438,0.85454,0.854353


[I 2025-03-23 13:49:36,884] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 8.532115701682182e-05, 'weight_decay': 0.003, 'warmup_steps': 28, 'lambda_param': 1.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2304,1.579844,0.793578,0.793519,0.793593,0.793539
2,1.3691,1.510288,0.801606,0.802196,0.801139,0.801285
3,1.1685,1.433469,0.813073,0.813025,0.813116,0.813044
4,1.0419,1.548833,0.81422,0.819763,0.812979,0.812945
5,0.9191,1.399299,0.823394,0.823387,0.823251,0.823302
6,0.8319,1.467093,0.825688,0.831498,0.826892,0.825243
7,0.761,1.50434,0.831422,0.834922,0.832355,0.831209
8,0.6964,1.433766,0.830275,0.830321,0.830092,0.830167
9,0.6486,1.532276,0.825688,0.828375,0.82483,0.825017
10,0.6084,1.591531,0.822248,0.827037,0.823346,0.821891


[I 2025-03-23 13:52:33,395] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 0.004444814148059207, 'weight_decay': 0.002, 'warmup_steps': 32, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1645,1.241988,0.847477,0.847714,0.847741,0.847477
2,0.3842,1.185471,0.849771,0.850892,0.849236,0.84947
3,0.2055,1.357172,0.845183,0.847943,0.844353,0.844609
4,0.1298,1.241502,0.852064,0.852045,0.852162,0.852048
5,0.0895,1.318173,0.858945,0.859461,0.858582,0.858766
6,0.0646,1.278574,0.854358,0.854861,0.853993,0.854173
7,0.0522,1.253429,0.858945,0.859054,0.85875,0.858847
8,0.0423,1.289339,0.850917,0.851137,0.850657,0.850785
9,0.0372,1.282917,0.852064,0.852025,0.851993,0.852008
10,0.0333,1.283044,0.854358,0.854399,0.854204,0.854273


[I 2025-03-23 13:55:40,063] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.0037693822226426065, 'weight_decay': 0.008, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0891,1.20112,0.860092,0.861273,0.860634,0.860065
2,0.3818,1.143104,0.854358,0.854399,0.854204,0.854273
3,0.2045,1.168876,0.850917,0.850892,0.850825,0.850854
4,0.1305,1.282741,0.844037,0.84455,0.844405,0.844033
5,0.0897,1.277398,0.848624,0.848597,0.848531,0.848559
6,0.0681,1.22108,0.848624,0.849177,0.848236,0.84842
7,0.0533,1.234921,0.856651,0.857161,0.856287,0.85647
8,0.044,1.274889,0.847477,0.849452,0.846773,0.847033
9,0.0373,1.207996,0.860092,0.860429,0.859792,0.859947
10,0.0329,1.180149,0.855505,0.855948,0.855161,0.855333


[I 2025-03-23 14:00:27,778] Trial 123 finished with value: 0.8540951101098133 and parameters: {'learning_rate': 0.0037693822226426065, 'weight_decay': 0.008, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 124 with params: {'learning_rate': 0.0010881782140340856, 'weight_decay': 0.001, 'warmup_steps': 32, 'lambda_param': 1.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3225,1.575404,0.818807,0.822685,0.819799,0.818531
2,0.549,1.188866,0.853211,0.853226,0.853077,0.853134
3,0.3143,1.168223,0.857798,0.858131,0.857498,0.857651
4,0.2046,1.194377,0.863532,0.865637,0.862834,0.863135
5,0.1475,1.316041,0.853211,0.855627,0.852446,0.852727
6,0.1117,1.202158,0.844037,0.84487,0.843563,0.84377
7,0.0878,1.19815,0.865826,0.866361,0.865465,0.865656
8,0.0699,1.258686,0.860092,0.860112,0.85996,0.860018
9,0.0589,1.252674,0.850917,0.851137,0.850657,0.850785
10,0.0498,1.306748,0.852064,0.852025,0.851993,0.852008


[I 2025-03-23 14:03:33,730] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 0.0023489068115098437, 'weight_decay': 0.005, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1136,1.172196,0.856651,0.858356,0.857298,0.856597
2,0.4085,1.192451,0.853211,0.854082,0.852741,0.85296
3,0.2234,1.22763,0.845183,0.846474,0.844605,0.84484
4,0.1428,1.322325,0.848624,0.849319,0.848194,0.848393
5,0.1004,1.295152,0.848624,0.849319,0.848194,0.848393
6,0.076,1.26843,0.850917,0.853593,0.850109,0.850385
7,0.0589,1.292168,0.849771,0.851088,0.849194,0.849438
8,0.0475,1.339237,0.848624,0.849835,0.848068,0.848305
9,0.0402,1.327571,0.84289,0.843361,0.842521,0.842691
10,0.0353,1.271295,0.84633,0.84646,0.84611,0.846214


[I 2025-03-23 14:06:39,043] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.0009049791490282845, 'weight_decay': 0.0, 'warmup_steps': 34, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3768,1.722636,0.81078,0.826247,0.812758,0.809133
2,0.5679,1.167991,0.853211,0.853435,0.852951,0.85308
3,0.3366,1.101169,0.856651,0.857161,0.856287,0.85647
4,0.2242,1.15986,0.853211,0.856208,0.852362,0.852646
5,0.1618,1.317137,0.854358,0.856144,0.853698,0.853969
6,0.1246,1.146131,0.860092,0.860547,0.85975,0.859926
7,0.0974,1.149808,0.868119,0.868074,0.86818,0.868098
8,0.0781,1.206821,0.868119,0.8681,0.868222,0.868105
9,0.0666,1.189784,0.861239,0.861286,0.861087,0.861158
10,0.0561,1.282016,0.850917,0.850985,0.850741,0.850822


[I 2025-03-23 14:09:38,863] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.001019536154039239, 'weight_decay': 0.009000000000000001, 'warmup_steps': 16, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3116,1.71055,0.819954,0.827651,0.821346,0.819287
2,0.5429,1.240498,0.841743,0.841801,0.841564,0.841642
3,0.317,1.128881,0.856651,0.856632,0.85675,0.856636
4,0.2096,1.232406,0.850917,0.8542,0.850025,0.8503
5,0.1507,1.381369,0.845183,0.847393,0.844437,0.844693
6,0.1148,1.231197,0.853211,0.85536,0.852488,0.852765
7,0.0899,1.217863,0.852064,0.852025,0.851993,0.852008
8,0.073,1.23765,0.853211,0.85335,0.852993,0.8531
9,0.0617,1.208366,0.862385,0.862728,0.862086,0.862243
10,0.0507,1.298948,0.857798,0.85774,0.857792,0.857762


[I 2025-03-23 14:14:27,888] Trial 127 finished with value: 0.853059505002633 and parameters: {'learning_rate': 0.001019536154039239, 'weight_decay': 0.009000000000000001, 'warmup_steps': 16, 'lambda_param': 1.0, 'temperature': 4.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 128 with params: {'learning_rate': 0.0011593501697953923, 'weight_decay': 0.01, 'warmup_steps': 29, 'lambda_param': 1.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3059,1.586993,0.822248,0.830473,0.823682,0.821538
2,0.5222,1.162397,0.852064,0.852561,0.851699,0.851877
3,0.298,1.052045,0.863532,0.864061,0.86317,0.863359
4,0.1937,1.272459,0.845183,0.851631,0.843932,0.844091
5,0.1383,1.346575,0.849771,0.853885,0.848773,0.849032
6,0.1048,1.176205,0.856651,0.857161,0.856287,0.85647
7,0.0818,1.183118,0.848624,0.848573,0.848573,0.848573
8,0.0648,1.232672,0.857798,0.858688,0.857329,0.857555
9,0.0544,1.247942,0.848624,0.849319,0.848194,0.848393
10,0.0464,1.272226,0.854358,0.854319,0.854288,0.854302


[I 2025-03-23 14:19:03,893] Trial 128 finished with value: 0.8495551794756979 and parameters: {'learning_rate': 0.0011593501697953923, 'weight_decay': 0.01, 'warmup_steps': 29, 'lambda_param': 1.0, 'temperature': 3.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 129 with params: {'learning_rate': 0.0028475228006924285, 'weight_decay': 0.004, 'warmup_steps': 14, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1034,1.198115,0.850917,0.853784,0.851751,0.850785
2,0.3999,1.155576,0.857798,0.857817,0.857666,0.857723
3,0.2162,1.24832,0.84633,0.847529,0.845773,0.846006
4,0.1365,1.27919,0.854358,0.854539,0.854119,0.854238
5,0.0944,1.277478,0.848624,0.849177,0.848236,0.84842
6,0.0688,1.221496,0.850917,0.850867,0.850867,0.850867
7,0.0554,1.232627,0.855505,0.855732,0.855245,0.855376
8,0.0458,1.220646,0.857798,0.857873,0.857624,0.857708
9,0.0395,1.23692,0.856651,0.856836,0.856414,0.856533
10,0.0355,1.200632,0.857798,0.85775,0.85775,0.85775


[I 2025-03-23 14:23:46,180] Trial 129 finished with value: 0.8554125542834583 and parameters: {'learning_rate': 0.0028475228006924285, 'weight_decay': 0.004, 'warmup_steps': 14, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 130 with params: {'learning_rate': 0.00030156974352495147, 'weight_decay': 0.002, 'warmup_steps': 38, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7024,1.556952,0.817661,0.822038,0.818715,0.817332
2,0.9053,1.400815,0.827982,0.830983,0.827082,0.827269
3,0.6293,1.483144,0.836009,0.84412,0.834586,0.834582
4,0.4706,1.521558,0.832569,0.839394,0.831249,0.831287
5,0.3637,1.457006,0.836009,0.835963,0.835933,0.835947


[I 2025-03-23 14:25:22,176] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.0005612567161548509, 'weight_decay': 0.01, 'warmup_steps': 39, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4997,1.614554,0.825688,0.831916,0.826934,0.825202
2,0.6971,1.373733,0.838303,0.843349,0.837175,0.837342
3,0.4336,1.376018,0.849771,0.852292,0.848983,0.849255
4,0.2985,1.299072,0.849771,0.851524,0.84911,0.849369
5,0.2248,1.388833,0.845183,0.845661,0.844816,0.844988
6,0.1759,1.228862,0.862385,0.862728,0.862086,0.862243
7,0.143,1.329577,0.856651,0.858223,0.856035,0.856302
8,0.1175,1.282112,0.864679,0.865146,0.864339,0.864519
9,0.0986,1.283231,0.864679,0.864679,0.864802,0.864668
10,0.0831,1.343643,0.860092,0.860141,0.860255,0.860085


[I 2025-03-23 14:30:08,462] Trial 131 finished with value: 0.8611726224074798 and parameters: {'learning_rate': 0.0005612567161548509, 'weight_decay': 0.01, 'warmup_steps': 39, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 132 with params: {'learning_rate': 0.0017373682198792734, 'weight_decay': 0.008, 'warmup_steps': 37, 'lambda_param': 0.2, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2405,1.252785,0.831422,0.833043,0.83206,0.831358
2,0.456,1.093024,0.864679,0.865026,0.864381,0.864539
3,0.2478,1.158866,0.84633,0.846876,0.845942,0.846123
4,0.1593,1.222708,0.847477,0.849707,0.846731,0.846994
5,0.1136,1.42472,0.837156,0.838499,0.836554,0.836777


[I 2025-03-23 14:31:38,102] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 0.00047231529676467333, 'weight_decay': 0.01, 'warmup_steps': 37, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.541,1.594351,0.822248,0.825079,0.823093,0.822077
2,0.7511,1.362968,0.823394,0.826951,0.822409,0.822554
3,0.4764,1.456771,0.845183,0.849965,0.8441,0.844318
4,0.3381,1.489515,0.832569,0.841869,0.831039,0.830924
5,0.2573,1.42994,0.837156,0.838126,0.836638,0.836846
6,0.2028,1.274394,0.863532,0.864724,0.863002,0.863259
7,0.1667,1.390442,0.857798,0.859492,0.857161,0.857435
8,0.1387,1.336734,0.863532,0.863486,0.863591,0.86351
9,0.1147,1.344183,0.864679,0.86535,0.865096,0.864672
10,0.0998,1.405881,0.863532,0.864444,0.864012,0.863518


[I 2025-03-23 14:36:06,855] Trial 133 finished with value: 0.8634529168635017 and parameters: {'learning_rate': 0.00047231529676467333, 'weight_decay': 0.01, 'warmup_steps': 37, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 134 with params: {'learning_rate': 6.558978114640059e-05, 'weight_decay': 0.0, 'warmup_steps': 19, 'lambda_param': 0.1, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3721,1.651177,0.788991,0.790022,0.789509,0.788951
2,1.4836,1.557782,0.793578,0.794202,0.793087,0.793226
3,1.284,1.471467,0.797018,0.797163,0.797224,0.797016
4,1.1573,1.550264,0.813073,0.818396,0.811853,0.811826
5,1.0437,1.406546,0.819954,0.819926,0.81983,0.819869


[I 2025-03-23 14:37:35,584] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.00021614932610812554, 'weight_decay': 0.009000000000000001, 'warmup_steps': 36, 'lambda_param': 0.1, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8356,1.672037,0.780963,0.795762,0.782973,0.778961
2,1.0317,1.369529,0.831422,0.831402,0.831302,0.831342
3,0.7567,1.436853,0.827982,0.832648,0.826871,0.826991
4,0.5986,1.681351,0.81078,0.822933,0.808969,0.808309
5,0.4794,1.480357,0.838303,0.838326,0.838438,0.838292
6,0.3933,1.508371,0.840596,0.840844,0.840311,0.840443
7,0.335,1.536803,0.83945,0.842292,0.838596,0.838831
8,0.2876,1.609079,0.83945,0.84318,0.840406,0.839233
9,0.2549,1.567248,0.837156,0.838289,0.83769,0.837125
10,0.2262,1.770204,0.837156,0.841206,0.838154,0.836908


[I 2025-03-23 14:40:31,619] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.00037807648214725325, 'weight_decay': 0.007, 'warmup_steps': 40, 'lambda_param': 0.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6413,1.56748,0.816514,0.82037,0.817504,0.816234
2,0.8319,1.474389,0.821101,0.82764,0.819778,0.819732
3,0.5606,1.431426,0.844037,0.848242,0.843016,0.843244
4,0.4048,1.506161,0.836009,0.845154,0.834502,0.834436
5,0.308,1.505507,0.840596,0.840684,0.840395,0.840485


[I 2025-03-23 14:42:10,970] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 0.0010247326912156238, 'weight_decay': 0.01, 'warmup_steps': 31, 'lambda_param': 0.1, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3409,1.64964,0.818807,0.830447,0.820514,0.817699
2,0.5556,1.17446,0.852064,0.852849,0.851614,0.851826
3,0.3234,1.146288,0.854358,0.854999,0.853951,0.854149
4,0.2089,1.173614,0.865826,0.867457,0.865212,0.865499
5,0.1525,1.32576,0.852064,0.853834,0.851404,0.851669
6,0.1151,1.237035,0.853211,0.854447,0.852656,0.852901
7,0.0904,1.250241,0.858945,0.859287,0.859255,0.858945
8,0.0713,1.279278,0.855505,0.855481,0.855414,0.855443
9,0.0604,1.254787,0.853211,0.853186,0.853119,0.853148
10,0.0514,1.272942,0.853211,0.853176,0.853288,0.853192


[I 2025-03-23 14:45:15,065] Trial 137 pruned. 


Trial 138 with params: {'learning_rate': 0.0007244232855387846, 'weight_decay': 0.008, 'warmup_steps': 41, 'lambda_param': 0.1, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4447,1.668404,0.81078,0.821883,0.812463,0.809656
2,0.616,1.302131,0.841743,0.844611,0.84089,0.841134
3,0.3758,1.271586,0.853211,0.854257,0.852698,0.852932
4,0.2541,1.26833,0.860092,0.86068,0.859708,0.859903
5,0.1867,1.356955,0.847477,0.849452,0.846773,0.847033
6,0.1431,1.240578,0.852064,0.853606,0.851446,0.851704
7,0.1143,1.203938,0.861239,0.861203,0.861171,0.861186
8,0.0921,1.249793,0.862385,0.862351,0.862465,0.862367
9,0.0768,1.221026,0.860092,0.860092,0.860213,0.86008
10,0.0653,1.27554,0.860092,0.860092,0.860213,0.86008


[I 2025-03-23 14:50:06,201] Trial 138 finished with value: 0.8496470591332251 and parameters: {'learning_rate': 0.0007244232855387846, 'weight_decay': 0.008, 'warmup_steps': 41, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 139 with params: {'learning_rate': 0.00039441639533558683, 'weight_decay': 0.01, 'warmup_steps': 41, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6312,1.571985,0.818807,0.822362,0.819757,0.818563
2,0.8095,1.440171,0.830275,0.83223,0.829544,0.829759
3,0.535,1.462087,0.840596,0.844928,0.839553,0.83976
4,0.3817,1.501221,0.838303,0.842965,0.837217,0.837399
5,0.2927,1.466134,0.844037,0.844164,0.843816,0.843918


[I 2025-03-23 14:51:37,150] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.0009627780812583816, 'weight_decay': 0.009000000000000001, 'warmup_steps': 30, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3654,1.792477,0.822248,0.834294,0.823977,0.821128
2,0.5672,1.187042,0.849771,0.849869,0.849573,0.849666
3,0.3291,1.245362,0.855505,0.858227,0.854698,0.854989
4,0.2181,1.247241,0.849771,0.852882,0.848899,0.849171
5,0.1558,1.3089,0.855505,0.858844,0.854614,0.854906
6,0.1196,1.209826,0.853211,0.854652,0.852614,0.85287
7,0.0936,1.284587,0.84289,0.84298,0.84269,0.842781
8,0.0752,1.254084,0.852064,0.852698,0.851657,0.851852
9,0.0632,1.258557,0.855505,0.855481,0.855414,0.855443
10,0.0533,1.298846,0.854358,0.854381,0.854498,0.854348


[I 2025-03-23 14:56:01,600] Trial 140 finished with value: 0.8542732810223925 and parameters: {'learning_rate': 0.0009627780812583816, 'weight_decay': 0.009000000000000001, 'warmup_steps': 30, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 80 with value: 0.8646333249136988.


Trial 141 with params: {'learning_rate': 0.0006528535607526034, 'weight_decay': 0.01, 'warmup_steps': 36, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4525,1.708327,0.815367,0.822071,0.816673,0.814782
2,0.6524,1.330262,0.838303,0.841586,0.837385,0.837609
3,0.4001,1.276714,0.854358,0.854462,0.854161,0.854256
4,0.271,1.212662,0.855505,0.856385,0.855035,0.855258
5,0.1991,1.344231,0.84289,0.84364,0.842437,0.842636


[I 2025-03-23 14:57:39,749] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 0.0002110000609793905, 'weight_decay': 0.01, 'warmup_steps': 30, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.831,1.438315,0.81422,0.816349,0.814957,0.814102
2,1.038,1.414123,0.831422,0.831374,0.831344,0.831358
3,0.7682,1.455862,0.823394,0.827978,0.822283,0.822377
4,0.6161,1.519452,0.831422,0.834932,0.83046,0.830647
5,0.4931,1.481141,0.829128,0.829081,0.829176,0.829101
6,0.4059,1.535593,0.838303,0.83846,0.838059,0.83817
7,0.3474,1.521418,0.833716,0.83494,0.833133,0.833347
8,0.2968,1.605779,0.833716,0.838266,0.834775,0.833416
9,0.263,1.545345,0.83945,0.83996,0.839816,0.839446
10,0.2334,1.607392,0.834862,0.835144,0.835144,0.834862


[I 2025-03-23 15:00:56,847] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 0.00023920652540942373, 'weight_decay': 0.01, 'warmup_steps': 39, 'lambda_param': 0.4, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7971,1.552801,0.803899,0.812206,0.80537,0.803058
2,0.9905,1.379341,0.824541,0.824677,0.824293,0.824397
3,0.7185,1.466744,0.829128,0.833275,0.828082,0.828232
4,0.5556,1.471657,0.819954,0.823003,0.81903,0.819182
5,0.4425,1.50751,0.830275,0.83178,0.829629,0.829842


[I 2025-03-23 15:02:43,616] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 0.004873734927812304, 'weight_decay': 0.002, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1448,1.335188,0.852064,0.854791,0.852877,0.851943
2,0.3945,1.190268,0.857798,0.859986,0.857077,0.857366
3,0.2084,1.140337,0.857798,0.857798,0.857919,0.857786
4,0.1304,1.214848,0.855505,0.856224,0.855077,0.855285
5,0.0883,1.240518,0.862385,0.86298,0.862002,0.8622
6,0.0651,1.216259,0.860092,0.86024,0.859876,0.859986
7,0.0504,1.219407,0.860092,0.860045,0.860045,0.860045
8,0.0412,1.168893,0.863532,0.864061,0.86317,0.863359
9,0.0364,1.197555,0.869266,0.869623,0.868969,0.869131
10,0.0336,1.182144,0.872706,0.872661,0.872769,0.872686


[I 2025-03-23 15:07:12,965] Trial 144 finished with value: 0.8645927095670483 and parameters: {'learning_rate': 0.004873734927812304, 'weight_decay': 0.002, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 6.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 145 with params: {'learning_rate': 0.00030418416992950784, 'weight_decay': 0.01, 'warmup_steps': 24, 'lambda_param': 1.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.683,1.527145,0.81422,0.818389,0.815252,0.813903
2,0.9023,1.376348,0.832569,0.834309,0.831881,0.832102
3,0.6259,1.471037,0.829128,0.834793,0.827913,0.827988
4,0.4659,1.51865,0.832569,0.837702,0.831418,0.831544
5,0.3586,1.556758,0.833716,0.833949,0.833975,0.833715


[I 2025-03-23 15:08:49,086] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.003430404003921716, 'weight_decay': 0.002, 'warmup_steps': 43, 'lambda_param': 0.7000000000000001, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.138,1.22988,0.844037,0.846577,0.844826,0.843918
2,0.3862,1.23999,0.850917,0.852564,0.850278,0.850537
3,0.2073,1.219016,0.849771,0.849808,0.849615,0.849683
4,0.129,1.283708,0.857798,0.858688,0.857329,0.857555
5,0.09,1.285027,0.848624,0.848689,0.848447,0.848527
6,0.0701,1.184869,0.862385,0.86313,0.86196,0.862176
7,0.0526,1.251836,0.850917,0.851054,0.850699,0.850804
8,0.043,1.238672,0.854358,0.855701,0.853783,0.854035
9,0.0368,1.268735,0.848624,0.849648,0.84811,0.848336
10,0.0329,1.216224,0.857798,0.858526,0.857371,0.857582


[I 2025-03-23 15:13:27,816] Trial 146 finished with value: 0.8563928249112336 and parameters: {'learning_rate': 0.003430404003921716, 'weight_decay': 0.002, 'warmup_steps': 43, 'lambda_param': 0.7000000000000001, 'temperature': 6.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 147 with params: {'learning_rate': 0.0031173733644972268, 'weight_decay': 0.003, 'warmup_steps': 43, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1643,1.11264,0.849771,0.850356,0.850162,0.849766
2,0.3972,1.254794,0.850917,0.852345,0.85032,0.850571
3,0.2143,1.214405,0.852064,0.852698,0.851657,0.851852
4,0.1362,1.245088,0.856651,0.856593,0.856666,0.85662
5,0.0918,1.300932,0.840596,0.842064,0.839974,0.840208


[I 2025-03-23 15:15:12,152] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.00041567747760923817, 'weight_decay': 0.007, 'warmup_steps': 36, 'lambda_param': 0.7000000000000001, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5882,1.580865,0.806193,0.81232,0.807453,0.805628
2,0.7887,1.350542,0.831422,0.833764,0.830629,0.830843
3,0.5148,1.453244,0.847477,0.850878,0.846563,0.846823
4,0.3703,1.661969,0.822248,0.834305,0.820483,0.820021
5,0.2815,1.437249,0.84633,0.848175,0.845647,0.845901
6,0.2228,1.325038,0.854358,0.854861,0.853993,0.854173
7,0.183,1.37964,0.856651,0.857454,0.856203,0.85642
8,0.153,1.369805,0.855505,0.85547,0.855582,0.855486
9,0.1282,1.426454,0.852064,0.853753,0.852709,0.852008
10,0.1112,1.45832,0.860092,0.860758,0.860508,0.860085


[I 2025-03-23 15:20:02,475] Trial 148 finished with value: 0.8577075931043558 and parameters: {'learning_rate': 0.00041567747760923817, 'weight_decay': 0.007, 'warmup_steps': 36, 'lambda_param': 0.7000000000000001, 'temperature': 3.5}. Best is trial 80 with value: 0.8646333249136988.


Trial 149 with params: {'learning_rate': 0.002093527576873113, 'weight_decay': 0.001, 'warmup_steps': 30, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1881,1.489,0.834862,0.844677,0.836406,0.834077
2,0.4316,1.068162,0.860092,0.860828,0.859666,0.859879
3,0.234,1.133332,0.854358,0.85532,0.853867,0.854095
4,0.1498,1.166203,0.856651,0.857623,0.856161,0.856393
5,0.1038,1.333348,0.841743,0.841712,0.841648,0.841676
6,0.079,1.195067,0.852064,0.852334,0.851783,0.851922
7,0.0607,1.213841,0.852064,0.852334,0.851783,0.851922
8,0.0495,1.214607,0.857798,0.857848,0.857961,0.857791
9,0.0428,1.260999,0.847477,0.847961,0.84711,0.847284
10,0.0374,1.23428,0.847477,0.847573,0.847278,0.847371


[I 2025-03-23 15:23:21,188] Trial 149 pruned. 


In [32]:
# Show the best run of the search that just finished (the result object held in best_trial2).
print(best_trial2)

BestRun(run_id='80', objective=0.8646333249136988, hyperparameters={'learning_rate': 0.0006819636125122306, 'weight_decay': 0.01, 'warmup_steps': 30, 'lambda_param': 0.9, 'temperature': 3.0}, run_summary=None)


In [33]:
# Convert epoch counts into step counts.
# Assumes one step per batch (no gradient accumulation) — TODO confirm in base.get_training_args.
data_length = len(all_train_data)
batches_per_epoch = data_length / batch_size  # fractional; rounded up wherever it is used
min_r = 5 * math.ceil(batches_per_epoch)           # steps in 5 epochs (pruner min_resource)
max_r = num_epochs * math.ceil(batches_per_epoch)  # steps in the full run (pruner max_resource)
warm_up = math.ceil(batches_per_epoch / 10)        # a tenth of one epoch; upper bound for warmup_steps

In [34]:
# Called again before setting up the next search — presumably re-seeds the RNGs; confirm in base.reset_seed.
base.reset_seed()

In [35]:
# Training arguments for the hyperparameter search on the augmented training set.
# NOTE(review): "~" is passed through literally; confirm that base.get_training_args
# (or whatever consumes these paths) expands it to the home directory.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-embedd_aug_hp-search",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-embedd_aug_hp-search",
    epochs=num_epochs,
    batch_size=batch_size,
)

In [36]:
def hp_space(trial):
    """Draw one hyperparameter configuration from a trial object.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and a
            ``number`` attribute (an Optuna trial when called by the search).

    Returns:
        dict with the keys ``learning_rate`` (log-scaled, 5e-5 to 5e-3),
        ``weight_decay`` (0 to 1e-2 in steps of 1e-3) and ``warmup_steps``
        (integer between 0 and the module-level ``warm_up``).
    """
    params = {}
    params["learning_rate"] = trial.suggest_float("learning_rate", 5e-5, 5e-3, log=True)
    params["weight_decay"] = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    params["warmup_steps"] = trial.suggest_int("warmup_steps", 0, warm_up)
    # Echo the draw so the training table that follows in the cell output is labelled.
    print(f"Trial {trial.number} with params: {params}")
    return params

In [37]:
# Hyperband pruning bounded by the step budgets min_r / max_r; TPE sampling with a fixed seed.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)



In [38]:
# Trainer for the search: trains on all_train_data and evaluates on eval_data.
# The model is supplied through model_init rather than as an instance. The
# zero-argument lambda wrapper is kept on purpose — NOTE(review): passing
# get_BiLSTM directly would only be equivalent if it takes no parameters; confirm.
trainer = Trainer(
    model_init=lambda: get_BiLSTM(),
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)
  

In [None]:
# Run the Optuna study: 150 trials drawn from hp_space, maximising the "eval_f1" metric.
best_trial3 = trainer.hyperparameter_search(
    # what to optimise
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    # how to optimise
    backend="optuna",
    sampler=sampler,
    pruner=pruner,
    study_name="Base-aug-embedd",
    n_trials=150,
)

[I 2025-03-23 15:23:21,625] A new study created in memory with name: Base-aug-embedd


Trial 0 with params: {'learning_rate': 0.0002805758207667253, 'weight_decay': 0.01, 'warmup_steps': 305}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1938,0.532892,0.84633,0.847017,0.8459,0.846096
2,0.0796,0.677365,0.833716,0.833795,0.833891,0.83371
3,0.0509,0.889196,0.826835,0.828421,0.826166,0.826372
4,0.0354,1.003145,0.826835,0.826787,0.826882,0.826807
5,0.0259,1.285178,0.827982,0.827982,0.828092,0.827967


[I 2025-03-23 15:33:46,156] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.0007875660249889869, 'weight_decay': 0.001, 'warmup_steps': 65}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1393,0.625033,0.833716,0.83558,0.833007,0.833231
2,0.0495,0.876779,0.824541,0.824564,0.824672,0.82453
3,0.0282,1.095953,0.816514,0.816603,0.816284,0.816375
4,0.0175,1.220537,0.813073,0.813462,0.812695,0.812837
5,0.0117,1.558773,0.806193,0.806128,0.806191,0.80615
6,0.0082,1.718021,0.805046,0.805466,0.804643,0.804783
7,0.006,1.945276,0.81078,0.81133,0.811158,0.810774
8,0.0045,2.015203,0.81078,0.81076,0.810863,0.81076
9,0.0035,2.270739,0.81422,0.814494,0.814494,0.81422
10,0.0027,2.52092,0.821101,0.821212,0.821293,0.821097


[I 2025-03-23 15:54:50,074] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 6.533369619026643e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 251}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2902,0.446408,0.818807,0.820233,0.818157,0.818345
2,0.162,0.519151,0.827982,0.827944,0.827882,0.827908
3,0.1201,0.705791,0.807339,0.813535,0.806012,0.805865
4,0.0981,0.751174,0.825688,0.828375,0.82483,0.825017
5,0.0829,0.766297,0.837156,0.8371,0.837185,0.837125


[I 2025-03-23 16:06:47,427] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0013035123791853842, 'weight_decay': 0.0, 'warmup_steps': 405}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1397,0.576641,0.841743,0.842414,0.841311,0.841502
2,0.0456,0.84408,0.825688,0.8258,0.825882,0.825684
3,0.0255,1.240026,0.824541,0.827387,0.825387,0.824373
4,0.016,1.084816,0.813073,0.813269,0.812779,0.812894
5,0.0107,1.487125,0.811927,0.812163,0.811611,0.811732
6,0.0078,1.612212,0.816514,0.816759,0.816199,0.816324
7,0.0057,1.721354,0.81078,0.811872,0.81019,0.810361
8,0.0042,2.292768,0.811927,0.81191,0.811779,0.811828
9,0.0033,2.533651,0.815367,0.815799,0.815705,0.815365
10,0.0029,2.656094,0.808486,0.810032,0.809116,0.808413


[I 2025-03-23 16:30:05,313] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.002311294500510415, 'weight_decay': 0.002, 'warmup_steps': 76}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1169,0.705251,0.83945,0.83945,0.839564,0.839436
2,0.0404,0.943708,0.818807,0.81875,0.818831,0.818773
3,0.0239,1.148195,0.815367,0.815799,0.815705,0.815365
4,0.0162,1.208786,0.808486,0.808422,0.808485,0.808444
5,0.0118,1.915259,0.801606,0.80258,0.802107,0.801574


[I 2025-03-23 16:41:40,777] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.00011635338541918901, 'weight_decay': 0.003, 'warmup_steps': 219}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2448,0.488077,0.831422,0.83283,0.830797,0.831011
2,0.1203,0.553635,0.833716,0.833654,0.833681,0.833666
3,0.0869,0.680367,0.829128,0.829354,0.828839,0.828964
4,0.068,0.80814,0.825688,0.826055,0.825335,0.825482
5,0.0546,0.833398,0.822248,0.823838,0.822882,0.82218
6,0.0447,0.983309,0.827982,0.830421,0.827166,0.827368
7,0.0374,0.976903,0.836009,0.83659,0.835596,0.835774
8,0.0317,1.119787,0.817661,0.817721,0.817452,0.817534
9,0.0271,1.290712,0.818807,0.819536,0.818325,0.818498
10,0.0233,1.397105,0.819954,0.819889,0.819915,0.819901


[I 2025-03-23 17:05:25,027] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.0003654769917956456, 'weight_decay': 0.003, 'warmup_steps': 255}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1774,0.591202,0.83945,0.842592,0.838554,0.838785
2,0.0688,0.774408,0.834862,0.834975,0.835059,0.834859
3,0.0426,0.970351,0.829128,0.829562,0.828755,0.828912
4,0.0289,1.097255,0.826835,0.827262,0.826461,0.826616
5,0.0205,1.348431,0.817661,0.817641,0.817746,0.817641


[I 2025-03-23 17:17:03,872] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 9.505122659935192e-05, 'weight_decay': 0.003, 'warmup_steps': 153}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2565,0.480656,0.826835,0.828213,0.826208,0.826413
2,0.1322,0.55466,0.830275,0.830383,0.83005,0.830147
3,0.0967,0.66434,0.823394,0.824469,0.82283,0.823022
4,0.0772,0.766917,0.825688,0.826445,0.825208,0.82539
5,0.0636,0.824611,0.824541,0.825394,0.825008,0.824523
6,0.0536,0.936784,0.822248,0.822783,0.82183,0.821993
7,0.0458,0.892475,0.832569,0.832954,0.832218,0.83237
8,0.0395,1.006066,0.819954,0.819926,0.81983,0.819869
9,0.0344,1.170595,0.821101,0.82157,0.820704,0.82086
10,0.0304,1.191946,0.823394,0.823387,0.823251,0.823302


[I 2025-03-23 17:51:39,553] Trial 7 finished with value: 0.8232606132075472 and parameters: {'learning_rate': 9.505122659935192e-05, 'weight_decay': 0.003, 'warmup_steps': 153}. Best is trial 7 with value: 0.8232606132075472.


Trial 8 with params: {'learning_rate': 0.00040842279473800845, 'weight_decay': 0.008, 'warmup_steps': 83}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1656,0.604656,0.840596,0.8436,0.839722,0.83996
2,0.0652,0.811736,0.823394,0.823338,0.823419,0.823361
3,0.0398,1.029987,0.825688,0.825729,0.825503,0.825577
4,0.0267,1.157562,0.825688,0.825953,0.825377,0.825508
5,0.0185,1.424787,0.823394,0.824884,0.824009,0.823335
6,0.0132,1.773107,0.815367,0.815799,0.815705,0.815365
7,0.0096,1.745192,0.819954,0.819901,0.819872,0.819886
8,0.007,2.140489,0.833716,0.83395,0.833428,0.833556
9,0.0053,2.368503,0.830275,0.830218,0.830218,0.830218
10,0.0041,2.440037,0.827982,0.828087,0.827755,0.827851


[I 2025-03-23 18:26:12,517] Trial 8 finished with value: 0.830202878720101 and parameters: {'learning_rate': 0.00040842279473800845, 'weight_decay': 0.008, 'warmup_steps': 83}. Best is trial 8 with value: 0.830202878720101.


Trial 9 with params: {'learning_rate': 0.0005338741354740678, 'weight_decay': 0.006, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1519,0.61067,0.834862,0.837107,0.834091,0.834318
2,0.0572,0.779876,0.831422,0.831568,0.831176,0.831283
3,0.034,1.058796,0.81078,0.810746,0.810653,0.81069
4,0.0218,1.336424,0.81422,0.814185,0.814284,0.814196
5,0.015,1.492376,0.813073,0.81376,0.813495,0.813061


[I 2025-03-23 18:37:47,084] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 6.888788881730778e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2788,0.451349,0.816514,0.818352,0.815778,0.815956
2,0.158,0.531829,0.829128,0.829081,0.829176,0.829101
3,0.1168,0.661901,0.817661,0.820124,0.81682,0.816984
4,0.0954,0.763401,0.822248,0.824499,0.821451,0.821638
5,0.0803,0.779869,0.829128,0.829151,0.82926,0.829117
6,0.07,0.874211,0.827982,0.827978,0.82784,0.827891
7,0.0616,0.847624,0.821101,0.82157,0.820704,0.82086
8,0.055,0.905888,0.818807,0.81875,0.818831,0.818773
9,0.0492,1.04251,0.830275,0.830219,0.830302,0.830243
10,0.0448,1.066584,0.823394,0.823387,0.823251,0.823302


[I 2025-03-23 19:01:07,869] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 8.238154754398708e-05, 'weight_decay': 0.003, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2639,0.460298,0.821101,0.823216,0.820325,0.820511
2,0.1433,0.538334,0.825688,0.825653,0.825756,0.825665
3,0.1053,0.656193,0.832569,0.834089,0.831923,0.832141
4,0.0854,0.768791,0.825688,0.827847,0.824914,0.825113
5,0.0713,0.806994,0.831422,0.831865,0.831765,0.83142
6,0.0611,0.903879,0.821101,0.82127,0.82083,0.820942
7,0.0529,0.867581,0.825688,0.826301,0.82525,0.825423
8,0.0463,0.970882,0.817661,0.817669,0.817494,0.817555
9,0.0408,1.11087,0.826835,0.826785,0.826755,0.826769
10,0.0366,1.131816,0.821101,0.821197,0.820872,0.820965


[I 2025-03-23 19:24:04,676] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.0004229895735463087, 'weight_decay': 0.009000000000000001, 'warmup_steps': 123}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1652,0.625832,0.833716,0.83694,0.832797,0.833002
2,0.0636,0.830278,0.824541,0.82449,0.824461,0.824475
3,0.0389,1.012732,0.830275,0.830321,0.830092,0.830167
4,0.026,1.181482,0.833716,0.834289,0.833302,0.833477
5,0.018,1.422402,0.819954,0.821989,0.820672,0.81985


[I 2025-03-23 19:35:41,250] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.002704032693225816, 'weight_decay': 0.008, 'warmup_steps': 159}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.119,0.639544,0.832569,0.833884,0.831965,0.832179
2,0.042,0.840788,0.827982,0.827982,0.828092,0.827967
3,0.0257,0.940402,0.823394,0.823354,0.823293,0.823319
4,0.0182,1.094368,0.81422,0.814669,0.813821,0.81397
5,0.014,1.669737,0.822248,0.822314,0.82204,0.822124
6,0.0108,1.796656,0.81422,0.814793,0.813779,0.813937
7,0.0085,1.934825,0.81422,0.814175,0.814116,0.814141
8,0.0065,2.022441,0.818807,0.8189,0.818578,0.81867
9,0.0051,2.372221,0.805046,0.80498,0.80498,0.80498
10,0.0042,2.466074,0.818807,0.819083,0.819083,0.818807


[I 2025-03-23 19:59:12,995] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 5.716528877895461e-05, 'weight_decay': 0.0, 'warmup_steps': 131}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2957,0.438466,0.816514,0.817233,0.816031,0.8162
2,0.1726,0.514258,0.826835,0.826787,0.826882,0.826807
3,0.1293,0.685509,0.806193,0.812569,0.804843,0.80467
4,0.1062,0.720717,0.825688,0.828104,0.824872,0.825066
5,0.0903,0.760848,0.831422,0.831375,0.83147,0.831395
6,0.0793,0.843623,0.825688,0.825682,0.825545,0.825596
7,0.0709,0.832443,0.829128,0.829066,0.829092,0.829078
8,0.0641,0.880254,0.824541,0.824478,0.824503,0.824489
9,0.0582,0.990884,0.830275,0.830218,0.830218,0.830218
10,0.0537,1.013502,0.823394,0.823329,0.823377,0.823349


[I 2025-03-23 20:22:42,195] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.00012116148911900525, 'weight_decay': 0.006, 'warmup_steps': 166}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2389,0.490783,0.836009,0.837247,0.835428,0.835646
2,0.1173,0.554726,0.831422,0.831402,0.831302,0.831342
3,0.0845,0.679839,0.829128,0.829451,0.828797,0.828939
4,0.0658,0.828807,0.825688,0.826171,0.825293,0.825453
5,0.0524,0.850292,0.822248,0.824059,0.822925,0.822163
6,0.0427,1.022555,0.829128,0.831997,0.82825,0.828446
7,0.0357,1.012419,0.834862,0.835147,0.834554,0.834692
8,0.0302,1.177328,0.81422,0.815081,0.813695,0.813867
9,0.0256,1.346951,0.817661,0.818619,0.817115,0.817295
10,0.0219,1.478126,0.821101,0.821101,0.821209,0.821086


[I 2025-03-23 20:58:30,452] Trial 15 finished with value: 0.8198496466182332 and parameters: {'learning_rate': 0.00012116148911900525, 'weight_decay': 0.006, 'warmup_steps': 166}. Best is trial 8 with value: 0.830202878720101.


Trial 16 with params: {'learning_rate': 0.0003247175832033686, 'weight_decay': 0.004, 'warmup_steps': 149}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1792,0.557204,0.841743,0.843553,0.841058,0.841301
2,0.0732,0.739775,0.833716,0.834286,0.834102,0.83371
3,0.046,0.927061,0.830275,0.830905,0.829839,0.830017
4,0.0315,1.048225,0.830275,0.830218,0.830218,0.830218
5,0.0227,1.375923,0.823394,0.823775,0.823714,0.823394


[I 2025-03-23 21:10:15,895] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.0020085822314002493, 'weight_decay': 0.008, 'warmup_steps': 337}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1294,0.592288,0.838303,0.838258,0.838227,0.838241
2,0.0425,0.869878,0.813073,0.813007,0.813031,0.813018
3,0.0246,1.097387,0.817661,0.818502,0.818125,0.817641
4,0.0165,1.072417,0.821101,0.822753,0.820409,0.820602
5,0.0115,1.559181,0.818807,0.81875,0.818831,0.818773
6,0.0088,1.561853,0.797018,0.799183,0.797771,0.796877
7,0.0066,1.59801,0.817661,0.817641,0.817746,0.817641
8,0.0051,1.885943,0.81422,0.814331,0.81441,0.814216
9,0.0039,2.221262,0.805046,0.805794,0.805485,0.805029
10,0.003,2.363179,0.817661,0.819032,0.818252,0.817607


[I 2025-03-23 21:33:58,768] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.0001044907148504563, 'weight_decay': 0.006, 'warmup_steps': 382}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2602,0.480087,0.826835,0.82802,0.82625,0.826451
2,0.1278,0.550554,0.830275,0.83046,0.830008,0.830124
3,0.0923,0.674142,0.827982,0.828908,0.827461,0.827654
4,0.073,0.787306,0.823394,0.823756,0.82304,0.823185
5,0.0594,0.823388,0.821101,0.8222,0.82163,0.821067
6,0.0494,0.961669,0.830275,0.831387,0.829713,0.829917
7,0.0419,0.913803,0.833716,0.834289,0.833302,0.833477
8,0.0357,1.048182,0.815367,0.815489,0.815115,0.815215
9,0.0308,1.204362,0.819954,0.820482,0.819536,0.819696
10,0.0269,1.287124,0.816514,0.816759,0.816199,0.816324


[I 2025-03-23 21:57:17,472] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 5.157191818813809e-05, 'weight_decay': 0.001, 'warmup_steps': 285}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.309,0.432283,0.81422,0.815245,0.813652,0.813828
2,0.1807,0.493618,0.825688,0.825629,0.825629,0.825629
3,0.1364,0.657678,0.807339,0.813535,0.806012,0.805865
4,0.1123,0.678317,0.821101,0.823216,0.820325,0.820511
5,0.0959,0.732648,0.831422,0.83136,0.831386,0.831372
6,0.0846,0.785075,0.832569,0.832512,0.832512,0.832512
7,0.0759,0.794442,0.826835,0.826787,0.826882,0.826807
8,0.0687,0.833642,0.824541,0.824516,0.824419,0.824458
9,0.063,0.944787,0.829128,0.829067,0.829134,0.82909
10,0.0583,0.962661,0.822248,0.822221,0.822125,0.822163


[I 2025-03-23 22:20:40,933] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 0.00113734347773058, 'weight_decay': 0.009000000000000001, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1273,0.644004,0.841743,0.841947,0.841479,0.841602
2,0.0445,0.878118,0.818807,0.818772,0.818873,0.818784
3,0.0252,1.034318,0.821101,0.821035,0.821083,0.821055
4,0.0158,1.230218,0.815367,0.815348,0.815452,0.815347
5,0.0107,1.375545,0.816514,0.816456,0.816536,0.816479
6,0.0077,1.620941,0.811927,0.81201,0.811695,0.811784
7,0.0055,1.924594,0.816514,0.816563,0.816662,0.816505
8,0.0041,2.455092,0.806193,0.806215,0.806317,0.80618
9,0.0035,2.159152,0.807339,0.807611,0.807611,0.807339
10,0.0028,2.379157,0.809633,0.809658,0.809443,0.809512


[I 2025-03-23 22:44:12,282] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 8.729841446711565e-05, 'weight_decay': 0.006, 'warmup_steps': 176}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2642,0.478343,0.819954,0.821938,0.819199,0.819384
2,0.1385,0.550975,0.832569,0.832568,0.832428,0.832481
3,0.1015,0.666523,0.825688,0.827375,0.824998,0.825202
4,0.0818,0.779664,0.822248,0.824499,0.821451,0.821638
5,0.0679,0.821558,0.826835,0.827538,0.827261,0.826824


[I 2025-03-23 22:56:01,204] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.0002784633305891539, 'weight_decay': 0.006, 'warmup_steps': 226}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1904,0.534184,0.844037,0.84487,0.843563,0.84377
2,0.0793,0.694027,0.831422,0.831501,0.831597,0.831416
3,0.0508,0.896533,0.819954,0.821491,0.819283,0.819473
4,0.0354,1.024335,0.826835,0.826772,0.826798,0.826784
5,0.026,1.285921,0.829128,0.82957,0.829471,0.829126
6,0.0192,1.366891,0.818807,0.819058,0.818494,0.81862
7,0.0147,1.506285,0.813073,0.813462,0.812695,0.812837
8,0.0111,2.137576,0.81078,0.811277,0.810358,0.810508
9,0.0084,2.219142,0.819954,0.819965,0.819788,0.81985
10,0.0065,2.724195,0.811927,0.813508,0.811232,0.811402


[I 2025-03-23 23:19:33,190] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.0001522985605867988, 'weight_decay': 0.004, 'warmup_steps': 127}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2219,0.506062,0.845183,0.845543,0.844858,0.845012
2,0.105,0.584195,0.844037,0.843981,0.844068,0.844007
3,0.0738,0.723518,0.831422,0.831862,0.83105,0.831209
4,0.0555,0.894593,0.832569,0.83268,0.832344,0.832442
5,0.043,0.910894,0.825688,0.826797,0.826219,0.825655


[I 2025-03-23 23:31:12,199] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.00036388258672534674, 'weight_decay': 0.01, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.168,0.583806,0.840596,0.8436,0.839722,0.83996
2,0.0674,0.765539,0.833716,0.833654,0.833681,0.833666
3,0.0418,0.974824,0.830275,0.830772,0.829881,0.830046
4,0.0283,1.04788,0.833716,0.833668,0.833639,0.833652
5,0.0198,1.388367,0.817661,0.818353,0.818083,0.817649
6,0.0145,1.744979,0.829128,0.829451,0.828797,0.828939
7,0.0108,1.899042,0.818807,0.81927,0.81841,0.818563
8,0.008,2.137479,0.829128,0.830522,0.828503,0.828712
9,0.006,2.373675,0.829128,0.829202,0.828924,0.829009


[I 2025-03-24 00:06:15,794] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.000140707263625762, 'weight_decay': 0.006, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2258,0.490422,0.84289,0.842921,0.842732,0.842799
2,0.1111,0.566349,0.838303,0.838287,0.838185,0.838226
3,0.0787,0.705918,0.832569,0.832954,0.832218,0.83237
4,0.0599,0.851477,0.830275,0.83055,0.829965,0.8301
5,0.0466,0.911573,0.826835,0.828038,0.827387,0.826796
6,0.0371,1.0719,0.821101,0.823216,0.820325,0.820511
7,0.0307,1.096758,0.825688,0.826171,0.825293,0.825453
8,0.0254,1.280834,0.818807,0.819536,0.818325,0.818498
9,0.0212,1.47599,0.821101,0.821994,0.820578,0.820761
10,0.0177,1.628978,0.818807,0.819186,0.819125,0.818806


[I 2025-03-24 00:29:33,169] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.00021059103361382344, 'weight_decay': 0.001, 'warmup_steps': 406}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2139,0.5229,0.848624,0.849835,0.848068,0.848305
2,0.0917,0.628635,0.838303,0.838636,0.838606,0.838303
3,0.0613,0.815624,0.815367,0.81687,0.814694,0.814874
4,0.0441,0.938377,0.827982,0.827925,0.828008,0.827949
5,0.0333,1.085251,0.819954,0.821141,0.820504,0.819914


[I 2025-03-24 00:41:03,445] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 6.271883608260356e-05, 'weight_decay': 0.004, 'warmup_steps': 149}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2888,0.448217,0.815367,0.817551,0.814568,0.814733
2,0.165,0.51597,0.831422,0.831375,0.83147,0.831395
3,0.1224,0.678909,0.808486,0.813336,0.807306,0.80728
4,0.1002,0.752852,0.823394,0.827279,0.822367,0.822497
5,0.0849,0.773322,0.834862,0.8348,0.834849,0.83482
6,0.0745,0.870057,0.824541,0.82461,0.824335,0.824419
7,0.0661,0.841452,0.827982,0.827944,0.827882,0.827908
8,0.0594,0.893138,0.830275,0.830239,0.830176,0.830203
9,0.0535,1.019229,0.827982,0.827924,0.827924,0.827924
10,0.049,1.031103,0.821101,0.821035,0.821083,0.821055


[I 2025-03-24 01:04:23,505] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 0.0004071960384346933, 'weight_decay': 0.007, 'warmup_steps': 169}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1691,0.599681,0.833716,0.83694,0.832797,0.833002
2,0.0655,0.799506,0.823394,0.823354,0.823293,0.823319
3,0.0403,0.965982,0.819954,0.820162,0.819662,0.819781
4,0.0269,1.095288,0.827982,0.829081,0.827419,0.827619
5,0.0188,1.408382,0.826835,0.826983,0.82705,0.826833
6,0.0134,1.608658,0.816514,0.816625,0.816705,0.81651
7,0.0099,1.666494,0.831422,0.83136,0.831428,0.831385
8,0.0073,2.120574,0.832569,0.832533,0.83247,0.832497
9,0.0055,2.374269,0.827982,0.827925,0.828008,0.827949
10,0.0042,2.80576,0.81422,0.814793,0.813779,0.813937


[I 2025-03-24 01:28:03,378] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.00016132293904726119, 'weight_decay': 0.008, 'warmup_steps': 173}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2205,0.506807,0.848624,0.848937,0.84832,0.848468
2,0.1029,0.585796,0.838303,0.838287,0.838185,0.838226
3,0.0717,0.73907,0.831422,0.831987,0.831007,0.83118
4,0.0536,0.911407,0.833716,0.83395,0.833428,0.833556
5,0.0413,0.911892,0.825688,0.827184,0.826303,0.825629
6,0.0326,1.145243,0.823394,0.826951,0.822409,0.822554
7,0.0265,1.142555,0.832569,0.833207,0.832134,0.832314
8,0.0214,1.323343,0.827982,0.828472,0.827587,0.82775
9,0.0175,1.537699,0.817661,0.817864,0.817368,0.817486
10,0.0143,1.871142,0.823394,0.823335,0.823335,0.823335


[I 2025-03-24 02:03:32,438] Trial 30 finished with value: 0.8324621660744835 and parameters: {'learning_rate': 0.00016132293904726119, 'weight_decay': 0.008, 'warmup_steps': 173}. Best is trial 30 with value: 0.8324621660744835.


Trial 31 with params: {'learning_rate': 0.00014213612862715435, 'weight_decay': 0.008, 'warmup_steps': 169}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2284,0.502084,0.841743,0.842414,0.841311,0.841502
2,0.1094,0.574283,0.840596,0.840552,0.840522,0.840536
3,0.0777,0.713501,0.824541,0.824962,0.824166,0.824319
4,0.0592,0.869813,0.829128,0.829451,0.828797,0.828939
5,0.0463,0.881545,0.824541,0.825932,0.825135,0.824489


[I 2025-03-24 02:15:22,825] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.0003451585430552036, 'weight_decay': 0.009000000000000001, 'warmup_steps': 247}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1803,0.575952,0.844037,0.846931,0.843184,0.843436
2,0.0713,0.759583,0.840596,0.841045,0.840943,0.840594
3,0.0445,0.95042,0.829128,0.829686,0.828713,0.828883
4,0.0304,1.151111,0.817661,0.817631,0.817536,0.817574
5,0.0217,1.313318,0.817661,0.817739,0.817831,0.817655
6,0.0157,1.510051,0.809633,0.810068,0.809232,0.809376
7,0.0117,1.694629,0.813073,0.814363,0.812442,0.812618
8,0.0088,2.080169,0.815367,0.815879,0.814947,0.815102
9,0.0064,2.553483,0.831422,0.831499,0.831218,0.831305
10,0.0051,2.897781,0.813073,0.81456,0.8124,0.812574


[I 2025-03-24 02:38:31,010] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 0.00023852726625205314, 'weight_decay': 0.007, 'warmup_steps': 84}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1934,0.527036,0.841743,0.842275,0.841353,0.84153
2,0.0849,0.656847,0.84289,0.843612,0.843321,0.84288
3,0.0554,0.865378,0.823394,0.825064,0.822704,0.822902
4,0.0391,0.954561,0.833716,0.833668,0.833765,0.833689
5,0.0291,1.204281,0.815367,0.817151,0.816042,0.815279


[I 2025-03-24 02:50:20,591] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.00011899395188704171, 'weight_decay': 0.009000000000000001, 'warmup_steps': 168}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2402,0.48663,0.834862,0.835999,0.834302,0.834514
2,0.1184,0.55235,0.832569,0.832533,0.83247,0.832497
3,0.0852,0.682616,0.829128,0.829354,0.828839,0.828964
4,0.0665,0.83011,0.827982,0.828252,0.827671,0.827804
5,0.0532,0.842554,0.823394,0.825567,0.824135,0.823282
6,0.0434,1.009998,0.826835,0.829677,0.825956,0.826143
7,0.0364,0.99221,0.837156,0.837674,0.836764,0.836936
8,0.0308,1.145565,0.816514,0.816603,0.816284,0.816375
9,0.0262,1.324777,0.816514,0.817551,0.815947,0.816127
10,0.0225,1.418374,0.822248,0.822185,0.822251,0.822208


[I 2025-03-24 03:25:42,954] Trial 34 finished with value: 0.816417187730268 and parameters: {'learning_rate': 0.00011899395188704171, 'weight_decay': 0.009000000000000001, 'warmup_steps': 168}. Best is trial 30 with value: 0.8324621660744835.


Trial 35 with params: {'learning_rate': 9.026605403962055e-05, 'weight_decay': 0.007, 'warmup_steps': 236}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2644,0.468393,0.821101,0.822978,0.820367,0.820557
2,0.1368,0.545384,0.834862,0.834977,0.834638,0.834737
3,0.0997,0.662859,0.831422,0.833268,0.830713,0.830931
4,0.0799,0.769742,0.825688,0.827161,0.82504,0.825243
5,0.066,0.815408,0.830275,0.830778,0.830639,0.830272
6,0.0558,0.928454,0.822248,0.822554,0.821914,0.822051
7,0.0479,0.875049,0.826835,0.827385,0.826419,0.826587
8,0.0414,0.981058,0.817661,0.817631,0.817536,0.817574
9,0.0362,1.12676,0.824541,0.824677,0.824293,0.824397
10,0.0321,1.140277,0.826835,0.826974,0.826587,0.826692


[I 2025-03-24 03:59:00,234] Trial 35 finished with value: 0.8255329679297836 and parameters: {'learning_rate': 9.026605403962055e-05, 'weight_decay': 0.007, 'warmup_steps': 236}. Best is trial 30 with value: 0.8324621660744835.


Trial 36 with params: {'learning_rate': 0.002248224121235652, 'weight_decay': 0.004, 'warmup_steps': 209}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1229,0.595885,0.837156,0.837445,0.836849,0.836988
2,0.0417,0.887936,0.817661,0.819624,0.816905,0.817083
3,0.0247,1.136688,0.818807,0.818918,0.818999,0.818804
4,0.0167,1.448269,0.813073,0.81376,0.813495,0.813061
5,0.0122,1.780285,0.809633,0.809569,0.809569,0.809569
6,0.0092,1.866782,0.799312,0.800177,0.79876,0.79891
7,0.0069,1.598224,0.792431,0.792587,0.79213,0.792232
8,0.0054,1.82374,0.808486,0.808598,0.808232,0.808329
9,0.0043,2.403934,0.811927,0.811954,0.811737,0.811807
10,0.0035,2.183799,0.802752,0.803284,0.802307,0.802452


[I 2025-03-24 04:12:53,139] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 5.043982660362456e-05, 'weight_decay': 0.006, 'warmup_steps': 299}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3114,0.43029,0.819954,0.820616,0.819494,0.819664
2,0.1829,0.495031,0.824541,0.82449,0.824461,0.824475
3,0.1381,0.646658,0.809633,0.814692,0.808432,0.808398
4,0.1138,0.675621,0.821101,0.823469,0.820283,0.820462
5,0.0972,0.729103,0.836009,0.835962,0.836059,0.835983
6,0.0857,0.774454,0.833716,0.833654,0.833681,0.833666
7,0.0771,0.794163,0.829128,0.829109,0.829218,0.82911
8,0.0699,0.82937,0.822248,0.822221,0.822125,0.822163
9,0.0642,0.936302,0.830275,0.830219,0.830302,0.830243
10,0.0594,0.956504,0.823394,0.823335,0.823335,0.823335


[I 2025-03-24 04:33:39,865] Trial 37 finished with value: 0.8118464331763282 and parameters: {'learning_rate': 5.043982660362456e-05, 'weight_decay': 0.006, 'warmup_steps': 299}. Best is trial 30 with value: 0.8324621660744835.


Trial 38 with params: {'learning_rate': 0.00014261859903102946, 'weight_decay': 0.009000000000000001, 'warmup_steps': 235}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2308,0.493972,0.844037,0.844245,0.843774,0.843898
2,0.1092,0.568577,0.837156,0.837159,0.837017,0.83707
3,0.0773,0.708852,0.826835,0.827262,0.826461,0.826616
4,0.0588,0.871368,0.824541,0.824962,0.824166,0.824319
5,0.0458,0.88958,0.826835,0.828441,0.827471,0.826769
6,0.0366,1.078336,0.821101,0.825289,0.82003,0.820132
7,0.0301,1.105545,0.823394,0.823871,0.822998,0.823156
8,0.0249,1.316321,0.817661,0.818619,0.817115,0.817295
9,0.0207,1.414976,0.813073,0.813708,0.812611,0.812772
10,0.0172,1.669215,0.816514,0.816625,0.816705,0.81651


[I 2025-03-24 04:47:06,306] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.00015227206045384653, 'weight_decay': 0.002, 'warmup_steps': 155}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2231,0.501331,0.852064,0.852165,0.851867,0.851961
2,0.1056,0.579163,0.836009,0.835948,0.835975,0.835961
3,0.0743,0.716354,0.826835,0.827262,0.826461,0.826616
4,0.056,0.891772,0.829128,0.829451,0.828797,0.828939
5,0.0435,0.884965,0.825688,0.826984,0.826261,0.825643
6,0.0345,1.094424,0.824541,0.827955,0.823577,0.823734
7,0.0283,1.126996,0.831422,0.831987,0.831007,0.83118
8,0.0231,1.294874,0.823394,0.823871,0.822998,0.823156
9,0.0191,1.482165,0.819954,0.820083,0.819704,0.819806
10,0.0157,1.760379,0.825688,0.825623,0.825671,0.825643


[I 2025-03-24 05:07:18,547] Trial 39 finished with value: 0.8198686092394573 and parameters: {'learning_rate': 0.00015227206045384653, 'weight_decay': 0.002, 'warmup_steps': 155}. Best is trial 30 with value: 0.8324621660744835.


Trial 40 with params: {'learning_rate': 0.0002081476747934512, 'weight_decay': 0.006, 'warmup_steps': 295}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2092,0.521474,0.838303,0.839973,0.837638,0.837871
2,0.091,0.642473,0.834862,0.835369,0.835228,0.834859
3,0.0607,0.823797,0.822248,0.823801,0.821577,0.821773
4,0.0436,0.961006,0.831422,0.831568,0.831176,0.831283
5,0.0329,1.112988,0.822248,0.823838,0.822882,0.82218


[I 2025-03-24 05:14:03,339] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.00019407876163222518, 'weight_decay': 0.002, 'warmup_steps': 133}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2062,0.525867,0.83945,0.840112,0.839017,0.839205
2,0.0935,0.628959,0.845183,0.84542,0.845447,0.845183
3,0.0633,0.810028,0.822248,0.823067,0.821746,0.821927
4,0.046,0.956452,0.831422,0.831987,0.831007,0.83118
5,0.035,1.013592,0.822248,0.822946,0.822672,0.822236
6,0.027,1.314048,0.822248,0.824252,0.821493,0.821685
7,0.0214,1.274046,0.834862,0.835508,0.834428,0.834611
8,0.0169,1.51996,0.824541,0.824758,0.824251,0.824373
9,0.0135,1.683172,0.819954,0.819926,0.81983,0.819869
10,0.0106,2.293107,0.817661,0.817607,0.817578,0.817591


[I 2025-03-24 05:27:48,016] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.0001971985349566944, 'weight_decay': 0.0, 'warmup_steps': 163}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2068,0.524034,0.845183,0.845794,0.844774,0.844961
2,0.093,0.641262,0.845183,0.84552,0.845489,0.845183
3,0.0627,0.812953,0.825688,0.826603,0.825166,0.825356
4,0.0455,0.940881,0.836009,0.836163,0.835765,0.835874
5,0.0345,1.03613,0.832569,0.832954,0.832891,0.832568
6,0.0266,1.29951,0.819954,0.821938,0.819199,0.819384
7,0.0211,1.295558,0.827982,0.828025,0.827797,0.827872
8,0.0166,1.556869,0.832569,0.832568,0.832428,0.832481
9,0.0131,1.78303,0.822248,0.82246,0.821956,0.822077
10,0.0103,2.158561,0.817661,0.817612,0.817704,0.817632


[I 2025-03-24 05:41:48,734] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.0008969164072375918, 'weight_decay': 0.007, 'warmup_steps': 95}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1364,0.63319,0.836009,0.837247,0.835428,0.835646
2,0.0475,0.883958,0.819954,0.820162,0.819662,0.819781
3,0.027,1.080976,0.813073,0.813007,0.813031,0.813018
4,0.0169,1.179409,0.817661,0.817595,0.81762,0.817607
5,0.0113,1.503181,0.811927,0.812037,0.812116,0.811923
6,0.0079,1.752423,0.819954,0.820362,0.819578,0.819726
7,0.0057,1.91578,0.81422,0.814206,0.814073,0.814122
8,0.0043,2.57846,0.813073,0.813908,0.813537,0.813053
9,0.0034,2.578597,0.81078,0.810715,0.810779,0.810738
10,0.0027,2.89502,0.806193,0.806513,0.806485,0.806192


[I 2025-03-24 05:55:37,585] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.00011820049960409872, 'weight_decay': 0.002, 'warmup_steps': 159}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2404,0.491061,0.833716,0.834756,0.833175,0.833382
2,0.1189,0.555323,0.830275,0.830218,0.830218,0.830218
3,0.0857,0.686738,0.829128,0.829451,0.828797,0.828939
4,0.0669,0.82707,0.823394,0.823871,0.822998,0.823156
5,0.0535,0.852478,0.822248,0.824059,0.822925,0.822163


[I 2025-03-24 06:02:30,474] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 6.607759156809839e-05, 'weight_decay': 0.003, 'warmup_steps': 221}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.288,0.448467,0.818807,0.820442,0.818115,0.818302
2,0.161,0.528543,0.831422,0.831375,0.83147,0.831395
3,0.1195,0.700445,0.808486,0.8141,0.807222,0.807135
4,0.0976,0.75183,0.825688,0.827604,0.824956,0.825158
5,0.0825,0.778323,0.836009,0.83599,0.836101,0.835992
6,0.0722,0.873031,0.826835,0.826785,0.826755,0.826769
7,0.0637,0.836389,0.823394,0.823567,0.823125,0.823237
8,0.0569,0.898309,0.821101,0.82127,0.82083,0.820942
9,0.0511,1.023424,0.829128,0.829066,0.829092,0.829078
10,0.0466,1.044392,0.821101,0.821137,0.820914,0.820987


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-24 06:23:05,862] Trial 45 finished with value: 0.8118276174235247 and parameters: {'learning_rate': 6.607759156809839e-05, 'weight_decay': 0.003, 'warmup_steps': 221}. Best is trial 30 with value: 0.8324621660744835.


Trial 46 with params: {'learning_rate': 6.55815398076603e-05, 'weight_decay': 0.01, 'warmup_steps': 319}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2931,0.440101,0.818807,0.819536,0.818325,0.818498
2,0.1619,0.522797,0.829128,0.829081,0.829176,0.829101
3,0.1199,0.6555,0.816514,0.819375,0.81561,0.815754
4,0.0979,0.75814,0.823394,0.826638,0.822451,0.82261
5,0.0827,0.772683,0.838303,0.838326,0.838438,0.838292
6,0.0723,0.864989,0.822248,0.822261,0.822083,0.822145
7,0.0637,0.836964,0.826835,0.827056,0.826545,0.826669
8,0.057,0.888708,0.821101,0.821066,0.821167,0.821077
9,0.0513,1.026799,0.826835,0.826772,0.826798,0.826784
10,0.0467,1.050285,0.821101,0.821041,0.821041,0.821041


[I 2025-03-24 06:36:36,876] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.00010793900050683744, 'weight_decay': 0.006, 'warmup_steps': 248}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2515,0.482482,0.827982,0.829081,0.827419,0.827619
2,0.1252,0.54776,0.83945,0.83957,0.839227,0.839328
3,0.0906,0.675074,0.826835,0.827385,0.826419,0.826587
4,0.0714,0.786029,0.823394,0.824,0.822956,0.823126
5,0.0579,0.81597,0.825688,0.827184,0.826303,0.825629
6,0.048,0.929181,0.832569,0.834543,0.831839,0.83206
7,0.0405,0.918752,0.837156,0.837552,0.836806,0.836963
8,0.0345,1.062482,0.816514,0.816501,0.816368,0.816417
9,0.0297,1.232974,0.817661,0.818459,0.817157,0.817332
10,0.0258,1.301281,0.821101,0.821137,0.820914,0.820987


[I 2025-03-24 06:56:54,684] Trial 47 finished with value: 0.8175336083858105 and parameters: {'learning_rate': 0.00010793900050683744, 'weight_decay': 0.006, 'warmup_steps': 248}. Best is trial 30 with value: 0.8324621660744835.


Trial 48 with params: {'learning_rate': 0.00030144665270360565, 'weight_decay': 0.002, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1772,0.560976,0.83945,0.841242,0.838764,0.839002
2,0.0744,0.681338,0.83945,0.839562,0.839648,0.839446
3,0.047,0.886767,0.825688,0.825865,0.825419,0.825533
4,0.0324,1.010382,0.833716,0.833795,0.833512,0.8336
5,0.0235,1.258491,0.824541,0.82614,0.825177,0.824475
6,0.0174,1.588811,0.823394,0.825787,0.822577,0.822764
7,0.013,1.660435,0.822248,0.822918,0.821788,0.821961
8,0.0097,2.099288,0.817661,0.819395,0.816947,0.817129
9,0.0072,2.466537,0.825688,0.826055,0.825335,0.825482
10,0.0056,2.63182,0.813073,0.813708,0.812611,0.812772


[I 2025-03-24 07:10:13,110] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 5.3252681664836025e-05, 'weight_decay': 0.007, 'warmup_steps': 162}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3032,0.435733,0.815367,0.816486,0.814779,0.814958
2,0.179,0.496302,0.827982,0.827925,0.828008,0.827949
3,0.135,0.668016,0.809633,0.815886,0.808306,0.808176
4,0.1112,0.690091,0.825688,0.827847,0.824914,0.825113
5,0.0947,0.73109,0.833716,0.833696,0.833807,0.833698
6,0.0834,0.798178,0.830275,0.830273,0.830134,0.830186
7,0.0749,0.801212,0.824541,0.824478,0.824503,0.824489
8,0.0678,0.84125,0.826835,0.826785,0.826755,0.826769
9,0.0619,0.942137,0.831422,0.83136,0.831428,0.831385
10,0.0572,0.962445,0.823394,0.823329,0.823377,0.823349


[I 2025-03-24 07:30:20,731] Trial 49 finished with value: 0.8175547376579559 and parameters: {'learning_rate': 5.3252681664836025e-05, 'weight_decay': 0.007, 'warmup_steps': 162}. Best is trial 30 with value: 0.8324621660744835.


Trial 50 with params: {'learning_rate': 0.00020965151071803784, 'weight_decay': 0.003, 'warmup_steps': 218}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2063,0.527965,0.84633,0.846876,0.845942,0.846123


In [None]:
# Show the best run stored in `best_trial3`.
# NOTE(review): `best_trial3` is assigned outside this chunk; the final summary cell labels it
# "normal training ... with augmentations" - confirm it holds the result of that search.
print(best_trial3)

In [None]:
# Reset the seed before setting up the next hyperparameter search.
# NOTE(review): `reset_seed` lives in the local `base` module (not shown here); presumably it
# re-seeds the random number generators so this experiment starts from a known state - confirm.
base.reset_seed()

In [None]:
# Training arguments for the distillation hyperparameter search on the augmented data.
# Results and logs go to their own "bilstm-distill-embedd_aug_hp-search" directories
# under ~/results/<DATASET>/ and ~/logs/<DATASET>/.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-embedd_aug_hp-search",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-embedd_aug_hp-search",
    # NOTE(review): presumably keeps the extra dataset columns (e.g. the stored logits)
    # available to the trainer - confirm against base.get_training_args.
    remove_unused_columns=False,
    # `num_epochs` and `batch_size` are defined earlier in the notebook.
    epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
def hp_space(trial):
    """Sample one hyperparameter configuration for the distillation search.

    Ranges requested from ``trial``:
      * ``learning_rate``: float in [5e-5, 5e-3], sampled with ``log=True``.
      * ``weight_decay``: float in [0, 1e-2] in steps of 1e-3.
      * ``warmup_steps``: integer in [0, ``warm_up``] (``warm_up`` is a notebook-level variable).
      * ``lambda_param``: float in [0, 1] in steps of 0.1.
      * ``temperature``: float in [2, 7] in steps of 0.5.

    NOTE(review): ``lambda_param`` and ``temperature`` are presumably the loss-mixing weight and
    softmax temperature consumed by ``base.DistilTrainer`` - confirm there.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and ``number``
            (an Optuna trial when used with the Optuna backend).

    Returns:
        dict mapping each hyperparameter name to the sampled value.

    Side effects:
        Prints the trial number together with the sampled values.
    """
    # Suggestions are requested in a fixed order so the sampler sees the same sequence every run.
    search_space = {}
    search_space["learning_rate"] = trial.suggest_float("learning_rate", 5e-5, 5e-3, log=True)
    search_space["weight_decay"] = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    search_space["warmup_steps"] = trial.suggest_int("warmup_steps", 0, warm_up)
    search_space["lambda_param"] = trial.suggest_float("lambda_param", 0, 1, step=.1)
    search_space["temperature"] = trial.suggest_float("temperature", 2, 7, step=.5)

    print(f"Trial {trial.number} with params: {search_space}")
    return search_space

In [None]:
# Hyperband pruner used to stop unpromising trials early.
# `min_r` / `max_r` (resource bounds) are defined earlier in the notebook.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)

# TPE sampler with a fixed seed (42) and multivariate sampling enabled.
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)

In [None]:
# Trainer for the distillation search: trains on the augmented training set
# (`all_train_data`) and evaluates on `eval_data`.
trainer = base.DistilTrainer(
    # NOTE(review): zero-argument factory; presumably called by the trainer to build a
    # fresh BiLSTM for each trial - confirm in base.DistilTrainer.
    model_init=lambda: get_BiLSTM(),
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)
  

In [None]:
# Run the Optuna-backed search (150 trials) and keep the best run in `best_trial4`.
best_trial4 = trainer.hyperparameter_search(
    # What to search and how to score it: maximize the `eval_f1` evaluation metric.
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    n_trials=150,
    # Backend and study configuration.
    # NOTE(review): study_name / sampler / pruner are presumably forwarded to the Optuna
    # study by the trainer - confirm in base.DistilTrainer.
    backend="optuna",
    study_name="Distill-aug-embedd",
    sampler=sampler,
    pruner=pruner,
)

In [None]:
# Show the best run returned by the distillation search on the augmented data (assigned in the cell above).
print(best_trial4)

In [None]:
# Side-by-side summary of the best run stored for each of the four searches
# (`best_trial` ... `best_trial4`; the first three are assigned earlier in the notebook).
# Labels end with ":" only - print() already inserts a single space between arguments,
# so a trailing space in the label would produce a double space.
print("Best normal training score:", best_trial)
print("Best distillation training score:", best_trial2)
print("Best normal training score with augmentations:", best_trial3)
print("Best distillation training score with augmentations:", best_trial4)