In [1]:
from transformers import BasicTokenizer, Trainer
from datasets import concatenate_datasets, load_from_disk
import kagglehub
import optuna
import torch
import math
import base

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
base.reset_seed()

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [4]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [None]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "sst2"

In [None]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented")

all_data = concatenate_datasets([load_from_disk(file) for file in [f"~/data/{DATASET}/eval-logits", f"~/data/{DATASET}/test-logits", f"~/data/{DATASET}/train-logits-augmented"]])
tokenizer = BasicTokenizer(do_lower_case=True)

In [7]:
train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), train_data))
eval_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), eval_data))
test_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), test_data))

all_train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_train_data))

all_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_data))

In [8]:
vocab = base.get_vocab(all_data_tokens)

In [9]:
word_index = dict(zip(vocab, range(len(vocab))))

In [10]:
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [None]:
print(len(vocab))
num_tokens = len(vocab) + 2
embedding_dim = 300

14621


In [12]:
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 14305 words (316) misses


In [13]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [14]:
train_padded_data = list(map(lambda x: base.padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: base.padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: base.padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: base.padd(x,60), all_train_data_index))

In [15]:
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [None]:
num_epochs = 15
batch_size = 128

In [17]:
#Nápočet epoch na steps
data_length = len(train_data)
min_r = math.ceil(data_length/batch_size)*5
max_r = math.ceil(data_length/batch_size)*num_epochs
warm_up = math.ceil(data_length/batch_size/10)

In [18]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 5e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up)
    }   
    print(f"Trial {trial.number} with params: {params}")
    return params

In [19]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [20]:
def get_BiLSTM():
    return base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=2, freeze_embed=True)

In [None]:
base.reset_seed()

In [22]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-base_hp-search", epochs=num_epochs, batch_size=batch_size)

In [23]:
trainer = Trainer(
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM()
)
  

In [24]:
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Base",
    n_trials=150
)

[I 2025-03-23 01:15:54,583] A new study created in memory with name: Base


Trial 0 with params: {'learning_rate': 0.0002805758207667253, 'weight_decay': 0.01, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4226,0.434365,0.788991,0.79719,0.790477,0.788053
2,0.3378,0.419709,0.809633,0.812991,0.808643,0.808727
3,0.3026,0.417057,0.809633,0.811507,0.810327,0.809533
4,0.2738,0.473746,0.811927,0.823243,0.810179,0.809619
5,0.2423,0.436711,0.827982,0.828483,0.828345,0.827978
6,0.2157,0.468983,0.829128,0.833631,0.828039,0.828174
7,0.1939,0.43561,0.832569,0.833693,0.832007,0.832216
8,0.1765,0.484555,0.84633,0.847529,0.845773,0.846006
9,0.16,0.524556,0.83945,0.839415,0.839522,0.839428
10,0.1471,0.504254,0.829128,0.835197,0.830355,0.828672


[I 2025-03-23 01:17:35,258] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.0007875660249889869, 'weight_decay': 0.001, 'warmup_steps': 6}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3888,0.4346,0.797018,0.808253,0.798739,0.795739
2,0.3042,0.401513,0.831422,0.837135,0.830208,0.830297
3,0.2431,0.381669,0.847477,0.849213,0.846815,0.84707
4,0.191,0.472234,0.834862,0.837372,0.834049,0.834273
5,0.1503,0.451819,0.850917,0.850863,0.850951,0.850889


[I 2025-03-23 01:18:43,933] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 6.533369619026643e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4958,0.460724,0.772936,0.772861,0.772901,0.772877
2,0.3804,0.459922,0.780963,0.784045,0.779942,0.779886
3,0.3644,0.46012,0.787844,0.792628,0.786615,0.786428
4,0.3542,0.456391,0.792431,0.79696,0.791246,0.791123
5,0.3447,0.427135,0.794725,0.794758,0.794508,0.794582
6,0.3362,0.427839,0.792431,0.794127,0.793098,0.792333
7,0.3305,0.436246,0.798165,0.800218,0.798897,0.798037
8,0.3245,0.431581,0.802752,0.803235,0.803107,0.802748
9,0.319,0.436428,0.806193,0.806338,0.806401,0.80619
10,0.3126,0.444432,0.793578,0.801419,0.795024,0.792723


[I 2025-03-23 01:20:53,318] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0013035123791853842, 'weight_decay': 0.0, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3869,0.433915,0.793578,0.803366,0.795192,0.79246
2,0.2815,0.408047,0.836009,0.839263,0.835091,0.835305
3,0.2087,0.416998,0.830275,0.833931,0.829292,0.829468
4,0.1536,0.470885,0.854358,0.858186,0.853404,0.853688
5,0.1094,0.521016,0.857798,0.85774,0.857792,0.857762


[I 2025-03-23 01:22:13,423] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.002311294500510415, 'weight_decay': 0.002, 'warmup_steps': 8}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.365,0.42593,0.798165,0.807566,0.799739,0.79714
2,0.2534,0.397882,0.840596,0.84186,0.840016,0.840243
3,0.1723,0.438381,0.834862,0.838925,0.833839,0.834023
4,0.1162,0.487608,0.84289,0.849285,0.841637,0.841781
5,0.0771,0.499642,0.853211,0.853157,0.853246,0.853183
6,0.0481,0.556037,0.844037,0.84434,0.843732,0.843876
7,0.0311,0.668976,0.854358,0.85451,0.854582,0.854356
8,0.0223,0.807854,0.853211,0.85335,0.852993,0.8531
9,0.0149,0.864902,0.855505,0.855446,0.855498,0.855467
10,0.0112,0.945799,0.850917,0.851054,0.850699,0.850804


[I 2025-03-23 01:25:28,192] Trial 4 finished with value: 0.8496043015339487 and parameters: {'learning_rate': 0.002311294500510415, 'weight_decay': 0.002, 'warmup_steps': 8}. Best is trial 4 with value: 0.8496043015339487.


Trial 5 with params: {'learning_rate': 0.00011635338541918901, 'weight_decay': 0.003, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.459,0.447281,0.784404,0.788265,0.785426,0.784036
2,0.3658,0.445509,0.794725,0.797189,0.793835,0.793905
3,0.3455,0.446464,0.799312,0.800899,0.798592,0.798727
4,0.3321,0.461228,0.798165,0.802634,0.797003,0.79693
5,0.3168,0.422235,0.806193,0.806124,0.806148,0.806135


[I 2025-03-23 01:26:26,391] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.0003654769917956456, 'weight_decay': 0.003, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4112,0.437381,0.790138,0.79908,0.791688,0.789106
2,0.3276,0.417023,0.815367,0.81963,0.814273,0.814335
3,0.2848,0.411296,0.821101,0.821698,0.820662,0.820829
4,0.2482,0.506715,0.821101,0.834679,0.81923,0.818619
5,0.2141,0.454078,0.836009,0.837645,0.836649,0.835947
6,0.1847,0.488407,0.83945,0.842907,0.838511,0.838736
7,0.1627,0.475942,0.838303,0.838546,0.838017,0.838148
8,0.1436,0.501026,0.856651,0.856614,0.856582,0.856597
9,0.1258,0.586877,0.847477,0.84806,0.847868,0.847472
10,0.1132,0.539183,0.848624,0.852781,0.849625,0.848393


[I 2025-03-23 01:29:28,065] Trial 6 finished with value: 0.8450349171663452 and parameters: {'learning_rate': 0.0003654769917956456, 'weight_decay': 0.003, 'warmup_steps': 26}. Best is trial 4 with value: 0.8496043015339487.


Trial 7 with params: {'learning_rate': 9.505122659935192e-05, 'weight_decay': 0.003, 'warmup_steps': 16}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4706,0.451794,0.77867,0.779761,0.779206,0.778621
2,0.3711,0.44256,0.786697,0.787903,0.786036,0.786153
3,0.3524,0.447841,0.795872,0.797751,0.795087,0.795198
4,0.3406,0.458858,0.794725,0.800025,0.793456,0.793276
5,0.3277,0.425229,0.805046,0.804977,0.805022,0.804996


[I 2025-03-23 01:30:22,194] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 0.00040842279473800845, 'weight_decay': 0.008, 'warmup_steps': 8}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4058,0.437152,0.797018,0.802227,0.798192,0.796524
2,0.3271,0.410845,0.823394,0.826638,0.822451,0.82261
3,0.2822,0.413632,0.819954,0.821708,0.819241,0.81943
4,0.2431,0.454389,0.824541,0.830947,0.82324,0.823234
5,0.2069,0.440132,0.844037,0.844228,0.844279,0.844036


[I 2025-03-23 01:31:39,576] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.0005338741354740678, 'weight_decay': 0.006, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3959,0.433147,0.800459,0.807114,0.801781,0.7998
2,0.3171,0.418104,0.826835,0.830605,0.825829,0.825983
3,0.2676,0.408952,0.831422,0.834318,0.830544,0.830749
4,0.2248,0.422627,0.829128,0.831448,0.828334,0.828542
5,0.185,0.425146,0.852064,0.852797,0.852499,0.852055
6,0.1534,0.542699,0.831422,0.839902,0.829955,0.829881
7,0.1291,0.471851,0.854358,0.856389,0.853656,0.853933
8,0.1075,0.522057,0.856651,0.857243,0.857045,0.856647
9,0.0884,0.659806,0.84633,0.84947,0.847205,0.846172
10,0.073,0.619202,0.852064,0.854509,0.852835,0.851961


[I 2025-03-23 01:34:42,050] Trial 9 finished with value: 0.8520313332412541 and parameters: {'learning_rate': 0.0005338741354740678, 'weight_decay': 0.006, 'warmup_steps': 2}. Best is trial 9 with value: 0.8520313332412541.


Trial 10 with params: {'learning_rate': 0.0026025741521183794, 'weight_decay': 0.007, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3661,0.42661,0.794725,0.804288,0.796319,0.793648
2,0.2469,0.389532,0.850917,0.853593,0.850109,0.850385
3,0.1669,0.442832,0.84289,0.846559,0.841932,0.842167
4,0.112,0.438471,0.850917,0.853047,0.850194,0.850464
5,0.0724,0.507424,0.860092,0.862299,0.859371,0.859667
6,0.0454,0.596148,0.856651,0.856758,0.856456,0.856552
7,0.0317,0.645976,0.860092,0.860057,0.860171,0.860073
8,0.0217,0.787935,0.853211,0.854257,0.852698,0.852932
9,0.0129,0.881171,0.845183,0.845164,0.845279,0.845167
10,0.0121,0.985305,0.849771,0.849762,0.849657,0.849699


[I 2025-03-23 01:37:12,209] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.0020056372842325635, 'weight_decay': 0.006, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3626,0.402781,0.807339,0.81111,0.808327,0.807046
2,0.2579,0.391191,0.841743,0.845913,0.840722,0.840939
3,0.18,0.487311,0.836009,0.849311,0.834207,0.833868
4,0.1246,0.532839,0.84633,0.850941,0.845268,0.845498
5,0.0823,0.533727,0.836009,0.838137,0.835259,0.83549
6,0.0554,0.680521,0.857798,0.860847,0.85695,0.857251
7,0.0362,0.753649,0.853211,0.853923,0.852783,0.852988
8,0.0237,0.854453,0.853211,0.854447,0.852656,0.852901
9,0.0158,0.90636,0.862385,0.862339,0.862339,0.862339
10,0.0128,0.915811,0.855505,0.85547,0.855582,0.855486


[I 2025-03-23 01:40:37,627] Trial 11 finished with value: 0.8541965366016144 and parameters: {'learning_rate': 0.0020056372842325635, 'weight_decay': 0.006, 'warmup_steps': 0}. Best is trial 11 with value: 0.8541965366016144.


Trial 12 with params: {'learning_rate': 0.00347910804452505, 'weight_decay': 0.006, 'warmup_steps': 4}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3524,0.400476,0.81422,0.816599,0.815,0.814079
2,0.2341,0.405727,0.849771,0.850141,0.849446,0.849604
3,0.1541,0.461088,0.848624,0.848635,0.848489,0.848544
4,0.1012,0.602638,0.829128,0.837041,0.827703,0.827641
5,0.063,0.571703,0.849771,0.849711,0.849783,0.849737


[I 2025-03-23 01:41:33,470] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.0009263363105887989, 'weight_decay': 0.006, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3827,0.438404,0.788991,0.802573,0.790898,0.787289
2,0.2957,0.414626,0.836009,0.838396,0.835217,0.835446
3,0.2306,0.437481,0.830275,0.836172,0.829039,0.82911
4,0.177,0.461199,0.833716,0.835139,0.833091,0.83331
5,0.1353,0.479793,0.854358,0.854302,0.85433,0.854315
6,0.1026,0.529189,0.848624,0.851879,0.847731,0.847997
7,0.0757,0.549785,0.853211,0.854873,0.852572,0.852836
8,0.0532,0.647503,0.853211,0.853162,0.853162,0.853162
9,0.0388,0.775014,0.84633,0.847343,0.845815,0.846038
10,0.0266,0.870917,0.853211,0.853731,0.853583,0.853208


[I 2025-03-23 01:44:43,159] Trial 13 finished with value: 0.8588913227951769 and parameters: {'learning_rate': 0.0009263363105887989, 'weight_decay': 0.006, 'warmup_steps': 1}. Best is trial 13 with value: 0.8588913227951769.


Trial 14 with params: {'learning_rate': 7.814614589561196e-05, 'weight_decay': 0.0, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4888,0.455067,0.774083,0.774301,0.774322,0.774082
2,0.3746,0.458778,0.78211,0.78595,0.780984,0.780855
3,0.3579,0.45303,0.795872,0.798479,0.794961,0.795026
4,0.3474,0.455286,0.795872,0.799959,0.79475,0.794695
5,0.3361,0.427015,0.798165,0.798095,0.798139,0.798113


[I 2025-03-23 01:45:43,263] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.001195386628779761, 'weight_decay': 0.009000000000000001, 'warmup_steps': 7}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3803,0.429201,0.792431,0.800948,0.79394,0.791477
2,0.2859,0.406918,0.837156,0.838931,0.83647,0.836702
3,0.2153,0.445328,0.833716,0.837936,0.83267,0.832843
4,0.1596,0.50768,0.84633,0.854482,0.844931,0.845026
5,0.1158,0.476013,0.856651,0.856614,0.856582,0.856597
6,0.0848,0.518842,0.854358,0.857523,0.853488,0.853776
7,0.0593,0.576817,0.847477,0.848408,0.846984,0.847202
8,0.04,0.658202,0.853211,0.853226,0.853077,0.853134
9,0.028,0.788651,0.855505,0.856321,0.855961,0.855492
10,0.0189,0.871339,0.853211,0.853162,0.853162,0.853162


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-23 01:48:57,657] Trial 15 finished with value: 0.8531615727877411 and parameters: {'learning_rate': 0.001195386628779761, 'weight_decay': 0.009000000000000001, 'warmup_steps': 7}. Best is trial 13 with value: 0.8588913227951769.


Trial 16 with params: {'learning_rate': 0.0002973179777011377, 'weight_decay': 0.003, 'warmup_steps': 0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4128,0.435405,0.794725,0.799539,0.795855,0.79427
2,0.3383,0.418953,0.813073,0.816034,0.812147,0.812271
3,0.3027,0.410547,0.809633,0.810007,0.809948,0.809632
4,0.2719,0.450427,0.818807,0.825289,0.817483,0.81742
5,0.238,0.41992,0.832569,0.833074,0.832933,0.832565


[I 2025-03-23 01:49:57,228] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.0011864769262023693, 'weight_decay': 0.005, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3833,0.421547,0.807339,0.810797,0.808285,0.80708
2,0.2859,0.413719,0.833716,0.836639,0.832839,0.833051
3,0.2159,0.398132,0.83945,0.842292,0.838596,0.838831
4,0.1618,0.451749,0.847477,0.850261,0.846647,0.846912
5,0.1175,0.421869,0.861239,0.861219,0.861339,0.861224
6,0.0864,0.586614,0.84633,0.852581,0.8451,0.845275
7,0.0593,0.599913,0.856651,0.857161,0.856287,0.85647
8,0.0406,0.664168,0.856651,0.856614,0.856582,0.856597
9,0.029,0.745708,0.861239,0.861636,0.860918,0.861085
10,0.0194,0.886173,0.863532,0.863582,0.863381,0.863453


[I 2025-03-23 01:53:40,372] Trial 17 finished with value: 0.8472019657055526 and parameters: {'learning_rate': 0.0011864769262023693, 'weight_decay': 0.005, 'warmup_steps': 14}. Best is trial 13 with value: 0.8588913227951769.


Trial 18 with params: {'learning_rate': 0.003996373427382285, 'weight_decay': 0.01, 'warmup_steps': 5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3522,0.389576,0.825688,0.827872,0.826429,0.825577
2,0.2277,0.414471,0.852064,0.854609,0.851278,0.851556
3,0.1473,0.480591,0.833716,0.83694,0.832797,0.833002
4,0.0954,0.501838,0.840596,0.842064,0.839974,0.840208
5,0.06,0.6303,0.848624,0.850734,0.847899,0.848164
6,0.0404,0.643057,0.860092,0.86024,0.859876,0.859986
7,0.0287,0.751195,0.850917,0.850858,0.850909,0.850879
8,0.0193,1.000726,0.849771,0.849762,0.849657,0.849699
9,0.0143,0.977389,0.849771,0.849711,0.849783,0.849737
10,0.0093,1.062169,0.850917,0.851137,0.850657,0.850785


[I 2025-03-23 01:55:38,753] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.00017098269191031398, 'weight_decay': 0.005, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4456,0.445953,0.78555,0.795367,0.787183,0.784352
2,0.3547,0.436239,0.805046,0.806575,0.804349,0.804502
3,0.3294,0.433751,0.802752,0.802969,0.802433,0.802549
4,0.3109,0.457023,0.811927,0.814731,0.811021,0.811148
5,0.2897,0.431666,0.813073,0.81322,0.813284,0.813071


[I 2025-03-23 01:56:46,704] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 7.828712010044815e-05, 'weight_decay': 0.006, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4779,0.456174,0.774083,0.774389,0.774364,0.774082
2,0.3761,0.45763,0.780963,0.784045,0.779942,0.779886
3,0.3595,0.453474,0.791284,0.7941,0.79033,0.790357
4,0.3488,0.450945,0.795872,0.800296,0.794708,0.794623
5,0.3379,0.424353,0.798165,0.798165,0.798266,0.798148
6,0.3283,0.426165,0.793578,0.795609,0.794308,0.793447
7,0.3212,0.440473,0.797018,0.800241,0.797939,0.796761
8,0.3139,0.431399,0.805046,0.806901,0.805738,0.804943
9,0.3072,0.437897,0.802752,0.802701,0.802644,0.802668
10,0.2988,0.446983,0.799312,0.807514,0.800781,0.798451


[I 2025-03-23 01:58:42,971] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.0011359091682404978, 'weight_decay': 0.008, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.378,0.426467,0.791284,0.798628,0.792688,0.790481
2,0.2855,0.409963,0.838303,0.840451,0.837554,0.837791
3,0.2156,0.422872,0.827982,0.831937,0.826955,0.827108
4,0.1618,0.437635,0.831422,0.834932,0.83046,0.830647
5,0.1186,0.463749,0.852064,0.852025,0.851993,0.852008
6,0.0855,0.551175,0.837156,0.845553,0.835712,0.835703
7,0.0607,0.569909,0.858945,0.858926,0.859045,0.85893
8,0.0403,0.673981,0.840596,0.840537,0.840564,0.840549
9,0.0277,0.818986,0.857798,0.858379,0.857413,0.857606
10,0.0181,0.913404,0.844037,0.844687,0.844447,0.844029


[I 2025-03-23 02:00:57,894] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.0041707518022780265, 'weight_decay': 0.002, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3607,0.424207,0.816514,0.821798,0.817673,0.816087
2,0.2341,0.40546,0.845183,0.849965,0.8441,0.844318
3,0.1535,0.435109,0.848624,0.849177,0.848236,0.84842
4,0.0988,0.615606,0.834862,0.845353,0.833249,0.833084
5,0.0623,0.672852,0.854358,0.855915,0.85374,0.854003
6,0.0432,0.694627,0.847477,0.847648,0.847236,0.847352
7,0.0318,0.82662,0.848624,0.848737,0.848825,0.848621
8,0.0225,0.856466,0.844037,0.844576,0.843647,0.843826
9,0.0154,0.819487,0.84633,0.846302,0.846236,0.846265
10,0.0095,1.095017,0.844037,0.844576,0.843647,0.843826


[I 2025-03-23 02:03:12,225] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.0002626971788994903, 'weight_decay': 0.01, 'warmup_steps': 1}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4183,0.439234,0.787844,0.795786,0.789309,0.786934
2,0.3409,0.419379,0.807339,0.808686,0.806685,0.806848
3,0.3078,0.415742,0.809633,0.810887,0.8102,0.809584
4,0.2789,0.48023,0.81078,0.819611,0.809222,0.808877
5,0.2469,0.421262,0.838303,0.83875,0.838648,0.838301


[I 2025-03-23 02:04:11,103] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.0016388291759872771, 'weight_decay': 0.006, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3696,0.429362,0.78555,0.794864,0.787141,0.784425
2,0.2684,0.388087,0.840596,0.842516,0.83989,0.840132
3,0.1934,0.447756,0.833716,0.837257,0.832754,0.832951
4,0.1349,0.457208,0.841743,0.845231,0.840806,0.84104
5,0.0911,0.49476,0.847477,0.849707,0.846731,0.846994
6,0.0614,0.552779,0.865826,0.866233,0.865507,0.865677
7,0.0401,0.67428,0.850917,0.850917,0.851036,0.850905
8,0.0279,0.799504,0.855505,0.85545,0.85554,0.855477
9,0.0196,0.913549,0.855505,0.856676,0.856045,0.855477
10,0.0129,1.115601,0.833716,0.833949,0.833975,0.833715


[I 2025-03-23 02:06:17,687] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.0009022433542720399, 'weight_decay': 0.01, 'warmup_steps': 9}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3854,0.429661,0.799312,0.80849,0.800865,0.798325
2,0.2968,0.408487,0.83945,0.842907,0.838511,0.838736
3,0.2337,0.398294,0.83945,0.844715,0.838301,0.838467
4,0.1805,0.46557,0.831422,0.834932,0.83046,0.830647
5,0.1393,0.448115,0.860092,0.860057,0.860171,0.860073
6,0.1077,0.514799,0.847477,0.855919,0.846058,0.84615
7,0.0806,0.526923,0.855505,0.86108,0.854361,0.854621
8,0.0582,0.585748,0.860092,0.860758,0.860508,0.860085
9,0.0407,0.723201,0.856651,0.857731,0.857171,0.856629
10,0.0291,0.831031,0.853211,0.853162,0.853162,0.853162


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-23 02:12:13,677] Trial 25 finished with value: 0.8554559232129326 and parameters: {'learning_rate': 0.0009022433542720399, 'weight_decay': 0.01, 'warmup_steps': 9}. Best is trial 13 with value: 0.8588913227951769.


Trial 26 with params: {'learning_rate': 0.00036570148061332796, 'weight_decay': 0.01, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4089,0.437464,0.790138,0.796005,0.791393,0.789526
2,0.3299,0.415327,0.821101,0.826396,0.819904,0.819941
3,0.2877,0.410358,0.808486,0.808438,0.808527,0.808456
4,0.2515,0.473874,0.824541,0.833839,0.822988,0.822777
5,0.2172,0.437938,0.834862,0.836841,0.835565,0.834775
6,0.1879,0.502233,0.831422,0.837994,0.830123,0.830166
7,0.1649,0.464555,0.836009,0.836889,0.835512,0.835713
8,0.1463,0.497179,0.853211,0.853281,0.853035,0.853118
9,0.1278,0.609823,0.84633,0.847889,0.846952,0.846279
10,0.1147,0.577754,0.832569,0.836926,0.833607,0.832283


[I 2025-03-23 02:14:14,263] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.0011896768980299269, 'weight_decay': 0.01, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3839,0.435824,0.791284,0.802611,0.793024,0.789931
2,0.2857,0.412515,0.836009,0.841012,0.83488,0.835035
3,0.2156,0.413759,0.823394,0.828737,0.822198,0.822249
4,0.1617,0.497782,0.84289,0.848848,0.841679,0.841841
5,0.1187,0.479635,0.861239,0.86143,0.861002,0.861124
6,0.0875,0.562801,0.845183,0.852088,0.843889,0.84403
7,0.0597,0.587682,0.861239,0.86143,0.861002,0.861124
8,0.0404,0.701477,0.848624,0.850037,0.848026,0.848272
9,0.0287,0.872294,0.856651,0.857808,0.856119,0.856364
10,0.0192,0.976965,0.857798,0.858618,0.858255,0.857786


[I 2025-03-23 02:17:39,836] Trial 27 finished with value: 0.8599473407056346 and parameters: {'learning_rate': 0.0011896768980299269, 'weight_decay': 0.01, 'warmup_steps': 22}. Best is trial 27 with value: 0.8599473407056346.


Trial 28 with params: {'learning_rate': 0.0008475129382444201, 'weight_decay': 0.01, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3895,0.437497,0.792431,0.80465,0.794235,0.790966
2,0.2989,0.416234,0.834862,0.837948,0.833965,0.834178
3,0.2356,0.429613,0.825688,0.830313,0.824577,0.824684
4,0.1844,0.460867,0.830275,0.834617,0.829208,0.829356
5,0.1436,0.438759,0.861239,0.861181,0.861255,0.861208
6,0.1122,0.516092,0.848624,0.852901,0.847605,0.847855
7,0.0847,0.536129,0.860092,0.862042,0.859413,0.859701
8,0.0621,0.573083,0.864679,0.86535,0.865096,0.864672
9,0.0466,0.725336,0.863532,0.863486,0.863591,0.86351
10,0.0321,0.816864,0.853211,0.853176,0.853288,0.853192


[I 2025-03-23 02:20:48,058] Trial 28 finished with value: 0.8600026319252534 and parameters: {'learning_rate': 0.0008475129382444201, 'weight_decay': 0.01, 'warmup_steps': 21}. Best is trial 28 with value: 0.8600026319252534.


Trial 29 with params: {'learning_rate': 0.0015109783095214179, 'weight_decay': 0.01, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3805,0.433373,0.780963,0.794521,0.782889,0.779151
2,0.2732,0.398134,0.83945,0.840807,0.838848,0.839076
3,0.2002,0.419916,0.833716,0.836352,0.832881,0.833099
4,0.1449,0.473771,0.83945,0.841482,0.838722,0.838962
5,0.1016,0.566819,0.845183,0.849229,0.844184,0.844422
6,0.0698,0.696205,0.84289,0.848427,0.841721,0.8419
7,0.0473,0.636777,0.836009,0.836455,0.836354,0.836007
8,0.0309,0.819272,0.84289,0.842847,0.842816,0.84283
9,0.0204,0.907127,0.848624,0.849279,0.849036,0.848617
10,0.0144,1.200404,0.837156,0.837664,0.837522,0.837153


[I 2025-03-23 02:22:36,129] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.0005619127524154119, 'weight_decay': 0.008, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4039,0.434769,0.791284,0.798628,0.792688,0.790481
2,0.3141,0.41393,0.831422,0.834932,0.83046,0.830647
3,0.2597,0.420005,0.826835,0.830281,0.825871,0.826038
4,0.2149,0.448173,0.826835,0.828644,0.826124,0.82633
5,0.1766,0.445027,0.857798,0.857848,0.857961,0.857791
6,0.1441,0.516763,0.852064,0.856213,0.851067,0.851337
7,0.1189,0.488988,0.850917,0.852564,0.850278,0.850537
8,0.0974,0.501154,0.860092,0.860141,0.860255,0.860085
9,0.0783,0.638197,0.841743,0.846958,0.842869,0.841409
10,0.0632,0.691187,0.848624,0.851189,0.849415,0.848509


[I 2025-03-23 02:25:44,205] Trial 30 finished with value: 0.8473515943871675 and parameters: {'learning_rate': 0.0005619127524154119, 'weight_decay': 0.008, 'warmup_steps': 35}. Best is trial 28 with value: 0.8600026319252534.


Trial 31 with params: {'learning_rate': 0.0009716553864605126, 'weight_decay': 0.01, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3852,0.437872,0.790138,0.804076,0.792067,0.788401
2,0.2942,0.40967,0.838303,0.841908,0.837343,0.837559
3,0.2286,0.427136,0.834862,0.842693,0.83346,0.833461
4,0.1762,0.476933,0.834862,0.837653,0.834007,0.834227
5,0.1334,0.42752,0.863532,0.863773,0.863802,0.863532
6,0.102,0.537386,0.827982,0.837667,0.826408,0.826212
7,0.0744,0.530326,0.866972,0.868498,0.86638,0.866663
8,0.0527,0.572551,0.862385,0.86298,0.862002,0.8622
9,0.0374,0.722464,0.868119,0.868362,0.868391,0.868119
10,0.0257,0.864416,0.860092,0.860286,0.860339,0.860091


[I 2025-03-23 02:28:46,496] Trial 31 finished with value: 0.8622976707461507 and parameters: {'learning_rate': 0.0009716553864605126, 'weight_decay': 0.01, 'warmup_steps': 19}. Best is trial 31 with value: 0.8622976707461507.


Trial 32 with params: {'learning_rate': 0.001665322400127431, 'weight_decay': 0.01, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3777,0.416506,0.795872,0.802444,0.797192,0.795198
2,0.2706,0.394045,0.847477,0.849707,0.846731,0.846994
3,0.1945,0.42921,0.824541,0.826566,0.823788,0.823985
4,0.1386,0.45392,0.841743,0.846657,0.840637,0.840831
5,0.0944,0.495936,0.847477,0.848587,0.846942,0.847171
6,0.0632,0.56338,0.850917,0.852564,0.850278,0.850537
7,0.0434,0.623196,0.865826,0.865979,0.866054,0.865824
8,0.0276,0.768479,0.858945,0.85889,0.858918,0.858903
9,0.0183,0.892598,0.862385,0.862787,0.862718,0.862385
10,0.014,0.918421,0.860092,0.860045,0.860045,0.860045


[I 2025-03-23 02:31:54,329] Trial 32 finished with value: 0.8623266584217035 and parameters: {'learning_rate': 0.001665322400127431, 'weight_decay': 0.01, 'warmup_steps': 20}. Best is trial 32 with value: 0.8623266584217035.


Trial 33 with params: {'learning_rate': 0.000996109556045072, 'weight_decay': 0.009000000000000001, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3855,0.431791,0.798165,0.808082,0.799781,0.797072
2,0.2924,0.411027,0.83945,0.842292,0.838596,0.838831
3,0.226,0.410007,0.83945,0.846446,0.838133,0.838221
4,0.1725,0.470594,0.84289,0.845921,0.842016,0.842262
5,0.1307,0.444524,0.856651,0.856605,0.856708,0.856629
6,0.0996,0.525046,0.860092,0.864178,0.859118,0.859426
7,0.072,0.575274,0.849771,0.850261,0.849404,0.849581
8,0.0516,0.58535,0.865826,0.865943,0.865633,0.865732
9,0.0363,0.661069,0.855505,0.855793,0.855793,0.855505
10,0.0253,0.820047,0.862385,0.863054,0.862802,0.862379


[I 2025-03-23 02:35:19,004] Trial 33 finished with value: 0.8553554502369668 and parameters: {'learning_rate': 0.000996109556045072, 'weight_decay': 0.009000000000000001, 'warmup_steps': 19}. Best is trial 32 with value: 0.8623266584217035.


Trial 34 with params: {'learning_rate': 0.0007084308517196042, 'weight_decay': 0.009000000000000001, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3959,0.445013,0.78555,0.80327,0.78773,0.783189
2,0.3067,0.403199,0.836009,0.839263,0.835091,0.835305
3,0.2467,0.42452,0.822248,0.83095,0.820735,0.820543
4,0.1982,0.468149,0.849771,0.853535,0.848815,0.84908
5,0.1578,0.435808,0.853211,0.853261,0.853372,0.853204
6,0.125,0.569331,0.836009,0.847425,0.834333,0.834122
7,0.0986,0.476716,0.861239,0.861761,0.860876,0.861063
8,0.0759,0.552934,0.864679,0.864679,0.864802,0.864668
9,0.0573,0.706879,0.861239,0.861319,0.861423,0.861234
10,0.0423,0.761208,0.861239,0.861834,0.861634,0.861234


[I 2025-03-23 02:38:19,692] Trial 34 finished with value: 0.8657619572039268 and parameters: {'learning_rate': 0.0007084308517196042, 'weight_decay': 0.009000000000000001, 'warmup_steps': 26}. Best is trial 34 with value: 0.8657619572039268.


Trial 35 with params: {'learning_rate': 0.00048676671322538887, 'weight_decay': 0.007, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4047,0.439575,0.790138,0.800086,0.791772,0.788965
2,0.3186,0.413152,0.831422,0.834932,0.83046,0.830647
3,0.2679,0.421948,0.818807,0.823308,0.817694,0.817763
4,0.2258,0.475206,0.824541,0.828613,0.823493,0.823621
5,0.1892,0.457403,0.847477,0.8475,0.847615,0.847467


[I 2025-03-23 02:39:22,093] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.004730964405415112, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3576,0.397933,0.827982,0.828921,0.828471,0.827959
2,0.2271,0.387652,0.861239,0.861237,0.861129,0.861173
3,0.1496,0.445315,0.858945,0.859756,0.858498,0.858717
4,0.0957,0.533229,0.847477,0.851557,0.846478,0.846727
5,0.0613,0.591702,0.858945,0.859927,0.858455,0.858691
6,0.043,0.690661,0.848624,0.84891,0.84891,0.848624
7,0.0326,0.639432,0.853211,0.854447,0.852656,0.852901
8,0.0197,0.873444,0.84633,0.84675,0.845984,0.846148
9,0.0124,1.178701,0.848624,0.852098,0.849541,0.848444
10,0.0124,0.999201,0.847477,0.848364,0.847952,0.847461


[I 2025-03-23 02:41:16,377] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.0004016655692173208, 'weight_decay': 0.01, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4097,0.440309,0.780963,0.791157,0.782637,0.779663
2,0.3236,0.413222,0.823394,0.827279,0.822367,0.822497
3,0.2783,0.417749,0.813073,0.816637,0.812063,0.812154
4,0.2405,0.480297,0.823394,0.832391,0.821862,0.82166
5,0.2043,0.467075,0.841743,0.842885,0.842279,0.841713


[I 2025-03-23 02:42:08,903] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.00047577436222617683, 'weight_decay': 0.009000000000000001, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4042,0.439329,0.791284,0.800013,0.792814,0.790292
2,0.3197,0.412686,0.827982,0.831604,0.826998,0.827163
3,0.2701,0.42052,0.818807,0.824056,0.81761,0.817632
4,0.2296,0.430406,0.834862,0.836191,0.834259,0.834478
5,0.1924,0.446733,0.848624,0.84891,0.84891,0.848624


[I 2025-03-23 02:42:59,039] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.0015446807458019355, 'weight_decay': 0.009000000000000001, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3793,0.418695,0.807339,0.812133,0.808453,0.806933
2,0.2748,0.408945,0.832569,0.835334,0.831713,0.831924
3,0.1993,0.444507,0.831422,0.838447,0.830081,0.830097
4,0.1427,0.496247,0.833716,0.844452,0.832081,0.831885
5,0.1003,0.484565,0.861239,0.862231,0.86075,0.860988
6,0.0675,0.550563,0.857798,0.863418,0.856656,0.856928
7,0.0455,0.577882,0.866972,0.867264,0.867264,0.866972
8,0.0293,0.780368,0.857798,0.858131,0.857498,0.857651
9,0.02,0.929231,0.860092,0.86068,0.859708,0.859903
10,0.0145,1.007473,0.853211,0.853162,0.853162,0.853162


[I 2025-03-23 02:46:12,080] Trial 39 finished with value: 0.8564926541354472 and parameters: {'learning_rate': 0.0015446807458019355, 'weight_decay': 0.009000000000000001, 'warmup_steps': 18}. Best is trial 34 with value: 0.8657619572039268.


Trial 40 with params: {'learning_rate': 0.003632690234479149, 'weight_decay': 0.01, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.362,0.411752,0.815367,0.819378,0.816378,0.815069
2,0.2376,0.403872,0.847477,0.851557,0.846478,0.846727
3,0.157,0.452812,0.845183,0.845217,0.845026,0.845094
4,0.1022,0.531107,0.84633,0.850941,0.845268,0.845498
5,0.0642,0.627504,0.841743,0.841801,0.841564,0.841642


[I 2025-03-23 02:47:17,079] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.0011939679119801073, 'weight_decay': 0.01, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3836,0.424186,0.797018,0.802605,0.798234,0.796476
2,0.2846,0.404812,0.837156,0.839972,0.836301,0.836529
3,0.2154,0.421746,0.823394,0.832391,0.821862,0.82166
4,0.1617,0.492743,0.837156,0.840583,0.836217,0.836432
5,0.1176,0.488951,0.868119,0.868122,0.868012,0.868057
6,0.0861,0.538527,0.838303,0.841908,0.837343,0.837559
7,0.0593,0.61372,0.855505,0.855948,0.855161,0.855333
8,0.04,0.718201,0.855505,0.855793,0.855793,0.855505
9,0.0288,0.786873,0.855505,0.855505,0.855624,0.855492
10,0.0186,1.002959,0.853211,0.854791,0.853835,0.853162


[I 2025-03-23 02:50:19,263] Trial 41 finished with value: 0.8600026319252534 and parameters: {'learning_rate': 0.0011939679119801073, 'weight_decay': 0.01, 'warmup_steps': 20}. Best is trial 34 with value: 0.8657619572039268.


Trial 42 with params: {'learning_rate': 0.0005866438545342746, 'weight_decay': 0.01, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3983,0.435736,0.793578,0.804427,0.795277,0.792315
2,0.3137,0.410401,0.827982,0.831286,0.82704,0.827217
3,0.2589,0.416848,0.81422,0.819763,0.812979,0.812945
4,0.2141,0.466547,0.836009,0.837892,0.835301,0.835531
5,0.1746,0.432753,0.852064,0.852144,0.852246,0.852059
6,0.1427,0.538991,0.847477,0.853533,0.846268,0.846459
7,0.1171,0.489511,0.855505,0.856753,0.854951,0.8552
8,0.0969,0.486958,0.866972,0.867168,0.867222,0.866972
9,0.0771,0.679017,0.848624,0.851189,0.849415,0.848509
10,0.0624,0.659286,0.860092,0.860617,0.860466,0.860089


[I 2025-03-23 02:53:13,947] Trial 42 finished with value: 0.8542180331861173 and parameters: {'learning_rate': 0.0005866438545342746, 'weight_decay': 0.01, 'warmup_steps': 21}. Best is trial 34 with value: 0.8657619572039268.


Trial 43 with params: {'learning_rate': 0.000778596824069969, 'weight_decay': 0.01, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3915,0.439099,0.786697,0.801435,0.788688,0.784794
2,0.3032,0.407009,0.838303,0.841586,0.837385,0.837609
3,0.2419,0.421723,0.838303,0.842245,0.837301,0.837508
4,0.192,0.47451,0.838303,0.843748,0.837133,0.837284
5,0.1511,0.437581,0.857798,0.85774,0.857792,0.857762
6,0.1185,0.518533,0.840596,0.847857,0.839259,0.839344
7,0.0921,0.487943,0.84633,0.846876,0.845942,0.846123
8,0.0695,0.581647,0.857798,0.858197,0.858129,0.857797
9,0.0515,0.655754,0.850917,0.85111,0.851162,0.850917
10,0.0373,0.727901,0.852064,0.852959,0.852541,0.852048


[I 2025-03-23 02:56:19,107] Trial 43 finished with value: 0.8484891563600752 and parameters: {'learning_rate': 0.000778596824069969, 'weight_decay': 0.01, 'warmup_steps': 19}. Best is trial 34 with value: 0.8657619572039268.


Trial 44 with params: {'learning_rate': 0.0017335370023710403, 'weight_decay': 0.008, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3778,0.410784,0.803899,0.809989,0.805159,0.803328
2,0.2669,0.385826,0.848624,0.848689,0.848447,0.848527
3,0.1918,0.406324,0.838303,0.839973,0.837638,0.837871
4,0.1364,0.489863,0.836009,0.83867,0.835175,0.835401
5,0.0928,0.586919,0.848624,0.852901,0.847605,0.847855


[I 2025-03-23 02:57:55,424] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 0.0018179082598964922, 'weight_decay': 0.01, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3753,0.410551,0.803899,0.809589,0.805117,0.803375
2,0.2652,0.389751,0.840596,0.841192,0.840185,0.840368
3,0.1881,0.427687,0.841743,0.844914,0.840848,0.841088
4,0.132,0.504298,0.829128,0.83439,0.827955,0.828052
5,0.0895,0.505739,0.858945,0.861557,0.858161,0.858461
6,0.0606,0.628607,0.833716,0.837257,0.832754,0.832951
7,0.0385,0.672141,0.868119,0.868173,0.86797,0.868043
8,0.0267,0.850168,0.853211,0.854257,0.852698,0.852932
9,0.0176,0.992038,0.860092,0.86024,0.859876,0.859986
10,0.0127,0.944829,0.860092,0.860045,0.860045,0.860045


[I 2025-03-23 03:01:10,984] Trial 45 finished with value: 0.8610848892031129 and parameters: {'learning_rate': 0.0018179082598964922, 'weight_decay': 0.01, 'warmup_steps': 22}. Best is trial 34 with value: 0.8657619572039268.


Trial 46 with params: {'learning_rate': 0.00012827851737332596, 'weight_decay': 0.0, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4509,0.447049,0.784404,0.788915,0.78551,0.783949
2,0.3631,0.443881,0.792431,0.79439,0.791625,0.791719
3,0.342,0.445597,0.802752,0.804264,0.802054,0.802202
4,0.3278,0.46136,0.797018,0.800051,0.796045,0.796085
5,0.311,0.423971,0.806193,0.806134,0.806106,0.806119


[I 2025-03-23 03:02:13,786] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.0012407285404195305, 'weight_decay': 0.009000000000000001, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3854,0.424968,0.800459,0.806293,0.801697,0.799902
2,0.2839,0.406973,0.84633,0.84988,0.845394,0.845647
3,0.2131,0.433127,0.841743,0.843795,0.841016,0.841262
4,0.1577,0.486943,0.840596,0.845685,0.839469,0.83965
5,0.1145,0.512266,0.863532,0.864061,0.86317,0.863359
6,0.0831,0.579544,0.837156,0.843642,0.83588,0.835975
7,0.0575,0.709318,0.848624,0.850487,0.847941,0.848201
8,0.0397,0.682439,0.862385,0.862728,0.862086,0.862243
9,0.0269,0.787524,0.864679,0.864834,0.864465,0.864576
10,0.0184,0.902209,0.855505,0.855577,0.85533,0.855413


[I 2025-03-23 03:05:13,138] Trial 47 finished with value: 0.8564699778647737 and parameters: {'learning_rate': 0.0012407285404195305, 'weight_decay': 0.009000000000000001, 'warmup_steps': 35}. Best is trial 34 with value: 0.8657619572039268.


Trial 48 with params: {'learning_rate': 0.0002138037226393511, 'weight_decay': 0.007, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4307,0.446653,0.78555,0.79992,0.78752,0.783684
2,0.3464,0.426452,0.808486,0.810603,0.807685,0.807829
3,0.3175,0.429125,0.803899,0.805049,0.804443,0.803855
4,0.2938,0.4798,0.816514,0.825799,0.814936,0.814626
5,0.2687,0.433579,0.821101,0.82148,0.82142,0.8211
6,0.2446,0.44458,0.824541,0.825905,0.823914,0.824114
7,0.2244,0.411647,0.818807,0.818807,0.818915,0.818792
8,0.2074,0.470422,0.819954,0.820482,0.819536,0.819696
9,0.191,0.493495,0.833716,0.833865,0.833933,0.833714
10,0.1806,0.47456,0.819954,0.823059,0.820841,0.819755


[I 2025-03-23 03:07:07,178] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.003031230103542952, 'weight_decay': 0.01, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3663,0.409426,0.807339,0.815014,0.808748,0.806598
2,0.245,0.380726,0.848624,0.852544,0.847647,0.847904
3,0.1642,0.442792,0.847477,0.848781,0.846899,0.847139
4,0.1087,0.521223,0.84289,0.848021,0.841763,0.841957
5,0.0682,0.638006,0.840596,0.843909,0.83968,0.839912


[I 2025-03-23 03:08:04,265] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 0.001573192104242805, 'weight_decay': 0.01, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3784,0.425311,0.797018,0.804258,0.798402,0.796266
2,0.273,0.391213,0.84289,0.845079,0.842142,0.842392
3,0.1977,0.42808,0.840596,0.8436,0.839722,0.83996
4,0.1436,0.499864,0.844037,0.847891,0.843058,0.843295
5,0.1008,0.589102,0.848624,0.849648,0.84811,0.848336
6,0.0687,0.591372,0.854358,0.858912,0.853319,0.853594
7,0.0446,0.586646,0.848624,0.848573,0.848573,0.848573
8,0.0299,0.773192,0.852064,0.852334,0.851783,0.851922
9,0.02,0.927895,0.863532,0.863498,0.863465,0.86348
10,0.0137,1.073517,0.856651,0.857161,0.856287,0.85647


[I 2025-03-23 03:10:08,325] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.0014174043265211445, 'weight_decay': 0.008, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3805,0.414453,0.799312,0.803166,0.800318,0.798988
2,0.2775,0.414588,0.830275,0.833611,0.829334,0.829521
3,0.203,0.424761,0.830275,0.834983,0.829166,0.829297
4,0.1487,0.443255,0.841743,0.846657,0.840637,0.840831
5,0.1035,0.491291,0.860092,0.86117,0.859582,0.859826
6,0.0732,0.524308,0.853211,0.855627,0.852446,0.852727
7,0.0486,0.626946,0.856651,0.857808,0.856119,0.856364
8,0.033,0.739398,0.858945,0.859927,0.858455,0.858691
9,0.022,0.808902,0.862385,0.862435,0.862549,0.862379
10,0.0148,1.006265,0.858945,0.858991,0.858792,0.858863


[I 2025-03-23 03:13:23,443] Trial 51 finished with value: 0.8577615011023589 and parameters: {'learning_rate': 0.0014174043265211445, 'weight_decay': 0.008, 'warmup_steps': 17}. Best is trial 34 with value: 0.8657619572039268.


Trial 52 with params: {'learning_rate': 0.001505210848641366, 'weight_decay': 0.01, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3811,0.411772,0.807339,0.813281,0.80858,0.806802
2,0.2729,0.403542,0.837156,0.838708,0.836512,0.83674
3,0.1997,0.431783,0.824541,0.831389,0.823198,0.823162
4,0.1442,0.472833,0.845183,0.849589,0.844142,0.844371
5,0.101,0.526847,0.858945,0.861013,0.858245,0.858534
6,0.0699,0.623441,0.83945,0.844715,0.838301,0.838467
7,0.0472,0.617515,0.857798,0.857798,0.857919,0.857786
8,0.0306,0.944165,0.850917,0.850917,0.851036,0.850905
9,0.0205,0.834597,0.849771,0.84973,0.849699,0.849714
10,0.015,0.957574,0.847477,0.847648,0.847236,0.847352


[I 2025-03-23 03:15:05,808] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.0009474942895377063, 'weight_decay': 0.009000000000000001, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3878,0.431174,0.797018,0.805646,0.798529,0.796085
2,0.2943,0.415648,0.830275,0.835364,0.829124,0.829237
3,0.2286,0.425439,0.830275,0.837502,0.828913,0.828906
4,0.176,0.442438,0.83945,0.842907,0.838511,0.838736
5,0.1335,0.435918,0.858945,0.858899,0.859003,0.858923
6,0.1031,0.501847,0.844037,0.847235,0.843142,0.843391
7,0.0749,0.543204,0.856651,0.857808,0.856119,0.856364
8,0.0542,0.556651,0.856651,0.856647,0.85654,0.856583
9,0.0386,0.674064,0.856651,0.856632,0.85675,0.856636
10,0.0263,0.773656,0.856651,0.856647,0.85654,0.856583


[I 2025-03-23 03:18:06,138] Trial 53 finished with value: 0.8542732810223925 and parameters: {'learning_rate': 0.0009474942895377063, 'weight_decay': 0.009000000000000001, 'warmup_steps': 18}. Best is trial 34 with value: 0.8657619572039268.


Trial 54 with params: {'learning_rate': 0.000403916017640712, 'weight_decay': 0.0, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4099,0.439651,0.783257,0.79352,0.784931,0.78197
2,0.3236,0.41365,0.821101,0.824949,0.820072,0.820192
3,0.2782,0.414652,0.819954,0.822442,0.819115,0.819287
4,0.24,0.495318,0.823394,0.83292,0.821819,0.821577
5,0.2038,0.477178,0.836009,0.837645,0.836649,0.835947


[I 2025-03-23 03:19:08,497] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 0.0008953750478722926, 'weight_decay': 0.0, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3873,0.438187,0.792431,0.806464,0.794361,0.790713
2,0.2972,0.403719,0.836009,0.839582,0.835049,0.835255
3,0.2321,0.421686,0.823394,0.830439,0.82203,0.82197
4,0.1803,0.447531,0.845183,0.84766,0.844395,0.844652
5,0.139,0.444622,0.860092,0.860141,0.860255,0.860085
6,0.1065,0.505125,0.850917,0.854528,0.849983,0.850255
7,0.0803,0.544551,0.852064,0.853197,0.85153,0.851768
8,0.0579,0.562435,0.862385,0.8625,0.862592,0.862382
9,0.0418,0.719975,0.850917,0.851728,0.851372,0.850905
10,0.0287,0.809252,0.856651,0.856605,0.856708,0.856629


[I 2025-03-23 03:23:18,119] Trial 55 finished with value: 0.8634011492058895 and parameters: {'learning_rate': 0.0008953750478722926, 'weight_decay': 0.0, 'warmup_steps': 17}. Best is trial 34 with value: 0.8657619572039268.


Trial 56 with params: {'learning_rate': 0.0016225415045434041, 'weight_decay': 0.001, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3759,0.425087,0.795872,0.804714,0.797403,0.794901
2,0.2737,0.391853,0.845183,0.846282,0.844647,0.844873
3,0.1976,0.444801,0.837156,0.841987,0.836049,0.836218
4,0.1398,0.475702,0.84289,0.843361,0.842521,0.842691
5,0.0967,0.492095,0.864679,0.865146,0.864339,0.864519
6,0.0664,0.535901,0.857798,0.863418,0.856656,0.856928
7,0.0436,0.68868,0.855505,0.855505,0.855624,0.855492
8,0.0302,0.804867,0.850917,0.850931,0.850783,0.850839
9,0.0194,0.882382,0.855505,0.855619,0.855709,0.855502
10,0.0134,0.969027,0.854358,0.854339,0.854456,0.854342


[I 2025-03-23 03:25:22,293] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 0.0012099413816211812, 'weight_decay': 0.002, 'warmup_steps': 12}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3821,0.424566,0.801606,0.807261,0.802823,0.801076
2,0.286,0.401468,0.838303,0.840712,0.837512,0.837748
3,0.2149,0.399866,0.833716,0.83608,0.832923,0.833145
4,0.1593,0.459839,0.832569,0.837318,0.83146,0.831604
5,0.1168,0.472372,0.858945,0.858991,0.858792,0.858863
6,0.0852,0.553128,0.84633,0.852581,0.8451,0.845275
7,0.0589,0.557504,0.862385,0.862625,0.862128,0.862263
8,0.0402,0.671451,0.854358,0.854999,0.853951,0.854149
9,0.0279,0.763967,0.855505,0.85545,0.85554,0.855477
10,0.0186,0.857579,0.855505,0.855902,0.855835,0.855504


[I 2025-03-23 03:28:30,196] Trial 57 finished with value: 0.8497261757079768 and parameters: {'learning_rate': 0.0012099413816211812, 'weight_decay': 0.002, 'warmup_steps': 12}. Best is trial 34 with value: 0.8657619572039268.


Trial 58 with params: {'learning_rate': 0.0014044945606658866, 'weight_decay': 0.001, 'warmup_steps': 21}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3797,0.416715,0.802752,0.808231,0.803949,0.802249
2,0.2765,0.41083,0.833716,0.836352,0.832881,0.833099
3,0.2035,0.410512,0.824541,0.830108,0.823324,0.82337
4,0.1493,0.481006,0.838303,0.841586,0.837385,0.837609
5,0.1065,0.51547,0.863532,0.864724,0.863002,0.863259
6,0.0757,0.560214,0.852064,0.855203,0.851193,0.851473
7,0.0497,0.575311,0.852064,0.852561,0.851699,0.851877
8,0.0333,0.733213,0.860092,0.860045,0.860045,0.860045
9,0.0228,0.839238,0.849771,0.850397,0.849362,0.849555
10,0.0156,0.94988,0.849771,0.849711,0.849783,0.849737


[I 2025-03-23 03:30:24,621] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.0013518426086071456, 'weight_decay': 0.0, 'warmup_steps': 2}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3753,0.424422,0.792431,0.800948,0.79394,0.791477
2,0.2799,0.401493,0.836009,0.836348,0.83568,0.835828
3,0.2056,0.436067,0.830275,0.834983,0.829166,0.829297
4,0.1485,0.467833,0.837156,0.838708,0.836512,0.83674
5,0.1053,0.491252,0.858945,0.859927,0.858455,0.858691
6,0.0732,0.593962,0.83945,0.843584,0.838427,0.838634
7,0.0493,0.536211,0.855505,0.855456,0.855456,0.855456
8,0.0325,0.749286,0.857798,0.857744,0.857834,0.857771
9,0.0224,0.829068,0.853211,0.853162,0.853162,0.853162
10,0.015,0.821828,0.857798,0.857798,0.857919,0.857786


[I 2025-03-23 03:33:42,309] Trial 59 finished with value: 0.8542379886253404 and parameters: {'learning_rate': 0.0013518426086071456, 'weight_decay': 0.0, 'warmup_steps': 2}. Best is trial 34 with value: 0.8657619572039268.


Trial 60 with params: {'learning_rate': 0.0006260084840194911, 'weight_decay': 0.001, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.397,0.440078,0.792431,0.805844,0.794319,0.7908
2,0.3112,0.407899,0.827982,0.830983,0.827082,0.827269
3,0.2549,0.419149,0.813073,0.820067,0.811684,0.811526
4,0.2093,0.467359,0.836009,0.839917,0.835007,0.835203
5,0.1694,0.44246,0.849771,0.850008,0.850036,0.84977
6,0.1373,0.544019,0.840596,0.847857,0.839259,0.839344
7,0.1118,0.470674,0.860092,0.86068,0.859708,0.859903
8,0.0911,0.507863,0.869266,0.869266,0.86939,0.869255
9,0.0704,0.665807,0.849771,0.85278,0.850625,0.849626
10,0.0553,0.679878,0.856651,0.856993,0.856961,0.856651


[I 2025-03-23 03:36:43,646] Trial 60 finished with value: 0.8622809973045822 and parameters: {'learning_rate': 0.0006260084840194911, 'weight_decay': 0.001, 'warmup_steps': 22}. Best is trial 34 with value: 0.8657619572039268.


Trial 61 with params: {'learning_rate': 0.0003568622546478908, 'weight_decay': 0.0, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4106,0.436267,0.792431,0.798336,0.793687,0.791826
2,0.3294,0.415268,0.821101,0.825289,0.82003,0.820132
3,0.2874,0.409829,0.809633,0.809565,0.809611,0.809584
4,0.2519,0.47889,0.823394,0.831383,0.821946,0.821819
5,0.2181,0.436561,0.827982,0.830708,0.828808,0.827829


[I 2025-03-23 03:37:42,217] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.0007197810633371478, 'weight_decay': 0.0, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3909,0.437561,0.788991,0.799696,0.790688,0.7877
2,0.3064,0.413837,0.832569,0.835627,0.83167,0.831875
3,0.247,0.436091,0.833716,0.839069,0.832544,0.832668
4,0.1978,0.472775,0.830275,0.833305,0.829376,0.829572
5,0.1586,0.454349,0.853211,0.85387,0.853625,0.853204
6,0.126,0.558986,0.84289,0.849285,0.841637,0.841781
7,0.1016,0.507441,0.854358,0.855152,0.853909,0.854123
8,0.0785,0.553873,0.858945,0.858899,0.859003,0.858923
9,0.0605,0.649572,0.854358,0.856545,0.855088,0.854273
10,0.0449,0.692221,0.856651,0.85711,0.857003,0.85665


[I 2025-03-23 03:41:25,426] Trial 62 finished with value: 0.8623672206701706 and parameters: {'learning_rate': 0.0007197810633371478, 'weight_decay': 0.0, 'warmup_steps': 14}. Best is trial 34 with value: 0.8657619572039268.


Trial 63 with params: {'learning_rate': 0.0006645572191508551, 'weight_decay': 0.0, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3933,0.433746,0.792431,0.800021,0.793856,0.791602
2,0.3084,0.412835,0.832569,0.836949,0.831502,0.831662
3,0.2513,0.403818,0.840596,0.843028,0.839806,0.840049
4,0.2049,0.445885,0.840596,0.842516,0.83989,0.840132
5,0.1653,0.430451,0.853211,0.853211,0.85333,0.853199
6,0.1318,0.511131,0.844037,0.849391,0.84289,0.843082
7,0.1074,0.473484,0.854358,0.857523,0.853488,0.853776
8,0.0854,0.551396,0.861239,0.862732,0.861844,0.861197
9,0.0663,0.666193,0.848624,0.851189,0.849415,0.848509
10,0.0506,0.709625,0.863532,0.864444,0.864012,0.863518


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-23 03:44:46,336] Trial 63 finished with value: 0.8634198476095709 and parameters: {'learning_rate': 0.0006645572191508551, 'weight_decay': 0.0, 'warmup_steps': 15}. Best is trial 34 with value: 0.8657619572039268.


Trial 64 with params: {'learning_rate': 0.0007728867189874863, 'weight_decay': 0.0, 'warmup_steps': 17}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3906,0.438099,0.790138,0.804707,0.792109,0.788311
2,0.3022,0.397254,0.837156,0.838708,0.836512,0.83674
3,0.2408,0.420188,0.831422,0.837994,0.830123,0.830166
4,0.192,0.46459,0.840596,0.844233,0.839638,0.839863
5,0.1519,0.442823,0.856651,0.856674,0.856793,0.856642
6,0.1188,0.570186,0.833716,0.84388,0.832123,0.831965
7,0.0932,0.497061,0.855505,0.856385,0.855035,0.855258
8,0.0697,0.584568,0.855505,0.858107,0.856298,0.855395
9,0.0521,0.706586,0.853211,0.854378,0.853751,0.853183
10,0.0383,0.751429,0.863532,0.863555,0.863676,0.863523


[I 2025-03-23 03:47:39,097] Trial 64 finished with value: 0.8588630989429471 and parameters: {'learning_rate': 0.0007728867189874863, 'weight_decay': 0.0, 'warmup_steps': 17}. Best is trial 34 with value: 0.8657619572039268.


Trial 65 with params: {'learning_rate': 0.0003714952949369485, 'weight_decay': 0.003, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4098,0.436069,0.794725,0.799901,0.795898,0.794224
2,0.3285,0.416177,0.818807,0.824452,0.817568,0.817564
3,0.2853,0.41357,0.813073,0.813462,0.812695,0.812837
4,0.249,0.482745,0.824541,0.833317,0.82303,0.822858
5,0.215,0.435192,0.830275,0.833928,0.831228,0.830046


[I 2025-03-23 03:48:43,724] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 0.0006302828157209924, 'weight_decay': 0.0, 'warmup_steps': 11}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3938,0.430873,0.799312,0.807048,0.800739,0.79851
2,0.3109,0.406403,0.830275,0.833611,0.829334,0.829521
3,0.2545,0.403072,0.829128,0.830732,0.82846,0.828672
4,0.2087,0.451197,0.836009,0.837247,0.835428,0.835646
5,0.1694,0.447849,0.857798,0.85774,0.857792,0.857762
6,0.1378,0.524371,0.834862,0.843191,0.833418,0.833389
7,0.1133,0.464519,0.852064,0.852698,0.851657,0.851852
8,0.091,0.539611,0.855505,0.855698,0.855751,0.855504
9,0.0721,0.663719,0.852064,0.856812,0.85313,0.851797
10,0.0561,0.662572,0.849771,0.850837,0.850288,0.849747


[I 2025-03-23 03:50:36,129] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.0011231985463623071, 'weight_decay': 0.0, 'warmup_steps': 16}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.384,0.430871,0.793578,0.802858,0.79515,0.792529
2,0.2866,0.412375,0.834862,0.839653,0.833754,0.833911
3,0.2183,0.418272,0.827982,0.834692,0.826661,0.826665
4,0.1653,0.490325,0.83945,0.844322,0.838343,0.838524
5,0.1218,0.452276,0.862385,0.862351,0.862465,0.862367
6,0.0905,0.549046,0.850917,0.8542,0.850025,0.8503
7,0.0632,0.566478,0.862385,0.862385,0.862507,0.862374
8,0.043,0.710199,0.848624,0.848573,0.848573,0.848573
9,0.0303,0.778838,0.850917,0.850863,0.850951,0.850889
10,0.0193,1.076854,0.852064,0.852216,0.852288,0.852062


[I 2025-03-23 03:52:33,962] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.0009987240089048446, 'weight_decay': 0.0, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3866,0.427033,0.799312,0.805328,0.80057,0.798727
2,0.292,0.418152,0.840596,0.843909,0.83968,0.839912
3,0.2264,0.425186,0.833716,0.839478,0.832502,0.832606
4,0.1731,0.465896,0.831422,0.834033,0.830586,0.830797
5,0.1311,0.419504,0.864679,0.864679,0.864802,0.864668
6,0.0998,0.575646,0.832569,0.842975,0.830955,0.830766
7,0.0721,0.590509,0.850917,0.853593,0.850109,0.850385
8,0.0501,0.609632,0.860092,0.86024,0.859876,0.859986
9,0.0353,0.73479,0.857798,0.857992,0.858045,0.857797
10,0.0242,0.846951,0.853211,0.853152,0.853204,0.853173


[I 2025-03-23 03:54:28,337] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.0005764125026309609, 'weight_decay': 0.002, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4004,0.438163,0.795872,0.806251,0.797529,0.794695
2,0.3144,0.40806,0.827982,0.832285,0.826913,0.82705
3,0.2602,0.418657,0.815367,0.819291,0.814315,0.814398
4,0.2157,0.442102,0.836009,0.836732,0.835554,0.835745
5,0.1771,0.452168,0.852064,0.852216,0.852288,0.852062
6,0.1445,0.554859,0.841743,0.847465,0.840553,0.840717
7,0.12,0.477895,0.862385,0.862846,0.862044,0.862222
8,0.0989,0.496638,0.865826,0.865906,0.866012,0.865821
9,0.0781,0.698871,0.83945,0.845458,0.840658,0.83904
10,0.0644,0.672058,0.861239,0.861391,0.861465,0.861237


[I 2025-03-23 03:57:25,657] Trial 69 finished with value: 0.8508044137466307 and parameters: {'learning_rate': 0.0005764125026309609, 'weight_decay': 0.002, 'warmup_steps': 27}. Best is trial 34 with value: 0.8657619572039268.


Trial 70 with params: {'learning_rate': 0.0004110254079797419, 'weight_decay': 0.001, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4054,0.439637,0.786697,0.793518,0.788057,0.785936
2,0.3266,0.414452,0.823394,0.829557,0.822114,0.822113
3,0.2814,0.413872,0.824541,0.827356,0.823661,0.82384
4,0.2425,0.451152,0.826835,0.835175,0.825366,0.825252
5,0.2061,0.449392,0.83945,0.840408,0.839943,0.839428
6,0.1768,0.476642,0.844037,0.846108,0.843311,0.843563
7,0.1528,0.469864,0.833716,0.835139,0.833091,0.83331
8,0.1336,0.488999,0.860092,0.860381,0.860381,0.860092
9,0.1157,0.64018,0.849771,0.852484,0.850583,0.849647
10,0.1011,0.55728,0.848624,0.850417,0.849289,0.848559


[I 2025-03-23 03:59:19,179] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.0007732653123861305, 'weight_decay': 0.002, 'warmup_steps': 18}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3913,0.43989,0.78555,0.802567,0.787688,0.783293
2,0.3035,0.406189,0.83945,0.842907,0.838511,0.838736
3,0.2423,0.41697,0.838303,0.843748,0.837133,0.837284
4,0.1923,0.468973,0.832569,0.835055,0.831755,0.831971
5,0.1511,0.441072,0.856651,0.856674,0.856793,0.856642
6,0.1185,0.542332,0.836009,0.84182,0.834796,0.834915
7,0.0929,0.469728,0.853211,0.853923,0.852783,0.852988
8,0.0701,0.575948,0.861239,0.861261,0.861381,0.86123
9,0.0513,0.678995,0.862385,0.862464,0.862213,0.862298
10,0.0374,0.731445,0.857798,0.858322,0.858171,0.857795


[I 2025-03-23 04:02:09,476] Trial 71 finished with value: 0.8564699778647737 and parameters: {'learning_rate': 0.0007732653123861305, 'weight_decay': 0.002, 'warmup_steps': 18}. Best is trial 34 with value: 0.8657619572039268.


Trial 72 with params: {'learning_rate': 0.0006625218480325202, 'weight_decay': 0.0, 'warmup_steps': 16}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3935,0.448484,0.78555,0.802567,0.787688,0.783293
2,0.3091,0.410441,0.830275,0.834266,0.82925,0.829413
3,0.2516,0.415197,0.834862,0.838259,0.833923,0.834129
4,0.2048,0.466523,0.836009,0.84182,0.834796,0.834915
5,0.1646,0.443286,0.852064,0.852216,0.852288,0.852062
6,0.1322,0.548582,0.841743,0.849766,0.840343,0.8404
7,0.1071,0.491786,0.854358,0.856144,0.853698,0.853969
8,0.0848,0.530815,0.864679,0.864874,0.864928,0.864678
9,0.0656,0.664066,0.854358,0.856292,0.855045,0.854289
10,0.0502,0.708534,0.848624,0.849781,0.849162,0.848595


[I 2025-03-23 04:04:10,587] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 5.953168512495511e-05, 'weight_decay': 0.01, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5049,0.464502,0.774083,0.774277,0.773733,0.773832
2,0.3832,0.461945,0.77867,0.781448,0.77769,0.777652
3,0.3674,0.460804,0.788991,0.793282,0.787825,0.7877
4,0.3572,0.459055,0.791284,0.79708,0.789951,0.789687
5,0.3484,0.42869,0.790138,0.790091,0.790004,0.790038


[I 2025-03-23 04:05:11,802] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.0004337409147184581, 'weight_decay': 0.001, 'warmup_steps': 15}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4049,0.439585,0.78555,0.792993,0.786973,0.784694
2,0.3247,0.413422,0.819954,0.824297,0.818862,0.818948
3,0.2781,0.41324,0.818807,0.821417,0.817946,0.81811
4,0.2386,0.474695,0.823394,0.831879,0.821904,0.82174
5,0.202,0.455178,0.837156,0.837799,0.837564,0.837148
6,0.1726,0.54286,0.827982,0.833026,0.826829,0.826929
7,0.1488,0.485278,0.838303,0.840451,0.837554,0.837791
8,0.1298,0.494162,0.853211,0.853261,0.853372,0.853204
9,0.1105,0.617164,0.84289,0.845564,0.8437,0.842761
10,0.0958,0.558758,0.83945,0.842246,0.84028,0.839307


[I 2025-03-23 04:07:08,836] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 0.003446225919555679, 'weight_decay': 0.0, 'warmup_steps': 10}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3574,0.42325,0.818807,0.821758,0.819672,0.81862
2,0.2343,0.392752,0.84289,0.846901,0.84189,0.842117
3,0.1553,0.431135,0.847477,0.84743,0.847531,0.847453
4,0.1017,0.54388,0.836009,0.839582,0.835049,0.835255
5,0.0662,0.596481,0.850917,0.852141,0.850362,0.850603
6,0.0399,0.80798,0.833716,0.83494,0.833133,0.833347
7,0.0288,0.944302,0.852064,0.852025,0.851993,0.852008
8,0.0178,1.087392,0.848624,0.849319,0.848194,0.848393
9,0.0142,1.174537,0.855505,0.85545,0.85554,0.855477
10,0.0101,1.250872,0.841743,0.841947,0.841479,0.841602


[I 2025-03-23 04:08:51,109] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 5.7423270605816206e-05, 'weight_decay': 0.007, 'warmup_steps': 14}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5023,0.467108,0.774083,0.77482,0.773522,0.77363
2,0.3842,0.464063,0.774083,0.777339,0.773017,0.772897
3,0.3685,0.461563,0.786697,0.791634,0.785447,0.785233
4,0.3584,0.460142,0.790138,0.795711,0.788825,0.788574
5,0.3498,0.428517,0.792431,0.792418,0.792256,0.792311
6,0.3419,0.428283,0.793578,0.795385,0.794266,0.793469
7,0.3371,0.432527,0.792431,0.793924,0.793056,0.792352
8,0.3319,0.431223,0.797018,0.797439,0.79735,0.797016
9,0.3272,0.433899,0.802752,0.802801,0.802896,0.802743
10,0.3214,0.440866,0.795872,0.803763,0.797318,0.795026


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-23 04:11:14,991] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.0005663297775827886, 'weight_decay': 0.0, 'warmup_steps': 20}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3996,0.437249,0.793578,0.80498,0.795319,0.792239
2,0.3149,0.411128,0.826835,0.830605,0.825829,0.825983
3,0.261,0.413763,0.817661,0.822321,0.816526,0.816578
4,0.2168,0.451566,0.838303,0.839035,0.837848,0.838042
5,0.1776,0.429531,0.853211,0.853211,0.85333,0.853199
6,0.1451,0.550736,0.84633,0.852147,0.845142,0.845334
7,0.1203,0.477995,0.847477,0.850261,0.846647,0.846912
8,0.1,0.47309,0.864679,0.86497,0.86497,0.864679
9,0.08,0.655878,0.84289,0.845855,0.843742,0.842739
10,0.066,0.626916,0.855505,0.856676,0.856045,0.855477


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 04:15:13,540] Trial 77 finished with value: 0.8541227858722613 and parameters: {'learning_rate': 0.0005663297775827886, 'weight_decay': 0.0, 'warmup_steps': 20}. Best is trial 34 with value: 0.8657619572039268.


Trial 78 with params: {'learning_rate': 0.0007141118917986414, 'weight_decay': 0.002, 'warmup_steps': 11}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.391,0.436162,0.786697,0.797331,0.788394,0.785392
2,0.3065,0.407159,0.827982,0.831937,0.826955,0.827108
3,0.2468,0.411783,0.831422,0.833764,0.830629,0.830843
4,0.1984,0.476067,0.836009,0.838959,0.835133,0.835354
5,0.1585,0.459577,0.849771,0.849922,0.849994,0.849769
6,0.1265,0.572331,0.837156,0.84773,0.835543,0.835403
7,0.1022,0.480029,0.845183,0.846474,0.844605,0.84484
8,0.0789,0.607989,0.854358,0.854596,0.854624,0.854358
9,0.0617,0.653836,0.849771,0.850661,0.850246,0.849755
10,0.0457,0.747716,0.848624,0.849978,0.849204,0.848585


[I 2025-03-23 04:17:51,947] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.0019885055835747547, 'weight_decay': 0.003, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3768,0.410413,0.802752,0.809029,0.804033,0.802153
2,0.2602,0.383218,0.84633,0.847945,0.845689,0.845938
3,0.1819,0.404209,0.849771,0.850141,0.849446,0.849604
4,0.1274,0.514679,0.838303,0.842597,0.837259,0.837454
5,0.0842,0.594084,0.840596,0.841337,0.840143,0.840339
6,0.0558,0.702332,0.845183,0.846104,0.844689,0.844904
7,0.0378,0.70801,0.84289,0.84304,0.843111,0.842888
8,0.0244,0.964752,0.837156,0.837552,0.836806,0.836963
9,0.0163,1.06252,0.84289,0.843612,0.843321,0.84288
10,0.0109,1.210626,0.848624,0.848757,0.848404,0.848509


[I 2025-03-23 04:20:13,015] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 0.0006584993114807881, 'weight_decay': 0.008, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3976,0.448344,0.788991,0.803194,0.79094,0.7872
2,0.3106,0.40131,0.832569,0.835334,0.831713,0.831924
3,0.2537,0.409713,0.830275,0.83576,0.829081,0.829174
4,0.206,0.440606,0.841743,0.842414,0.841311,0.841502
5,0.1656,0.444473,0.855505,0.856027,0.855877,0.855502
6,0.1328,0.595104,0.836009,0.84412,0.834586,0.834582
7,0.1073,0.469023,0.864679,0.86476,0.864507,0.864593
8,0.0847,0.5069,0.873853,0.873969,0.874063,0.873851
9,0.0645,0.680558,0.854358,0.857397,0.855214,0.854218
10,0.0503,0.690964,0.857798,0.857798,0.857919,0.857786


[I 2025-03-23 04:23:07,361] Trial 80 finished with value: 0.865747825823779 and parameters: {'learning_rate': 0.0006584993114807881, 'weight_decay': 0.008, 'warmup_steps': 29}. Best is trial 34 with value: 0.8657619572039268.


Trial 81 with params: {'learning_rate': 0.00029971825483957623, 'weight_decay': 0.008, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4185,0.432283,0.797018,0.802227,0.798192,0.796524
2,0.335,0.420372,0.809633,0.813629,0.808559,0.808602
3,0.2981,0.413749,0.809633,0.811287,0.810285,0.809552
4,0.267,0.471008,0.822248,0.829037,0.820904,0.820851
5,0.2346,0.44866,0.826835,0.827857,0.827345,0.826807
6,0.207,0.487736,0.823394,0.829557,0.822114,0.822113
7,0.1853,0.454534,0.831422,0.832633,0.830839,0.831049
8,0.1681,0.503782,0.84633,0.848175,0.845647,0.845901
9,0.1508,0.554317,0.848624,0.849141,0.848994,0.848621
10,0.1384,0.512086,0.827982,0.833419,0.829145,0.827582


[I 2025-03-23 04:24:58,478] Trial 81 pruned. 


Trial 82 with params: {'learning_rate': 0.0012315261197628753, 'weight_decay': 0.005, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3885,0.434288,0.795872,0.805724,0.797487,0.794766
2,0.2835,0.406535,0.833716,0.836639,0.832839,0.833051
3,0.2125,0.415051,0.83945,0.843238,0.838469,0.838686
4,0.1576,0.462014,0.83945,0.842907,0.838511,0.838736
5,0.1132,0.460024,0.853211,0.853186,0.853119,0.853148
6,0.0831,0.565503,0.844037,0.847891,0.843058,0.843295
7,0.0555,0.550444,0.864679,0.865208,0.865054,0.864676
8,0.0379,0.775269,0.856651,0.856632,0.85675,0.856636
9,0.0263,0.886634,0.856651,0.856596,0.856624,0.856609
10,0.0181,0.994553,0.860092,0.860206,0.860297,0.860089


[I 2025-03-23 04:27:44,948] Trial 82 finished with value: 0.8496992854163624 and parameters: {'learning_rate': 0.0012315261197628753, 'weight_decay': 0.005, 'warmup_steps': 40}. Best is trial 34 with value: 0.8657619572039268.


Trial 83 with params: {'learning_rate': 0.0008181427846361111, 'weight_decay': 0.008, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3936,0.449223,0.78555,0.803988,0.787773,0.783083
2,0.302,0.406193,0.834862,0.838925,0.833839,0.834023
3,0.2383,0.419898,0.833716,0.839478,0.832502,0.832606
4,0.1871,0.495692,0.83945,0.844322,0.838343,0.838524
5,0.1462,0.451533,0.858945,0.859405,0.859297,0.858943
6,0.1145,0.555066,0.84289,0.851719,0.841427,0.841453
7,0.0867,0.536962,0.868119,0.868532,0.867801,0.867973
8,0.0641,0.59683,0.862385,0.862464,0.862213,0.862298
9,0.0467,0.73141,0.861239,0.8617,0.861592,0.861237
10,0.0336,0.828225,0.852064,0.854509,0.852835,0.851961


[I 2025-03-23 04:30:54,777] Trial 83 finished with value: 0.865747825823779 and parameters: {'learning_rate': 0.0008181427846361111, 'weight_decay': 0.008, 'warmup_steps': 31}. Best is trial 34 with value: 0.8657619572039268.


Trial 84 with params: {'learning_rate': 0.0010977711800505714, 'weight_decay': 0.007, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3866,0.427895,0.800459,0.808926,0.801949,0.799572
2,0.2887,0.402574,0.832569,0.834543,0.831839,0.83206
3,0.22,0.421318,0.841743,0.845564,0.840764,0.84099
4,0.1663,0.465956,0.838303,0.841908,0.837343,0.837559
5,0.1248,0.460037,0.863532,0.863513,0.863633,0.863518
6,0.0923,0.508751,0.849771,0.854251,0.848731,0.848982
7,0.0651,0.610099,0.858945,0.861853,0.858119,0.858422
8,0.0449,0.593747,0.850917,0.850883,0.850994,0.850898
9,0.031,0.73546,0.863532,0.863513,0.863633,0.863518
10,0.0217,0.833621,0.858945,0.859185,0.859213,0.858945


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-23 04:34:00,407] Trial 84 finished with value: 0.8518770171564464 and parameters: {'learning_rate': 0.0010977711800505714, 'weight_decay': 0.007, 'warmup_steps': 31}. Best is trial 34 with value: 0.8657619572039268.


Trial 85 with params: {'learning_rate': 0.0008580120211274837, 'weight_decay': 0.007, 'warmup_steps': 22}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3897,0.434757,0.792431,0.802446,0.794066,0.791271
2,0.2987,0.421985,0.831422,0.834932,0.83046,0.830647
3,0.2357,0.431483,0.822248,0.826631,0.821156,0.821255
4,0.1838,0.464851,0.836009,0.839917,0.835007,0.835203
5,0.1428,0.452713,0.857798,0.85774,0.857792,0.857762
6,0.1112,0.529922,0.84289,0.847259,0.841848,0.842066
7,0.0838,0.567983,0.864679,0.868473,0.863749,0.864078
8,0.0613,0.608527,0.861239,0.861582,0.86155,0.861238
9,0.0457,0.687549,0.856651,0.856674,0.856793,0.856642
10,0.0316,0.856787,0.857798,0.858462,0.858213,0.857791


[I 2025-03-23 04:37:52,957] Trial 85 finished with value: 0.8554125542834583 and parameters: {'learning_rate': 0.0008580120211274837, 'weight_decay': 0.007, 'warmup_steps': 22}. Best is trial 34 with value: 0.8657619572039268.


Trial 86 with params: {'learning_rate': 0.0005801170896115007, 'weight_decay': 0.01, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4036,0.434141,0.795872,0.802444,0.797192,0.795198
2,0.3127,0.410718,0.832569,0.835935,0.831628,0.831825
3,0.257,0.419496,0.827982,0.831937,0.826955,0.827108
4,0.2119,0.463296,0.826835,0.829397,0.825998,0.826193
5,0.1744,0.429815,0.848624,0.848569,0.848657,0.848595


[I 2025-03-23 04:38:54,158] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.0005789832063929583, 'weight_decay': 0.007, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4021,0.435711,0.791284,0.799075,0.79273,0.79042
2,0.313,0.408815,0.832569,0.835935,0.831628,0.831825
3,0.2579,0.42464,0.818807,0.82262,0.817778,0.817887
4,0.2123,0.446464,0.834862,0.837372,0.834049,0.834273
5,0.1743,0.452108,0.84633,0.846522,0.846573,0.846329
6,0.142,0.516854,0.841743,0.846277,0.840679,0.840886
7,0.1166,0.502732,0.84633,0.850218,0.845352,0.845599
8,0.0954,0.500964,0.860092,0.860141,0.860255,0.860085
9,0.0761,0.659377,0.834862,0.838891,0.835859,0.834611
10,0.0617,0.688613,0.852064,0.852302,0.85233,0.852064


[I 2025-03-23 04:42:01,811] Trial 87 finished with value: 0.8530370620052562 and parameters: {'learning_rate': 0.0005789832063929583, 'weight_decay': 0.007, 'warmup_steps': 33}. Best is trial 34 with value: 0.8657619572039268.


Trial 88 with params: {'learning_rate': 0.0007832919195971522, 'weight_decay': 0.008, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3947,0.446925,0.779817,0.796196,0.781931,0.777551
2,0.3031,0.407187,0.830275,0.834983,0.829166,0.829297
3,0.2407,0.439577,0.832569,0.839856,0.831207,0.831218
4,0.1911,0.486486,0.841743,0.846277,0.840679,0.840886
5,0.1497,0.463606,0.852064,0.852302,0.85233,0.852064
6,0.1175,0.565905,0.837156,0.845553,0.835712,0.835703
7,0.0904,0.509226,0.865826,0.866361,0.865465,0.865656
8,0.0675,0.555051,0.858945,0.85985,0.859424,0.85893
9,0.0494,0.696171,0.856651,0.857243,0.857045,0.856647
10,0.0358,0.768455,0.861239,0.861983,0.861676,0.86123


[I 2025-03-23 04:44:42,481] Trial 88 finished with value: 0.8749406267968206 and parameters: {'learning_rate': 0.0007832919195971522, 'weight_decay': 0.008, 'warmup_steps': 31}. Best is trial 88 with value: 0.8749406267968206.


Trial 89 with params: {'learning_rate': 0.0005965683336988867, 'weight_decay': 0.007, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4,0.442545,0.78555,0.798691,0.787436,0.783865
2,0.3131,0.406672,0.829128,0.832606,0.828166,0.828343
3,0.2585,0.414076,0.829128,0.83439,0.827955,0.828052
4,0.2131,0.432668,0.84289,0.843977,0.842353,0.842575
5,0.1732,0.447145,0.852064,0.852144,0.852246,0.852059
6,0.1412,0.55435,0.84289,0.848848,0.841679,0.841841
7,0.1164,0.460215,0.856651,0.857623,0.856161,0.856393
8,0.0945,0.501764,0.869266,0.869462,0.869517,0.869265
9,0.0741,0.641138,0.83945,0.842854,0.840364,0.839259
10,0.0604,0.663453,0.860092,0.860286,0.860339,0.860091


[I 2025-03-23 04:47:51,409] Trial 89 finished with value: 0.8507635597682991 and parameters: {'learning_rate': 0.0005965683336988867, 'weight_decay': 0.007, 'warmup_steps': 30}. Best is trial 88 with value: 0.8749406267968206.


Trial 90 with params: {'learning_rate': 0.0011080059666330009, 'weight_decay': 0.009000000000000001, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3862,0.437811,0.792431,0.806464,0.794361,0.790713
2,0.2881,0.412462,0.832569,0.835334,0.831713,0.831924
3,0.219,0.418338,0.836009,0.839582,0.835049,0.835255
4,0.1662,0.480324,0.830275,0.837043,0.828955,0.828976
5,0.1251,0.456693,0.861239,0.8617,0.861592,0.861237
6,0.093,0.548752,0.841743,0.849766,0.840343,0.8404
7,0.0658,0.580084,0.857798,0.85803,0.85754,0.857672
8,0.0452,0.615645,0.864679,0.864633,0.864633,0.864633
9,0.0324,0.784039,0.862385,0.862435,0.862549,0.862379
10,0.0214,0.882586,0.847477,0.848932,0.848078,0.847432


[I 2025-03-23 04:49:48,762] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.0007264778396507315, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.397,0.456595,0.77867,0.798925,0.781016,0.77578
2,0.3066,0.397354,0.834862,0.837372,0.834049,0.834273
3,0.2461,0.419968,0.831422,0.837557,0.830165,0.830232
4,0.1978,0.456222,0.84633,0.848175,0.845647,0.845901
5,0.1569,0.444689,0.856651,0.85711,0.857003,0.85665
6,0.1241,0.554677,0.834862,0.84086,0.833628,0.833729
7,0.0978,0.485353,0.862385,0.862625,0.862128,0.862263
8,0.0741,0.530476,0.863532,0.863498,0.863465,0.86348
9,0.0556,0.640058,0.865826,0.866741,0.866307,0.865811
10,0.0411,0.707723,0.87156,0.871675,0.871769,0.871557


[I 2025-03-23 04:52:44,691] Trial 91 finished with value: 0.8715265171247113 and parameters: {'learning_rate': 0.0007264778396507315, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33}. Best is trial 88 with value: 0.8749406267968206.


Trial 92 with params: {'learning_rate': 0.0007160995611327389, 'weight_decay': 0.01, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3959,0.448439,0.779817,0.796877,0.781974,0.777446
2,0.3074,0.400346,0.834862,0.836856,0.834133,0.83436
3,0.2472,0.419626,0.832569,0.838516,0.831334,0.831419
4,0.1991,0.462223,0.840596,0.84186,0.840016,0.840243
5,0.1581,0.443374,0.853211,0.853261,0.853372,0.853204
6,0.1257,0.574882,0.840596,0.84739,0.839301,0.839408
7,0.1001,0.507281,0.854358,0.854302,0.85433,0.854315
8,0.0768,0.55251,0.864679,0.864729,0.864844,0.864672
9,0.0586,0.684799,0.852064,0.854242,0.852793,0.851978
10,0.0432,0.767173,0.857798,0.857798,0.857919,0.857786


[I 2025-03-23 04:55:37,484] Trial 92 finished with value: 0.8680107771016862 and parameters: {'learning_rate': 0.0007160995611327389, 'weight_decay': 0.01, 'warmup_steps': 30}. Best is trial 88 with value: 0.8749406267968206.


Trial 93 with params: {'learning_rate': 0.0006639354084882724, 'weight_decay': 0.008, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3991,0.445066,0.790138,0.804707,0.792109,0.788311
2,0.3088,0.402054,0.832569,0.835627,0.83167,0.831875
3,0.2503,0.427283,0.830275,0.836172,0.829039,0.82911
4,0.2028,0.454743,0.845183,0.84714,0.844479,0.844732
5,0.1635,0.438965,0.854358,0.854311,0.854414,0.854335
6,0.1307,0.537107,0.837156,0.845051,0.835754,0.835774
7,0.1064,0.496072,0.857798,0.858526,0.857371,0.857582
8,0.0833,0.500823,0.865826,0.86629,0.86618,0.865824
9,0.0635,0.652463,0.844037,0.845814,0.8447,0.84397
10,0.0489,0.705281,0.857798,0.857848,0.857961,0.857791


[I 2025-03-23 04:59:06,760] Trial 93 finished with value: 0.8554285353375861 and parameters: {'learning_rate': 0.0006639354084882724, 'weight_decay': 0.008, 'warmup_steps': 36}. Best is trial 88 with value: 0.8749406267968206.


Trial 94 with params: {'learning_rate': 0.00021760202822591963, 'weight_decay': 0.009000000000000001, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4357,0.447332,0.78555,0.799298,0.787478,0.783776
2,0.347,0.428839,0.811927,0.814201,0.811105,0.811255
3,0.3178,0.427815,0.809633,0.809818,0.809864,0.809632
4,0.2934,0.471441,0.803899,0.811084,0.802465,0.802192
5,0.2681,0.435769,0.822248,0.822946,0.822672,0.822236


[I 2025-03-23 05:00:01,731] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 0.0006212423201391912, 'weight_decay': 0.01, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3998,0.446176,0.78555,0.801212,0.787604,0.783493
2,0.3111,0.40455,0.829128,0.832606,0.828166,0.828343
3,0.2543,0.437284,0.819954,0.827602,0.818525,0.818387
4,0.2082,0.439048,0.844037,0.845223,0.843479,0.843708
5,0.1699,0.432443,0.854358,0.854698,0.854667,0.854358
6,0.1368,0.53637,0.83945,0.844715,0.838301,0.838467
7,0.1121,0.475294,0.858945,0.859601,0.85854,0.858743
8,0.0897,0.502011,0.863532,0.863513,0.863633,0.863518
9,0.0695,0.661315,0.841743,0.846195,0.842784,0.841473
10,0.0554,0.68098,0.862385,0.862385,0.862507,0.862374


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 05:03:12,275] Trial 95 finished with value: 0.8542564041823769 and parameters: {'learning_rate': 0.0006212423201391912, 'weight_decay': 0.01, 'warmup_steps': 33}. Best is trial 88 with value: 0.8749406267968206.


Trial 96 with params: {'learning_rate': 0.0005415381349143846, 'weight_decay': 0.009000000000000001, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4024,0.437057,0.799312,0.807048,0.800739,0.79851
2,0.3155,0.411103,0.829128,0.833275,0.828082,0.828232
3,0.2616,0.424313,0.819954,0.824658,0.81882,0.818885
4,0.218,0.440746,0.838303,0.839973,0.837638,0.837871
5,0.1796,0.452515,0.853211,0.853176,0.853288,0.853192
6,0.1481,0.541104,0.84289,0.848427,0.841721,0.8419
7,0.1234,0.468292,0.852064,0.853834,0.851404,0.851669
8,0.1024,0.499565,0.861239,0.862147,0.861718,0.861224
9,0.0834,0.645477,0.837156,0.840542,0.838069,0.836963
10,0.0689,0.637519,0.849771,0.850837,0.850288,0.849747


[I 2025-03-23 05:05:13,280] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.002538596903968981, 'weight_decay': 0.007, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3733,0.417754,0.806193,0.811154,0.807327,0.805763
2,0.2504,0.414301,0.844037,0.848992,0.842932,0.843138
3,0.1715,0.43401,0.840596,0.841192,0.840185,0.840368
4,0.1149,0.5531,0.833716,0.843325,0.832165,0.832044
5,0.0748,0.52624,0.856651,0.856891,0.856919,0.856651
6,0.0488,0.667972,0.840596,0.844573,0.839595,0.839812
7,0.0329,0.775842,0.850917,0.851054,0.850699,0.850804
8,0.0237,0.882461,0.841743,0.842275,0.841353,0.84153
9,0.0154,0.98494,0.84289,0.843142,0.842605,0.842739
10,0.0126,0.938379,0.84289,0.843054,0.842648,0.842761


[I 2025-03-23 05:07:40,242] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.0021748593765580906, 'weight_decay': 0.01, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3755,0.404413,0.811927,0.816071,0.812958,0.811606
2,0.2602,0.396905,0.84289,0.843977,0.842353,0.842575
3,0.1811,0.438062,0.83945,0.843238,0.838469,0.838686
4,0.1245,0.549523,0.827982,0.835619,0.826577,0.826522
5,0.0812,0.60317,0.84289,0.843977,0.842353,0.842575
6,0.0526,0.687446,0.83945,0.843238,0.838469,0.838686
7,0.035,0.754622,0.836009,0.836034,0.835849,0.835914
8,0.0242,0.825824,0.849771,0.851088,0.849194,0.849438
9,0.016,1.051646,0.838303,0.838646,0.837975,0.838124
10,0.0115,1.038994,0.844037,0.844007,0.843942,0.84397


[I 2025-03-23 05:09:31,096] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.000430349602549815, 'weight_decay': 0.008, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4072,0.4393,0.786697,0.794387,0.788141,0.785814
2,0.3225,0.411428,0.827982,0.830162,0.827208,0.827414
3,0.275,0.417583,0.823394,0.826951,0.822409,0.822554
4,0.2359,0.450873,0.826835,0.833296,0.825535,0.825544
5,0.199,0.451249,0.838303,0.839732,0.838901,0.838255


[I 2025-03-23 05:10:20,027] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.0008546574507332562, 'weight_decay': 0.01, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3918,0.445399,0.779817,0.794248,0.781805,0.777852
2,0.2978,0.406693,0.832569,0.835334,0.831713,0.831924
3,0.2332,0.422653,0.836009,0.841012,0.83488,0.835035
4,0.1822,0.50926,0.84633,0.855534,0.844847,0.844891
5,0.1416,0.452928,0.870413,0.870394,0.870517,0.870399
6,0.1097,0.561845,0.84633,0.854999,0.844889,0.844959
7,0.0818,0.528252,0.861239,0.861761,0.860876,0.861063
8,0.0612,0.581091,0.857798,0.857873,0.857624,0.857708
9,0.0435,0.702772,0.861239,0.861319,0.861423,0.861234
10,0.031,0.869597,0.862385,0.862435,0.862549,0.862379


[I 2025-03-23 05:13:38,368] Trial 100 finished with value: 0.8588779550092562 and parameters: {'learning_rate': 0.0008546574507332562, 'weight_decay': 0.01, 'warmup_steps': 30}. Best is trial 88 with value: 0.8749406267968206.


Trial 101 with params: {'learning_rate': 0.0005107783260989206, 'weight_decay': 0.008, 'warmup_steps': 25}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4028,0.439397,0.78555,0.796416,0.787267,0.784199
2,0.3179,0.415035,0.827982,0.832285,0.826913,0.82705
3,0.2669,0.417765,0.817661,0.824779,0.816273,0.816152
4,0.2245,0.429226,0.838303,0.839193,0.837806,0.838011
5,0.187,0.44613,0.847477,0.847458,0.847573,0.847461


[I 2025-03-23 05:14:52,335] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.000970276652470033, 'weight_decay': 0.008, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3869,0.431511,0.795872,0.806251,0.797529,0.794695
2,0.2939,0.410481,0.836009,0.83867,0.835175,0.835401
3,0.2288,0.430177,0.821101,0.829022,0.819651,0.819505
4,0.1756,0.477545,0.837156,0.840583,0.836217,0.836432
5,0.1332,0.428376,0.863532,0.863685,0.86376,0.86353
6,0.1021,0.537119,0.844037,0.853168,0.842553,0.842576
7,0.0737,0.571623,0.853211,0.856208,0.852362,0.852646
8,0.0515,0.597504,0.866972,0.866928,0.866928,0.866928
9,0.0376,0.701205,0.864679,0.865081,0.865012,0.864678
10,0.0252,0.822505,0.864679,0.864625,0.864718,0.864653


[I 2025-03-23 05:18:21,339] Trial 102 finished with value: 0.8634529168635017 and parameters: {'learning_rate': 0.000970276652470033, 'weight_decay': 0.008, 'warmup_steps': 24}. Best is trial 88 with value: 0.8749406267968206.


Trial 103 with params: {'learning_rate': 0.0009966291146806172, 'weight_decay': 0.008, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3879,0.427043,0.800459,0.807545,0.801823,0.799747
2,0.2924,0.408384,0.830275,0.832738,0.82946,0.82967
3,0.2251,0.425162,0.832569,0.835627,0.83167,0.831875
4,0.1729,0.44903,0.840596,0.84186,0.840016,0.840243
5,0.1309,0.478258,0.852064,0.852057,0.851951,0.851994
6,0.0995,0.568985,0.840596,0.849889,0.83909,0.839067
7,0.0723,0.598305,0.862385,0.864888,0.861623,0.861931
8,0.0507,0.583909,0.850917,0.850892,0.850825,0.850854
9,0.0356,0.841586,0.854358,0.854381,0.854498,0.854348
10,0.0246,0.930069,0.856651,0.856804,0.856877,0.85665


[I 2025-03-23 05:21:24,073] Trial 103 finished with value: 0.8633363844092217 and parameters: {'learning_rate': 0.0009966291146806172, 'weight_decay': 0.008, 'warmup_steps': 27}. Best is trial 88 with value: 0.8749406267968206.


Trial 104 with params: {'learning_rate': 0.001327983004182987, 'weight_decay': 0.006, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3823,0.429258,0.799312,0.804185,0.800444,0.798867
2,0.2814,0.404255,0.832569,0.834543,0.831839,0.83206
3,0.2086,0.440613,0.826835,0.832865,0.825577,0.825613
4,0.1535,0.489494,0.83945,0.843238,0.838469,0.838686
5,0.1111,0.465067,0.865826,0.867029,0.865296,0.865557
6,0.0791,0.627674,0.841743,0.852486,0.840132,0.840039
7,0.0535,0.60788,0.858945,0.859927,0.858455,0.858691
8,0.0369,0.634959,0.862385,0.862339,0.862339,0.862339
9,0.0248,0.82518,0.858945,0.858991,0.858792,0.858863
10,0.0169,0.946449,0.858945,0.859054,0.85875,0.858847


[I 2025-03-23 05:24:31,980] Trial 104 finished with value: 0.8577075931043558 and parameters: {'learning_rate': 0.001327983004182987, 'weight_decay': 0.006, 'warmup_steps': 26}. Best is trial 88 with value: 0.8749406267968206.


Trial 105 with params: {'learning_rate': 0.0007221737258359708, 'weight_decay': 0.009000000000000001, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3958,0.449775,0.786697,0.804124,0.788857,0.784401
2,0.307,0.407881,0.834862,0.83662,0.834175,0.834402
3,0.2468,0.412825,0.831422,0.83673,0.83025,0.83036
4,0.1979,0.457339,0.834862,0.837948,0.833965,0.834178
5,0.1564,0.443229,0.854358,0.854339,0.854456,0.854342
6,0.1243,0.571483,0.841743,0.851907,0.840174,0.840115
7,0.0975,0.510143,0.866972,0.868084,0.866465,0.866719
8,0.0752,0.516582,0.869266,0.870673,0.869853,0.869232
9,0.0561,0.726162,0.858945,0.85985,0.859424,0.85893
10,0.0419,0.731825,0.868119,0.869038,0.868601,0.868105


[I 2025-03-23 05:27:28,197] Trial 105 finished with value: 0.8726781239517768 and parameters: {'learning_rate': 0.0007221737258359708, 'weight_decay': 0.009000000000000001, 'warmup_steps': 28}. Best is trial 88 with value: 0.8749406267968206.


Trial 106 with params: {'learning_rate': 0.0012042322526873168, 'weight_decay': 0.007, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3841,0.430473,0.793578,0.803366,0.795192,0.79246
2,0.285,0.408927,0.833716,0.83694,0.832797,0.833002
3,0.2138,0.414317,0.836009,0.839917,0.835007,0.835203
4,0.1603,0.501112,0.833716,0.838298,0.832628,0.832786
5,0.1167,0.461349,0.868119,0.8681,0.868222,0.868105
6,0.0851,0.562064,0.838303,0.84648,0.83688,0.836895
7,0.058,0.589863,0.854358,0.855152,0.853909,0.854123
8,0.0391,0.67045,0.861239,0.861286,0.861087,0.861158
9,0.0269,0.787819,0.868119,0.8682,0.868306,0.868115
10,0.0186,0.836207,0.849771,0.850141,0.849446,0.849604


[I 2025-03-23 05:29:17,183] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 0.0008736542766103523, 'weight_decay': 0.008, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3917,0.443538,0.779817,0.794248,0.781805,0.777852
2,0.2972,0.407829,0.831422,0.834033,0.830586,0.830797
3,0.2328,0.413219,0.822248,0.83095,0.820735,0.820543
4,0.1813,0.490565,0.84289,0.851199,0.841469,0.841522
5,0.1396,0.433109,0.862385,0.862351,0.862465,0.862367
6,0.1076,0.56226,0.849771,0.857267,0.848436,0.84859
7,0.0806,0.543219,0.864679,0.865281,0.864297,0.864496
8,0.0584,0.595298,0.863532,0.863877,0.863844,0.863532
9,0.0427,0.671273,0.861239,0.862732,0.861844,0.861197
10,0.0305,0.802815,0.861239,0.861193,0.861297,0.861216


[I 2025-03-23 05:32:37,584] Trial 107 finished with value: 0.8749406267968206 and parameters: {'learning_rate': 0.0008736542766103523, 'weight_decay': 0.008, 'warmup_steps': 31}. Best is trial 88 with value: 0.8749406267968206.


Trial 108 with params: {'learning_rate': 0.0011839940776885481, 'weight_decay': 0.008, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.386,0.424801,0.806193,0.810793,0.807285,0.805804
2,0.286,0.403793,0.837156,0.838931,0.83647,0.836702
3,0.2153,0.422709,0.838303,0.840988,0.837469,0.837703
4,0.1611,0.478463,0.836009,0.840632,0.834923,0.835093
5,0.1183,0.486882,0.860092,0.860169,0.859918,0.860003
6,0.0865,0.514793,0.841743,0.848336,0.840469,0.840595
7,0.0601,0.678054,0.847477,0.849452,0.846773,0.847033
8,0.0418,0.58672,0.858945,0.858991,0.858792,0.858863
9,0.0279,0.782218,0.863532,0.863475,0.863549,0.863502
10,0.0196,0.888637,0.861239,0.861181,0.861255,0.861208


[I 2025-03-23 05:35:40,052] Trial 108 finished with value: 0.859985680592992 and parameters: {'learning_rate': 0.0011839940776885481, 'weight_decay': 0.008, 'warmup_steps': 34}. Best is trial 88 with value: 0.8749406267968206.


Trial 109 with params: {'learning_rate': 0.0004973162464999332, 'weight_decay': 0.009000000000000001, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4051,0.440479,0.788991,0.79865,0.790604,0.787848
2,0.318,0.4148,0.829128,0.831997,0.82825,0.828446
3,0.2672,0.429361,0.81422,0.820168,0.812937,0.812873
4,0.2251,0.449247,0.832569,0.834309,0.831881,0.832102
5,0.188,0.454054,0.84289,0.84304,0.843111,0.842888
6,0.1563,0.542179,0.852064,0.856213,0.851067,0.851337
7,0.1312,0.477536,0.853211,0.854447,0.852656,0.852901
8,0.1113,0.49914,0.860092,0.861475,0.860676,0.860056
9,0.0916,0.596496,0.845183,0.847871,0.845994,0.845056
10,0.0771,0.616951,0.84633,0.848883,0.84712,0.846214


[I 2025-03-23 05:37:26,873] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.0010150897293453322, 'weight_decay': 0.008, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.387,0.434338,0.800459,0.80992,0.802033,0.799445
2,0.291,0.41448,0.837156,0.839972,0.836301,0.836529
3,0.2242,0.427034,0.823394,0.830903,0.821988,0.821895
4,0.1714,0.47428,0.838303,0.843748,0.837133,0.837284
5,0.1296,0.460378,0.853211,0.853157,0.853246,0.853183
6,0.0983,0.547047,0.84633,0.853982,0.844973,0.845091
7,0.0704,0.643881,0.848624,0.852901,0.847605,0.847855
8,0.0487,0.688802,0.858945,0.858926,0.859045,0.85893
9,0.0344,0.79651,0.852064,0.852087,0.852204,0.852055
10,0.0241,0.893088,0.856651,0.857731,0.857171,0.856629


[I 2025-03-23 05:40:55,001] Trial 110 finished with value: 0.8577233204909578 and parameters: {'learning_rate': 0.0010150897293453322, 'weight_decay': 0.008, 'warmup_steps': 27}. Best is trial 88 with value: 0.8749406267968206.


Trial 111 with params: {'learning_rate': 0.0012026420239199128, 'weight_decay': 0.008, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.383,0.433263,0.791284,0.801527,0.79294,0.790082
2,0.2842,0.398672,0.836009,0.839917,0.835007,0.835203
3,0.2134,0.417365,0.831422,0.835965,0.830334,0.83048
4,0.1591,0.459407,0.840596,0.841497,0.840101,0.840309
5,0.1172,0.443527,0.862385,0.862625,0.862128,0.862263
6,0.0858,0.558429,0.852064,0.856213,0.851067,0.851337
7,0.0585,0.60264,0.864679,0.865026,0.864381,0.864539
8,0.0404,0.73414,0.863532,0.863486,0.863591,0.86351
9,0.0277,0.865873,0.861239,0.861181,0.861255,0.861208
10,0.0186,0.954416,0.864679,0.864794,0.864886,0.864676


[I 2025-03-23 05:43:52,851] Trial 111 finished with value: 0.8610848892031129 and parameters: {'learning_rate': 0.0012026420239199128, 'weight_decay': 0.008, 'warmup_steps': 27}. Best is trial 88 with value: 0.8749406267968206.


Trial 112 with params: {'learning_rate': 0.0007144069800932559, 'weight_decay': 0.008, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3962,0.452103,0.780963,0.800607,0.783268,0.778219
2,0.3073,0.405914,0.833716,0.836639,0.832839,0.833051
3,0.247,0.41555,0.834862,0.838584,0.833881,0.834077
4,0.1991,0.462383,0.83945,0.841242,0.838764,0.839002
5,0.158,0.447329,0.853211,0.853404,0.853456,0.85321
6,0.1247,0.545817,0.841743,0.848336,0.840469,0.840595
7,0.0989,0.503576,0.868119,0.868419,0.867843,0.867993
8,0.0759,0.511195,0.864679,0.864679,0.864802,0.864668
9,0.057,0.690481,0.854358,0.855256,0.854835,0.854342
10,0.0421,0.746667,0.864679,0.865208,0.865054,0.864676


[I 2025-03-23 05:46:53,988] Trial 112 finished with value: 0.8680566246021502 and parameters: {'learning_rate': 0.0007144069800932559, 'weight_decay': 0.008, 'warmup_steps': 29}. Best is trial 88 with value: 0.8749406267968206.


Trial 113 with params: {'learning_rate': 0.0007832681022702688, 'weight_decay': 0.008, 'warmup_steps': 32}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3951,0.445833,0.78211,0.799996,0.78431,0.779657
2,0.3037,0.405896,0.838303,0.841586,0.837385,0.837609
3,0.2408,0.427039,0.824541,0.833317,0.82303,0.822858
4,0.1912,0.48538,0.83945,0.842907,0.838511,0.838736
5,0.1499,0.456816,0.858945,0.859287,0.859255,0.858945
6,0.1175,0.547381,0.840596,0.84739,0.839301,0.839408
7,0.0906,0.540223,0.861239,0.861636,0.860918,0.861085
8,0.0681,0.560037,0.861239,0.8617,0.861592,0.861237
9,0.0503,0.677438,0.861239,0.861181,0.861255,0.861208
10,0.0355,0.784607,0.858945,0.859097,0.859171,0.858943


[I 2025-03-23 05:50:10,142] Trial 113 finished with value: 0.8680802305833385 and parameters: {'learning_rate': 0.0007832681022702688, 'weight_decay': 0.008, 'warmup_steps': 32}. Best is trial 88 with value: 0.8749406267968206.


Trial 114 with params: {'learning_rate': 0.00040312693636627886, 'weight_decay': 0.007, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.415,0.437494,0.788991,0.79719,0.790477,0.788053
2,0.3254,0.414016,0.822248,0.82563,0.821283,0.82143
3,0.281,0.418469,0.818807,0.82262,0.817778,0.817887
4,0.2428,0.477369,0.824541,0.832811,0.823072,0.822937
5,0.2072,0.450274,0.83945,0.84078,0.840027,0.839408
6,0.1772,0.499958,0.833716,0.837936,0.83267,0.832843
7,0.1542,0.465514,0.845183,0.845543,0.844858,0.845012
8,0.134,0.501136,0.855505,0.85545,0.85554,0.855477
9,0.1161,0.599196,0.850917,0.854089,0.851793,0.850764
10,0.1026,0.560294,0.845183,0.848164,0.846036,0.845035


[I 2025-03-23 05:52:19,056] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.0005634472763711701, 'weight_decay': 0.008, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4039,0.435117,0.788991,0.796731,0.790435,0.788117
2,0.3139,0.412592,0.826835,0.829677,0.825956,0.826143
3,0.2597,0.423122,0.824541,0.827955,0.823577,0.823734
4,0.2144,0.454122,0.829128,0.830956,0.828418,0.828631
5,0.1765,0.447412,0.847477,0.847458,0.847573,0.847461


[I 2025-03-23 05:53:29,350] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.0012793114247087718, 'weight_decay': 0.008, 'warmup_steps': 41}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3868,0.434648,0.791284,0.801008,0.792898,0.790154
2,0.2825,0.397787,0.834862,0.836398,0.834217,0.834441
3,0.2115,0.425437,0.833716,0.840799,0.832376,0.832409
4,0.1556,0.500128,0.832569,0.839394,0.831249,0.831287
5,0.1122,0.46262,0.853211,0.853176,0.853288,0.853192
6,0.0804,0.540397,0.853211,0.856852,0.852278,0.852559
7,0.0543,0.630722,0.856651,0.856605,0.856708,0.856629
8,0.0373,0.643495,0.861239,0.861902,0.860834,0.86104
9,0.0244,0.819965,0.862385,0.86258,0.862634,0.862385
10,0.017,0.984071,0.854358,0.855915,0.85374,0.854003


[I 2025-03-23 05:56:46,576] Trial 116 finished with value: 0.8657859737239183 and parameters: {'learning_rate': 0.0012793114247087718, 'weight_decay': 0.008, 'warmup_steps': 41}. Best is trial 88 with value: 0.8749406267968206.


Trial 117 with params: {'learning_rate': 0.0010650214368593087, 'weight_decay': 0.01, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3901,0.43715,0.793578,0.803366,0.795192,0.79246
2,0.29,0.414531,0.827982,0.832285,0.826913,0.82705
3,0.222,0.450181,0.824541,0.835504,0.822861,0.822523
4,0.1677,0.485581,0.845183,0.849229,0.844184,0.844422
5,0.1244,0.453328,0.865826,0.865849,0.86597,0.865817
6,0.0928,0.530109,0.84289,0.847259,0.841848,0.842066
7,0.0646,0.590988,0.865826,0.865943,0.865633,0.865732
8,0.0457,0.624311,0.860092,0.860071,0.860003,0.860032
9,0.0313,0.761988,0.860092,0.860141,0.860255,0.860085
10,0.0214,0.910964,0.858945,0.859185,0.859213,0.858945


[I 2025-03-23 06:00:23,150] Trial 117 finished with value: 0.8600181056443295 and parameters: {'learning_rate': 0.0010650214368593087, 'weight_decay': 0.01, 'warmup_steps': 39}. Best is trial 88 with value: 0.8749406267968206.


Trial 118 with params: {'learning_rate': 0.0015056202524908271, 'weight_decay': 0.007, 'warmup_steps': 40}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3826,0.425045,0.798165,0.807566,0.799739,0.79714
2,0.2769,0.396564,0.838303,0.840712,0.837512,0.837748
3,0.2013,0.408134,0.856651,0.859241,0.855866,0.856159
4,0.1452,0.462037,0.844037,0.847235,0.843142,0.843391
5,0.1016,0.498232,0.858945,0.858942,0.858834,0.858878
6,0.0717,0.607406,0.833716,0.843325,0.832165,0.832044
7,0.0483,0.634278,0.865826,0.865773,0.865802,0.865786
8,0.0314,0.741931,0.853211,0.853923,0.852783,0.852988
9,0.0214,0.737972,0.853211,0.854082,0.852741,0.85296
10,0.0135,0.859592,0.862385,0.86313,0.86196,0.862176


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-23 06:03:52,481] Trial 118 finished with value: 0.8622809973045822 and parameters: {'learning_rate': 0.0015056202524908271, 'weight_decay': 0.007, 'warmup_steps': 40}. Best is trial 88 with value: 0.8749406267968206.


Trial 119 with params: {'learning_rate': 0.0013325320541665191, 'weight_decay': 0.008, 'warmup_steps': 38}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3849,0.42599,0.795872,0.80287,0.797234,0.795143
2,0.2815,0.38812,0.844037,0.846931,0.843184,0.843436
3,0.2078,0.441567,0.841743,0.845231,0.840806,0.84104
4,0.1541,0.455097,0.84289,0.844167,0.842311,0.842542
5,0.1109,0.491041,0.861239,0.861237,0.861129,0.861173
6,0.0796,0.515237,0.857798,0.8615,0.856866,0.857166
7,0.0533,0.612505,0.865826,0.865807,0.865928,0.865811
8,0.036,0.6375,0.865826,0.865943,0.865633,0.865732
9,0.0246,0.760945,0.868119,0.868087,0.868054,0.868069
10,0.0163,0.953696,0.858945,0.858899,0.859003,0.858923


[I 2025-03-23 06:07:08,806] Trial 119 finished with value: 0.8633810067369458 and parameters: {'learning_rate': 0.0013325320541665191, 'weight_decay': 0.008, 'warmup_steps': 38}. Best is trial 88 with value: 0.8749406267968206.


Trial 120 with params: {'learning_rate': 0.000879768366039592, 'weight_decay': 0.009000000000000001, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3911,0.443573,0.784404,0.797203,0.786268,0.782753
2,0.2963,0.41254,0.832569,0.836258,0.831586,0.831772
3,0.2321,0.422266,0.831422,0.841504,0.829829,0.829647
4,0.1813,0.49457,0.83945,0.849534,0.83788,0.837798
5,0.1399,0.441249,0.860092,0.860112,0.85996,0.860018
6,0.1085,0.50509,0.852064,0.858668,0.850815,0.85102
7,0.0805,0.518252,0.853211,0.853211,0.85333,0.853199
8,0.0583,0.599878,0.850917,0.85111,0.851162,0.850917
9,0.0423,0.733991,0.858945,0.858926,0.859045,0.85893
10,0.0292,0.769514,0.861239,0.861479,0.861508,0.861238


[I 2025-03-23 06:10:54,013] Trial 120 finished with value: 0.856533453371398 and parameters: {'learning_rate': 0.000879768366039592, 'weight_decay': 0.009000000000000001, 'warmup_steps': 30}. Best is trial 88 with value: 0.8749406267968206.


Trial 121 with params: {'learning_rate': 0.0006585175431767217, 'weight_decay': 0.006, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3975,0.445216,0.790138,0.804076,0.792067,0.788401
2,0.3103,0.407086,0.832569,0.835627,0.83167,0.831875
3,0.2522,0.421338,0.824541,0.829712,0.823367,0.823436
4,0.2051,0.468548,0.838303,0.840451,0.837554,0.837791
5,0.1646,0.448993,0.847477,0.847714,0.847741,0.847477
6,0.1321,0.608867,0.834862,0.845353,0.833249,0.833084
7,0.1069,0.471019,0.868119,0.869142,0.867633,0.867881
8,0.0837,0.500911,0.873853,0.873884,0.873727,0.873787
9,0.0647,0.671765,0.84633,0.849169,0.847163,0.846194
10,0.0499,0.681889,0.861239,0.861319,0.861423,0.861234


[I 2025-03-23 06:13:40,288] Trial 121 finished with value: 0.8645584882612793 and parameters: {'learning_rate': 0.0006585175431767217, 'weight_decay': 0.006, 'warmup_steps': 26}. Best is trial 88 with value: 0.8749406267968206.


Trial 122 with params: {'learning_rate': 0.000834122789203197, 'weight_decay': 0.006, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3907,0.433586,0.794725,0.804288,0.796319,0.793648
2,0.2994,0.410856,0.834862,0.837948,0.833965,0.834178
3,0.2366,0.428923,0.827982,0.831937,0.826955,0.827108
4,0.186,0.472473,0.837156,0.840583,0.836217,0.836432
5,0.1457,0.434894,0.863532,0.863612,0.863718,0.863528
6,0.1138,0.525926,0.853211,0.857198,0.852235,0.852513
7,0.0864,0.535413,0.870413,0.870469,0.870264,0.870338
8,0.0651,0.585204,0.860092,0.861926,0.86076,0.860032
9,0.0462,0.739498,0.855505,0.85545,0.85554,0.855477
10,0.034,0.822249,0.854358,0.855094,0.854793,0.854348


[I 2025-03-23 06:15:28,755] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.0003350270585472069, 'weight_decay': 0.005, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4133,0.436276,0.791284,0.797774,0.792603,0.790596
2,0.331,0.420831,0.811927,0.817821,0.810642,0.810562
3,0.2914,0.411598,0.815367,0.815303,0.815368,0.815326
4,0.2561,0.493441,0.816514,0.827444,0.81481,0.814357
5,0.2231,0.450027,0.838303,0.839732,0.838901,0.838255
6,0.1939,0.495665,0.836009,0.841012,0.83488,0.835035
7,0.1721,0.477964,0.83945,0.84043,0.838932,0.839144
8,0.1539,0.507561,0.849771,0.850261,0.849404,0.849581
9,0.1369,0.563202,0.849771,0.850008,0.850036,0.84977
10,0.1243,0.541957,0.832569,0.839345,0.83386,0.83206


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 06:17:56,024] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.0007197446276233207, 'weight_decay': 0.008, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.397,0.456448,0.779817,0.799765,0.782142,0.777
2,0.3067,0.399428,0.837156,0.839169,0.836428,0.836661
3,0.2468,0.417916,0.837156,0.842783,0.835964,0.8361
4,0.1982,0.454233,0.840596,0.842765,0.839848,0.840091
5,0.1575,0.440499,0.855505,0.855902,0.855835,0.855504
6,0.1247,0.546386,0.840596,0.845299,0.839511,0.839706
7,0.0986,0.485303,0.866972,0.867581,0.866591,0.866793
8,0.0751,0.551162,0.862385,0.862366,0.862297,0.862327
9,0.0566,0.616938,0.860092,0.861086,0.860592,0.860073
10,0.0418,0.734269,0.865826,0.865906,0.866012,0.865821


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-23 06:21:19,700] Trial 124 finished with value: 0.8588095911960034 and parameters: {'learning_rate': 0.0007197446276233207, 'weight_decay': 0.008, 'warmup_steps': 33}. Best is trial 88 with value: 0.8749406267968206.


Trial 125 with params: {'learning_rate': 0.00027083528453045907, 'weight_decay': 0.005, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4231,0.434707,0.792431,0.800021,0.793856,0.791602
2,0.3391,0.420089,0.807339,0.80982,0.806475,0.806598
3,0.3048,0.419395,0.808486,0.810032,0.809116,0.808413
4,0.2768,0.471777,0.811927,0.823243,0.810179,0.809619
5,0.2456,0.435832,0.829128,0.829988,0.829597,0.82911
6,0.2196,0.457534,0.825688,0.828661,0.824787,0.824966
7,0.1982,0.434976,0.830275,0.83178,0.829629,0.829842
8,0.1804,0.482517,0.84633,0.846876,0.845942,0.846123
9,0.1637,0.518455,0.83945,0.839388,0.839438,0.839408
10,0.1515,0.498538,0.822248,0.827414,0.823388,0.821854


[I 2025-03-23 06:23:05,959] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.0005898956939978046, 'weight_decay': 0.008, 'warmup_steps': 28}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4005,0.439962,0.787844,0.798783,0.789562,0.786507
2,0.3135,0.407736,0.829128,0.833275,0.828082,0.828232
3,0.2589,0.415738,0.823394,0.827978,0.822283,0.822377
4,0.2137,0.429966,0.840596,0.840844,0.840311,0.840443
5,0.1746,0.444874,0.853211,0.853731,0.853583,0.853208
6,0.142,0.562008,0.837156,0.841613,0.836091,0.836274
7,0.1171,0.463617,0.858945,0.85889,0.858918,0.858903
8,0.0957,0.501831,0.865826,0.866425,0.866223,0.865821
9,0.0753,0.671067,0.837156,0.84193,0.838238,0.836846
10,0.0615,0.680141,0.856651,0.856993,0.856961,0.856651


[I 2025-03-23 06:24:54,154] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.000713613815453263, 'weight_decay': 0.005, 'warmup_steps': 27}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3955,0.447642,0.78555,0.80327,0.78773,0.783189
2,0.3069,0.403196,0.83945,0.842007,0.838638,0.838877
3,0.2472,0.423984,0.823394,0.830903,0.821988,0.821895
4,0.1981,0.484269,0.832569,0.836258,0.831586,0.831772
5,0.1576,0.450557,0.855505,0.855505,0.855624,0.855492
6,0.1251,0.583637,0.834862,0.843707,0.833375,0.833316
7,0.099,0.504256,0.863532,0.863582,0.863381,0.863453
8,0.0767,0.537824,0.870413,0.87076,0.870727,0.870413
9,0.057,0.694275,0.849771,0.851938,0.850499,0.849683
10,0.0418,0.725804,0.862385,0.8625,0.862592,0.862382


[I 2025-03-23 06:27:43,598] Trial 127 finished with value: 0.8692103255006183 and parameters: {'learning_rate': 0.000713613815453263, 'weight_decay': 0.005, 'warmup_steps': 27}. Best is trial 88 with value: 0.8749406267968206.


Trial 128 with params: {'learning_rate': 0.0006402984068612654, 'weight_decay': 0.004, 'warmup_steps': 30}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3989,0.449886,0.787844,0.802956,0.789856,0.785904
2,0.3115,0.401667,0.824541,0.827356,0.823661,0.82384
3,0.2547,0.42464,0.830275,0.8366,0.828997,0.829044
4,0.2077,0.44782,0.847477,0.848244,0.847026,0.847231
5,0.1677,0.444772,0.857798,0.858087,0.858087,0.857798
6,0.1349,0.557369,0.838303,0.843349,0.837175,0.837342
7,0.1098,0.471115,0.862385,0.862328,0.862381,0.86235
8,0.0869,0.527723,0.869266,0.869266,0.86939,0.869255
9,0.0669,0.696865,0.841743,0.846569,0.842826,0.841442
10,0.0528,0.701042,0.868119,0.868087,0.868054,0.868069


[I 2025-03-23 06:31:41,138] Trial 128 finished with value: 0.8611243828635133 and parameters: {'learning_rate': 0.0006402984068612654, 'weight_decay': 0.004, 'warmup_steps': 30}. Best is trial 88 with value: 0.8749406267968206.


Trial 129 with params: {'learning_rate': 0.0005026506802185396, 'weight_decay': 0.006, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4031,0.438748,0.791284,0.800503,0.792856,0.790224
2,0.3186,0.413436,0.823394,0.827279,0.822367,0.822497
3,0.2682,0.419705,0.81422,0.819763,0.812979,0.812945
4,0.2264,0.434793,0.837156,0.838126,0.836638,0.836846
5,0.1884,0.44244,0.847477,0.847458,0.847573,0.847461


[I 2025-03-23 06:32:42,123] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.0011474084669716098, 'weight_decay': 0.005, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3861,0.432928,0.795872,0.805724,0.797487,0.794766
2,0.2876,0.407024,0.834862,0.838259,0.833923,0.834129
3,0.2182,0.411525,0.840596,0.846087,0.839427,0.839592
4,0.163,0.491362,0.838303,0.842965,0.837217,0.837399
5,0.1215,0.45554,0.865826,0.865769,0.865844,0.865796
6,0.0894,0.56774,0.838303,0.848068,0.836754,0.836677
7,0.0638,0.619392,0.845183,0.845661,0.844816,0.844988
8,0.0436,0.58521,0.857798,0.857817,0.857666,0.857723
9,0.0296,0.844046,0.858945,0.858991,0.858792,0.858863
10,0.0202,0.892187,0.854358,0.854311,0.854414,0.854335


[I 2025-03-23 06:34:40,830] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.0005127558480351364, 'weight_decay': 0.01, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4025,0.435431,0.795872,0.803309,0.797276,0.795086
2,0.3179,0.415422,0.827982,0.831937,0.826955,0.827108
3,0.2668,0.422605,0.81422,0.819763,0.812979,0.812945
4,0.2248,0.435698,0.837156,0.838305,0.836596,0.836813
5,0.1867,0.440331,0.849771,0.849794,0.849909,0.849761
6,0.1548,0.536097,0.844037,0.848992,0.842932,0.843138
7,0.1306,0.463478,0.850917,0.852345,0.85032,0.850571
8,0.1104,0.49162,0.864679,0.865208,0.865054,0.864676
9,0.0905,0.67992,0.840596,0.845601,0.8417,0.840277
10,0.0758,0.596302,0.857798,0.859624,0.858466,0.857738


[I 2025-03-23 06:37:26,517] Trial 131 finished with value: 0.8484891563600752 and parameters: {'learning_rate': 0.0005127558480351364, 'weight_decay': 0.01, 'warmup_steps': 26}. Best is trial 88 with value: 0.8749406267968206.


Trial 132 with params: {'learning_rate': 0.0007937539287313915, 'weight_decay': 0.006, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3955,0.453731,0.780963,0.801365,0.78331,0.778103
2,0.3035,0.40033,0.83945,0.841482,0.838722,0.838962
3,0.2412,0.412053,0.834862,0.841744,0.833544,0.833598
4,0.1908,0.436056,0.849771,0.851088,0.849194,0.849438
5,0.1496,0.438639,0.863532,0.864444,0.864012,0.863518
6,0.1168,0.559376,0.833716,0.840799,0.832376,0.832409
7,0.0899,0.513347,0.861239,0.861286,0.861087,0.861158
8,0.0668,0.547979,0.865826,0.865827,0.865717,0.865762
9,0.0487,0.653444,0.862385,0.862787,0.862718,0.862385
10,0.035,0.723437,0.863532,0.863685,0.86376,0.86353


[I 2025-03-23 06:40:10,479] Trial 132 finished with value: 0.8646532673892455 and parameters: {'learning_rate': 0.0007937539287313915, 'weight_decay': 0.006, 'warmup_steps': 35}. Best is trial 88 with value: 0.8749406267968206.


Trial 133 with params: {'learning_rate': 0.000339914030194018, 'weight_decay': 0.007, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.417,0.435306,0.788991,0.797662,0.790519,0.787987
2,0.331,0.421104,0.813073,0.818396,0.811853,0.811826
3,0.2908,0.409144,0.81078,0.810802,0.810906,0.810768
4,0.256,0.480202,0.816514,0.825799,0.814936,0.814626
5,0.2232,0.461857,0.836009,0.837233,0.836564,0.835973
6,0.1941,0.496834,0.827982,0.830983,0.827082,0.827269
7,0.1725,0.471226,0.834862,0.836398,0.834217,0.834441
8,0.1537,0.522392,0.852064,0.852165,0.851867,0.851961
9,0.1365,0.586981,0.850917,0.852079,0.851457,0.850889
10,0.1241,0.544167,0.837156,0.844009,0.838448,0.836661


[I 2025-03-23 06:42:15,126] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 0.0006624609753352665, 'weight_decay': 0.006, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3996,0.44431,0.786697,0.800803,0.788646,0.784887
2,0.3089,0.399213,0.830275,0.833611,0.829334,0.829521
3,0.2501,0.426672,0.833716,0.839478,0.832502,0.832606
4,0.2027,0.44053,0.844037,0.84487,0.843563,0.84377
5,0.1629,0.449309,0.852064,0.852959,0.852541,0.852048


[I 2025-03-23 06:43:22,839] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.0005058147425774857, 'weight_decay': 0.004, 'warmup_steps': 42}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4081,0.438056,0.788991,0.798149,0.790562,0.787919
2,0.3163,0.412418,0.830275,0.833931,0.829292,0.829468
3,0.2648,0.430858,0.824541,0.830108,0.823324,0.82337
4,0.2224,0.449712,0.827982,0.830421,0.827166,0.827368
5,0.1855,0.440752,0.854358,0.854299,0.854372,0.854325
6,0.1537,0.512659,0.84633,0.851729,0.845184,0.84539
7,0.1288,0.460471,0.841743,0.843114,0.841143,0.841375
8,0.1088,0.532403,0.856651,0.859405,0.857466,0.856533
9,0.0897,0.648209,0.838303,0.841238,0.839153,0.838148
10,0.0753,0.594766,0.854358,0.856814,0.85513,0.854256


[I 2025-03-23 06:46:24,564] Trial 135 finished with value: 0.8473081840001158 and parameters: {'learning_rate': 0.0005058147425774857, 'weight_decay': 0.004, 'warmup_steps': 42}. Best is trial 88 with value: 0.8749406267968206.


Trial 136 with params: {'learning_rate': 0.0008691037697576869, 'weight_decay': 0.006, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.393,0.440177,0.77867,0.792132,0.780595,0.776838
2,0.2988,0.413309,0.836009,0.839263,0.835091,0.835305
3,0.2343,0.436379,0.827982,0.836107,0.826534,0.826447
4,0.1819,0.475,0.838303,0.841908,0.837343,0.837559
5,0.141,0.456594,0.862385,0.862676,0.862676,0.862385
6,0.1088,0.586193,0.836009,0.848665,0.834249,0.833955
7,0.0803,0.593769,0.860092,0.861801,0.859455,0.859735
8,0.0593,0.592616,0.858945,0.858942,0.858834,0.858878
9,0.0419,0.69298,0.861239,0.861219,0.861339,0.861224
10,0.0293,0.82719,0.860092,0.860617,0.860466,0.860089


[I 2025-03-23 06:49:21,945] Trial 136 finished with value: 0.8680107771016862 and parameters: {'learning_rate': 0.0008691037697576869, 'weight_decay': 0.006, 'warmup_steps': 33}. Best is trial 88 with value: 0.8749406267968206.


Trial 137 with params: {'learning_rate': 0.0010423293600981712, 'weight_decay': 0.007, 'warmup_steps': 37}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.391,0.435983,0.786697,0.799585,0.788562,0.785065
2,0.2916,0.418227,0.837156,0.84027,0.836259,0.836482
3,0.224,0.421601,0.832569,0.841869,0.831039,0.830924
4,0.1698,0.489778,0.840596,0.844573,0.839595,0.839812
5,0.1271,0.469165,0.870413,0.870567,0.870643,0.870411
6,0.0952,0.507514,0.84633,0.847945,0.845689,0.845938
7,0.0669,0.570425,0.858945,0.858991,0.858792,0.858863
8,0.0469,0.65945,0.854358,0.854739,0.854035,0.854197
9,0.0324,0.862848,0.855505,0.85783,0.856256,0.855413
10,0.0216,0.877238,0.864679,0.86497,0.86497,0.864679


[I 2025-03-23 06:52:34,044] Trial 137 finished with value: 0.8623389744885073 and parameters: {'learning_rate': 0.0010423293600981712, 'weight_decay': 0.007, 'warmup_steps': 37}. Best is trial 88 with value: 0.8749406267968206.


Trial 138 with params: {'learning_rate': 0.0009763930085232003, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3907,0.435893,0.793578,0.802858,0.79515,0.792529
2,0.293,0.408292,0.836009,0.83867,0.835175,0.835401
3,0.2267,0.420919,0.834862,0.84221,0.833502,0.83353
4,0.1734,0.488337,0.84289,0.848848,0.841679,0.841841
5,0.1319,0.435719,0.860092,0.860034,0.860087,0.860056
6,0.1002,0.491414,0.858945,0.861277,0.858203,0.858498
7,0.0715,0.526924,0.862385,0.862332,0.862423,0.862359
8,0.0509,0.567694,0.858945,0.859601,0.85854,0.858743
9,0.0364,0.714973,0.861239,0.861184,0.861213,0.861197
10,0.0247,0.794196,0.857798,0.857764,0.857877,0.857779


[I 2025-03-23 06:55:33,271] Trial 138 finished with value: 0.8600026319252534 and parameters: {'learning_rate': 0.0009763930085232003, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33}. Best is trial 88 with value: 0.8749406267968206.


Trial 139 with params: {'learning_rate': 0.0007412320648361839, 'weight_decay': 0.009000000000000001, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3988,0.443714,0.791284,0.802611,0.793024,0.789931
2,0.3044,0.407035,0.832569,0.836949,0.831502,0.831662
3,0.2427,0.419098,0.826835,0.831667,0.825703,0.825806
4,0.1935,0.467854,0.837156,0.839422,0.836385,0.836619
5,0.1534,0.443551,0.858945,0.858968,0.859087,0.858936
6,0.1208,0.560259,0.838303,0.848631,0.836712,0.836601
7,0.0951,0.53307,0.850917,0.851236,0.850615,0.850764
8,0.0722,0.541831,0.855505,0.855456,0.855456,0.855456
9,0.0536,0.663003,0.854358,0.856292,0.855045,0.854289
10,0.0387,0.731171,0.862385,0.862435,0.862549,0.862379


[I 2025-03-23 06:58:14,648] Trial 139 finished with value: 0.8611857728310277 and parameters: {'learning_rate': 0.0007412320648361839, 'weight_decay': 0.009000000000000001, 'warmup_steps': 43}. Best is trial 88 with value: 0.8749406267968206.


Trial 140 with params: {'learning_rate': 0.0006491252808343185, 'weight_decay': 0.005, 'warmup_steps': 36}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3999,0.443742,0.784404,0.796621,0.786225,0.78284
2,0.3095,0.405725,0.832569,0.835627,0.83167,0.831875
3,0.2515,0.424593,0.827982,0.832648,0.826871,0.826991
4,0.204,0.453196,0.841743,0.843114,0.841143,0.841375
5,0.1645,0.444496,0.856651,0.856993,0.856961,0.856651
6,0.1314,0.551806,0.832569,0.841869,0.831039,0.830924
7,0.1067,0.484216,0.862385,0.862625,0.862128,0.862263
8,0.0844,0.505262,0.868119,0.868273,0.868348,0.868118
9,0.0649,0.640629,0.853211,0.856398,0.854088,0.85306
10,0.0497,0.703289,0.860092,0.860492,0.860424,0.860091


[I 2025-03-23 07:01:28,466] Trial 140 finished with value: 0.853059505002633 and parameters: {'learning_rate': 0.0006491252808343185, 'weight_decay': 0.005, 'warmup_steps': 36}. Best is trial 88 with value: 0.8749406267968206.


Trial 141 with params: {'learning_rate': 0.00045609680018516365, 'weight_decay': 0.006, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4071,0.441965,0.77867,0.790397,0.780469,0.777108
2,0.3199,0.414349,0.826835,0.830281,0.825871,0.826038
3,0.2716,0.417587,0.823394,0.82634,0.822493,0.822663
4,0.2312,0.4727,0.819954,0.827602,0.818525,0.818387
5,0.1942,0.451631,0.845183,0.846067,0.845658,0.845167


[I 2025-03-23 07:02:22,810] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 0.0013585743020518742, 'weight_decay': 0.004, 'warmup_steps': 23}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3814,0.432158,0.787844,0.795786,0.789309,0.786934
2,0.2796,0.391305,0.837156,0.837352,0.836891,0.837011
3,0.2072,0.427908,0.837156,0.840583,0.836217,0.836432
4,0.152,0.485807,0.837156,0.841987,0.836049,0.836218
5,0.108,0.513902,0.862385,0.862537,0.862171,0.862281
6,0.0761,0.590136,0.852064,0.854898,0.851236,0.851516
7,0.0509,0.586063,0.857798,0.858379,0.857413,0.857606
8,0.0354,0.674697,0.862385,0.86313,0.86196,0.862176
9,0.022,0.828862,0.861239,0.861219,0.861339,0.861224
10,0.0162,0.892553,0.856651,0.858008,0.856077,0.856334


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 07:05:43,113] Trial 142 finished with value: 0.8473708799705206 and parameters: {'learning_rate': 0.0013585743020518742, 'weight_decay': 0.004, 'warmup_steps': 23}. Best is trial 88 with value: 0.8749406267968206.


Trial 143 with params: {'learning_rate': 0.0008309238005094156, 'weight_decay': 0.005, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3911,0.442493,0.791284,0.806227,0.793277,0.789422
2,0.3001,0.414252,0.834862,0.837653,0.834007,0.834227
3,0.2366,0.43534,0.830275,0.836172,0.829039,0.82911
4,0.1861,0.449506,0.836009,0.837061,0.83547,0.83568
5,0.145,0.460528,0.857798,0.857992,0.858045,0.857797
6,0.1124,0.544572,0.848624,0.856336,0.847268,0.847403
7,0.0858,0.541775,0.850917,0.850867,0.850867,0.850867
8,0.0629,0.559245,0.857798,0.85774,0.857792,0.857762
9,0.0455,0.703797,0.857798,0.858618,0.858255,0.857786
10,0.0333,0.780848,0.857798,0.857912,0.858003,0.857795


[I 2025-03-23 07:08:47,690] Trial 143 finished with value: 0.8577615011023589 and parameters: {'learning_rate': 0.0008309238005094156, 'weight_decay': 0.005, 'warmup_steps': 26}. Best is trial 88 with value: 0.8749406267968206.


Trial 144 with params: {'learning_rate': 0.002033581560860518, 'weight_decay': 0.008, 'warmup_steps': 43}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3767,0.413752,0.797018,0.804258,0.798402,0.796266
2,0.2607,0.400876,0.830275,0.833931,0.829292,0.829468
3,0.183,0.454529,0.838303,0.844163,0.837091,0.837224
4,0.1265,0.527757,0.83945,0.845548,0.838217,0.838347
5,0.0844,0.548227,0.856651,0.857161,0.856287,0.85647
6,0.055,0.688267,0.840596,0.844928,0.839553,0.83976
7,0.0375,0.729202,0.852064,0.852849,0.851614,0.851826
8,0.0238,0.863071,0.850917,0.851952,0.850404,0.850634
9,0.0152,1.009636,0.84633,0.847529,0.845773,0.846006
10,0.0115,1.009531,0.853211,0.853162,0.853162,0.853162


[I 2025-03-23 07:10:44,477] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 0.00019124300226037216, 'weight_decay': 0.01, 'warmup_steps': 26}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4379,0.445796,0.78555,0.796962,0.78731,0.784119
2,0.3509,0.432744,0.809633,0.811884,0.808811,0.808954
3,0.3239,0.427939,0.806193,0.806156,0.806064,0.806101
4,0.3026,0.462199,0.806193,0.810297,0.805096,0.80511
5,0.2798,0.430738,0.815367,0.815799,0.815705,0.815365


[I 2025-03-23 07:11:37,148] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.0007731164974953664, 'weight_decay': 0.007, 'warmup_steps': 33}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3964,0.443094,0.791284,0.806227,0.793277,0.789422
2,0.3039,0.403501,0.83945,0.842592,0.838554,0.838785
3,0.241,0.426689,0.830275,0.835364,0.829124,0.829237
4,0.1919,0.476424,0.841743,0.844324,0.840932,0.841178
5,0.1503,0.455016,0.856651,0.856993,0.856961,0.856651
6,0.1182,0.545329,0.84289,0.848848,0.841679,0.841841
7,0.0923,0.512638,0.860092,0.860327,0.859834,0.859967
8,0.068,0.551398,0.864679,0.864679,0.864802,0.864668
9,0.0509,0.704339,0.863532,0.863555,0.863676,0.863523
10,0.0362,0.788354,0.865826,0.866067,0.866096,0.865826


[I 2025-03-23 07:14:32,262] Trial 146 finished with value: 0.8668716307277629 and parameters: {'learning_rate': 0.0007731164974953664, 'weight_decay': 0.007, 'warmup_steps': 33}. Best is trial 88 with value: 0.8749406267968206.


Trial 147 with params: {'learning_rate': 0.0007828814939417646, 'weight_decay': 0.006, 'warmup_steps': 35}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3963,0.453127,0.786697,0.802747,0.788772,0.784602
2,0.3049,0.401534,0.831422,0.835261,0.830418,0.830593
3,0.2431,0.415279,0.831422,0.838915,0.830039,0.830027
4,0.1924,0.437079,0.84633,0.847343,0.845815,0.846038
5,0.1505,0.43203,0.864679,0.865081,0.865012,0.864678
6,0.1173,0.557527,0.837156,0.845051,0.835754,0.835774
7,0.0906,0.487199,0.870413,0.871639,0.869885,0.870153
8,0.0677,0.518975,0.863532,0.863475,0.863549,0.863502
9,0.0501,0.627411,0.862385,0.862787,0.862718,0.862385
10,0.0357,0.726006,0.873853,0.874389,0.874232,0.873851


[I 2025-03-23 07:18:54,454] Trial 147 finished with value: 0.8715163761892734 and parameters: {'learning_rate': 0.0007828814939417646, 'weight_decay': 0.006, 'warmup_steps': 35}. Best is trial 88 with value: 0.8749406267968206.


Trial 148 with params: {'learning_rate': 0.0006911339266546163, 'weight_decay': 0.008, 'warmup_steps': 31}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3977,0.447983,0.783257,0.79816,0.785268,0.781275
2,0.3084,0.401327,0.827982,0.831937,0.826955,0.827108
3,0.249,0.422298,0.830275,0.834983,0.829166,0.829297
4,0.2009,0.45417,0.84289,0.84364,0.842437,0.842636
5,0.1604,0.449861,0.847477,0.848364,0.847952,0.847461


[I 2025-03-23 07:19:47,791] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.0005279934990252418, 'weight_decay': 0.009000000000000001, 'warmup_steps': 34}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4044,0.439796,0.784404,0.793934,0.786015,0.783236
2,0.3151,0.412896,0.827982,0.830983,0.827082,0.827269
3,0.2625,0.421345,0.824541,0.827955,0.823577,0.823734
4,0.2196,0.444765,0.833716,0.835139,0.833091,0.83331
5,0.1815,0.436488,0.848624,0.848569,0.848657,0.848595
6,0.1494,0.512193,0.847477,0.852694,0.846352,0.846571
7,0.1248,0.479655,0.845183,0.84766,0.844395,0.844652
8,0.1041,0.506412,0.864679,0.86535,0.865096,0.864672
9,0.0846,0.642493,0.837156,0.840867,0.838111,0.836936
10,0.0705,0.669354,0.83945,0.842854,0.840364,0.839259


[I 2025-03-23 07:21:38,067] Trial 149 pruned. 


In [25]:
print(best_trial)

BestRun(run_id='88', objective=0.8749406267968206, hyperparameters={'learning_rate': 0.0007832919195971522, 'weight_decay': 0.008, 'warmup_steps': 31}, run_summary=None)


In [None]:
base.reset_seed()

In [27]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-distill_hp-search", remove_unused_columns=False, epochs=num_epochs, batch_size=batch_size)

In [28]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 5e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
        "lambda_param": trial.suggest_float("lambda_param",0,1,step=.1),
        "temperature": trial.suggest_float("temperature", 2,7, step=.5)
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [29]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [30]:
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
)
  

In [31]:
best_trial2 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Distill",
    n_trials=150
)

[I 2025-03-23 07:21:38,433] A new study created in memory with name: Distill


Trial 0 with params: {'learning_rate': 0.0002805758207667253, 'weight_decay': 0.01, 'warmup_steps': 32, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.919,1.643253,0.784404,0.792961,0.785931,0.783378
2,1.4265,1.467759,0.816514,0.823376,0.815147,0.815034
3,1.2391,1.38069,0.818807,0.824863,0.817525,0.817493
4,1.0919,1.419749,0.81422,0.821022,0.812853,0.812722
5,0.9535,1.315268,0.836009,0.836032,0.836143,0.835999


[I 2025-03-23 07:22:45,368] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.00010255552094216992, 'weight_decay': 0.0, 'warmup_steps': 38, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2819,1.646412,0.776376,0.776399,0.77649,0.776362
2,1.6178,1.584318,0.791284,0.792521,0.790625,0.790752
3,1.5071,1.520361,0.798165,0.799103,0.797592,0.79774
4,1.448,1.481652,0.795872,0.796797,0.795298,0.795441
5,1.3827,1.446343,0.808486,0.808536,0.808274,0.808353
6,1.324,1.511959,0.798165,0.803578,0.79936,0.79765
7,1.2752,1.534654,0.797018,0.803823,0.79836,0.796322
8,1.226,1.463692,0.806193,0.809352,0.805222,0.805301
9,1.1812,1.426664,0.816514,0.816675,0.816241,0.81635
10,1.147,1.434113,0.815367,0.816732,0.815957,0.815312


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 07:25:03,274] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 5.497167787383099e-05, 'weight_decay': 0.01, 'warmup_steps': 36, 'lambda_param': 0.2, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5529,1.701492,0.777523,0.777462,0.777532,0.777481
2,1.7315,1.646766,0.77867,0.779736,0.778027,0.77813
3,1.6082,1.600726,0.786697,0.788086,0.785994,0.786102
4,1.553,1.557799,0.790138,0.791278,0.789499,0.789626
5,1.5077,1.532049,0.784404,0.784343,0.784415,0.784363


[I 2025-03-23 07:26:19,722] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.00011635338541918901, 'weight_decay': 0.003, 'warmup_steps': 23, 'lambda_param': 0.4, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2038,1.637608,0.784404,0.784872,0.784752,0.784399
2,1.5853,1.578391,0.790138,0.791111,0.789541,0.789673
3,1.4872,1.52827,0.806193,0.809065,0.805264,0.805361
4,1.4222,1.463981,0.795872,0.796962,0.795256,0.795397
5,1.3506,1.43564,0.811927,0.811863,0.811863,0.811863
6,1.2861,1.51075,0.800459,0.806293,0.801697,0.799902
7,1.2252,1.526615,0.801606,0.806878,0.802781,0.801122
8,1.1729,1.438023,0.806193,0.80627,0.806359,0.806186
9,1.1257,1.415536,0.819954,0.820017,0.819746,0.819829
10,1.0832,1.425574,0.813073,0.81407,0.813579,0.813044


[I 2025-03-23 07:29:19,400] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.0008369042894376068, 'weight_decay': 0.001, 'warmup_steps': 12, 'lambda_param': 0.4, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6885,1.529029,0.794725,0.80107,0.796024,0.794075
2,1.1647,1.286994,0.836009,0.836732,0.835554,0.835745
3,0.8627,1.533093,0.816514,0.832569,0.814473,0.813547
4,0.6469,1.2546,0.847477,0.849213,0.846815,0.84707
5,0.491,1.355477,0.854358,0.854302,0.85433,0.854315
6,0.3785,1.337668,0.861239,0.863591,0.860497,0.860799
7,0.2988,1.402973,0.856651,0.858454,0.855993,0.856269
8,0.2382,1.325057,0.858945,0.858899,0.859003,0.858923
9,0.1942,1.3516,0.862385,0.862728,0.862086,0.862243
10,0.1596,1.347137,0.857798,0.857992,0.858045,0.857797


[I 2025-03-23 07:32:15,237] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.0018591820902866042, 'weight_decay': 0.002, 'warmup_steps': 22, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5837,1.379857,0.803899,0.807178,0.804822,0.803651
2,0.9479,1.230319,0.860092,0.860112,0.85996,0.860018
3,0.5875,1.383567,0.841743,0.845564,0.840764,0.84099
4,0.3819,1.385805,0.845183,0.850766,0.844016,0.844208
5,0.2553,1.329507,0.848624,0.84884,0.848362,0.848489
6,0.1792,1.264681,0.856651,0.857623,0.856161,0.856393
7,0.131,1.323016,0.868119,0.868122,0.868012,0.868057
8,0.0976,1.315119,0.856651,0.8573,0.856245,0.856446
9,0.0798,1.305477,0.858945,0.858899,0.859003,0.858923
10,0.0654,1.280634,0.860092,0.860112,0.85996,0.860018


[I 2025-03-23 07:36:39,158] Trial 5 finished with value: 0.8575553822481715 and parameters: {'learning_rate': 0.0018591820902866042, 'weight_decay': 0.002, 'warmup_steps': 22, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}. Best is trial 5 with value: 0.8575553822481715.


Trial 6 with params: {'learning_rate': 0.0008204643365323959, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6848,1.623031,0.783257,0.79352,0.784931,0.78197
2,1.1871,1.263903,0.834862,0.835253,0.834512,0.834667
3,0.8739,1.342885,0.834862,0.83662,0.834175,0.834402
4,0.6391,1.280446,0.831422,0.833508,0.830671,0.830888
5,0.4903,1.297224,0.849771,0.850397,0.849362,0.849555
6,0.3808,1.269278,0.858945,0.860315,0.858371,0.858632
7,0.3012,1.402062,0.855505,0.856561,0.854993,0.85523
8,0.2414,1.348436,0.862385,0.862464,0.862213,0.862298
9,0.1951,1.287627,0.866972,0.86722,0.866717,0.866854
10,0.1593,1.304576,0.870413,0.870367,0.870474,0.870392


[I 2025-03-23 07:39:29,295] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 0.0020690200562805084, 'weight_decay': 0.003, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5182,1.291803,0.832569,0.833516,0.83306,0.832547
2,0.9022,1.20282,0.840596,0.841671,0.840059,0.840277
3,0.5547,1.265504,0.84289,0.842831,0.842858,0.842843
4,0.3525,1.257974,0.830275,0.833305,0.829376,0.829572
5,0.2314,1.291228,0.853211,0.85591,0.852404,0.852687
6,0.1573,1.203658,0.860092,0.860112,0.85996,0.860018
7,0.1179,1.273678,0.855505,0.855948,0.855161,0.855333
8,0.0891,1.21924,0.856651,0.856596,0.856624,0.856609
9,0.0734,1.245752,0.852064,0.852104,0.851909,0.851978
10,0.0618,1.218779,0.861239,0.861237,0.861129,0.861173


[I 2025-03-23 07:43:45,719] Trial 7 finished with value: 0.8554429913427888 and parameters: {'learning_rate': 0.0020690200562805084, 'weight_decay': 0.003, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}. Best is trial 5 with value: 0.8575553822481715.


Trial 8 with params: {'learning_rate': 8.770946743725407e-05, 'weight_decay': 0.005, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3064,1.668683,0.776376,0.776784,0.776701,0.776373
2,1.6304,1.606393,0.788991,0.790039,0.788372,0.7885
3,1.5405,1.534677,0.795872,0.796378,0.795424,0.795561
4,1.4835,1.510501,0.798165,0.799847,0.797424,0.797552
5,1.4273,1.4744,0.800459,0.800671,0.800139,0.800253
6,1.3771,1.495544,0.800459,0.804165,0.801444,0.800155
7,1.3337,1.476659,0.806193,0.80949,0.807117,0.805947
8,1.2951,1.438933,0.801606,0.801779,0.801307,0.801415
9,1.2545,1.453859,0.808486,0.809565,0.807896,0.808062
10,1.2192,1.47015,0.815367,0.818439,0.816252,0.815163


[I 2025-03-23 07:46:45,463] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.0010568529720322872, 'weight_decay': 0.003, 'warmup_steps': 22, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6679,1.622515,0.786697,0.798999,0.78852,0.78515
2,1.0993,1.262098,0.845183,0.845543,0.844858,0.845012
3,0.7576,1.52771,0.834862,0.843707,0.833375,0.833316
4,0.5464,1.229599,0.84289,0.844167,0.842311,0.842542
5,0.3969,1.298719,0.856651,0.856731,0.856835,0.856647
6,0.2951,1.427087,0.84633,0.849557,0.845437,0.845694
7,0.2278,1.301446,0.854358,0.854739,0.854035,0.854197
8,0.1744,1.359705,0.860092,0.86117,0.859582,0.859826
9,0.1387,1.295142,0.861239,0.861286,0.861087,0.861158
10,0.1112,1.365778,0.868119,0.8681,0.868222,0.868105


[I 2025-03-23 07:49:36,385] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.003553256925699131, 'weight_decay': 0.003, 'warmup_steps': 26, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4976,1.287629,0.832569,0.832534,0.832639,0.832547
2,0.7937,1.547601,0.81078,0.826854,0.808716,0.807663
3,0.4626,1.158976,0.862385,0.865486,0.861539,0.861856
4,0.2715,1.361986,0.836009,0.842692,0.834712,0.834787
5,0.1745,1.337793,0.858945,0.859461,0.858582,0.858766
6,0.1231,1.2158,0.863532,0.865149,0.862918,0.863199
7,0.0925,1.239385,0.861239,0.861203,0.861171,0.861186
8,0.0726,1.174021,0.864679,0.865779,0.86417,0.864421
9,0.0611,1.122195,0.868119,0.868087,0.868054,0.868069
10,0.0522,1.129703,0.869266,0.869222,0.869222,0.869222


[I 2025-03-23 07:53:56,888] Trial 10 finished with value: 0.8691972462578159 and parameters: {'learning_rate': 0.003553256925699131, 'weight_decay': 0.003, 'warmup_steps': 26, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 10 with value: 0.8691972462578159.


Trial 11 with params: {'learning_rate': 0.0036979694616670403, 'weight_decay': 0.006, 'warmup_steps': 37, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.503,1.280238,0.827982,0.830708,0.828808,0.827829
2,0.7755,1.438747,0.831422,0.83673,0.83025,0.83036
3,0.4412,1.321585,0.847477,0.849452,0.846773,0.847033
4,0.2579,1.383943,0.845183,0.846104,0.844689,0.844904
5,0.1743,1.421742,0.850917,0.850867,0.850867,0.850867
6,0.1214,1.469786,0.849771,0.851524,0.84911,0.849369
7,0.0913,1.436064,0.852064,0.852057,0.851951,0.851994
8,0.0698,1.438167,0.855505,0.856385,0.855035,0.855258
9,0.059,1.418177,0.849771,0.84973,0.849699,0.849714
10,0.0508,1.378966,0.853211,0.853325,0.853414,0.853208


[I 2025-03-23 07:56:55,080] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.0044803639948611095, 'weight_decay': 0.001, 'warmup_steps': 22, 'lambda_param': 0.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4469,1.195965,0.834862,0.834975,0.835059,0.834859
2,0.7362,1.260177,0.845183,0.847943,0.844353,0.844609
3,0.4182,1.305393,0.850917,0.854528,0.849983,0.850255
4,0.2477,1.314621,0.84289,0.845625,0.842058,0.842307
5,0.166,1.270416,0.856651,0.856647,0.85654,0.856583
6,0.1192,1.218361,0.856651,0.858454,0.855993,0.856269
7,0.086,1.283597,0.849771,0.850712,0.849278,0.8495
8,0.0675,1.256031,0.847477,0.847738,0.847194,0.847331
9,0.0558,1.220718,0.850917,0.850985,0.850741,0.850822
10,0.0486,1.274964,0.852064,0.852057,0.851951,0.851994


[I 2025-03-23 07:59:34,249] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.002518208951412107, 'weight_decay': 0.0, 'warmup_steps': 14, 'lambda_param': 0.5, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5363,1.271868,0.821101,0.821377,0.821377,0.821101
2,0.8391,1.194751,0.848624,0.848937,0.84832,0.848468
3,0.5,1.302493,0.837156,0.84027,0.836259,0.836482
4,0.3122,1.225074,0.852064,0.853016,0.851572,0.851797
5,0.2015,1.307705,0.854358,0.856649,0.853614,0.853896
6,0.1395,1.186339,0.869266,0.869213,0.869306,0.869241
7,0.1002,1.232111,0.858945,0.860764,0.858287,0.858568
8,0.0776,1.193666,0.862385,0.862537,0.862171,0.862281
9,0.0645,1.197939,0.857798,0.857798,0.857919,0.857786
10,0.0543,1.205404,0.868119,0.868419,0.867843,0.867993


[I 2025-03-23 08:02:20,094] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.0035985903311758468, 'weight_decay': 0.007, 'warmup_steps': 15, 'lambda_param': 0.4, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4633,1.221037,0.83945,0.839839,0.839774,0.839449
2,0.7733,1.317303,0.834862,0.841294,0.833586,0.833664
3,0.4468,1.259335,0.847477,0.848408,0.846984,0.847202
4,0.271,1.295331,0.84289,0.843801,0.842395,0.842607
5,0.1777,1.291718,0.856651,0.856647,0.85654,0.856583
6,0.1219,1.209644,0.857798,0.859492,0.857161,0.857435
7,0.0891,1.253263,0.847477,0.849213,0.846815,0.84707
8,0.0705,1.22127,0.856651,0.857454,0.856203,0.85642
9,0.0598,1.203624,0.860092,0.86068,0.859708,0.859903
10,0.0508,1.221893,0.856651,0.857454,0.856203,0.85642


[I 2025-03-23 08:07:02,197] Trial 14 finished with value: 0.8575816487273915 and parameters: {'learning_rate': 0.0035985903311758468, 'weight_decay': 0.007, 'warmup_steps': 15, 'lambda_param': 0.4, 'temperature': 2.5}. Best is trial 10 with value: 0.8691972462578159.


Trial 15 with params: {'learning_rate': 0.0021853805778439743, 'weight_decay': 0.009000000000000001, 'warmup_steps': 15, 'lambda_param': 0.4, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5557,1.393433,0.817661,0.82105,0.818588,0.81743
2,0.8972,1.232878,0.853211,0.853152,0.853204,0.853173
3,0.5437,1.377761,0.84289,0.846901,0.84189,0.842117
4,0.3378,1.303367,0.847477,0.848408,0.846984,0.847202
5,0.2177,1.307284,0.865826,0.867235,0.865254,0.865528
6,0.1535,1.229696,0.864679,0.86476,0.864507,0.864593
7,0.1142,1.332063,0.863532,0.863823,0.863255,0.863401
8,0.0881,1.1986,0.863532,0.863582,0.863381,0.863453
9,0.0713,1.205389,0.865826,0.866505,0.865423,0.865633
10,0.06,1.238616,0.864679,0.864703,0.864549,0.864608


[I 2025-03-23 08:10:12,109] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.002549155387318145, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5037,1.357121,0.824541,0.830131,0.825724,0.824114
2,0.8406,1.256548,0.848624,0.851569,0.847773,0.848041
3,0.4944,1.425492,0.83945,0.848437,0.837964,0.837946
4,0.3117,1.344409,0.84633,0.850941,0.845268,0.845498
5,0.1991,1.375888,0.848624,0.852204,0.847689,0.847951
6,0.1411,1.369565,0.849771,0.851524,0.84911,0.849369
7,0.1048,1.348352,0.856651,0.860869,0.855656,0.855946
8,0.0809,1.248576,0.857798,0.858866,0.857287,0.857528
9,0.0661,1.15516,0.861239,0.861286,0.861087,0.861158
10,0.0561,1.217308,0.857798,0.858866,0.857287,0.857528


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-23 08:15:14,451] Trial 16 finished with value: 0.8552578884134646 and parameters: {'learning_rate': 0.002549155387318145, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 10 with value: 0.8691972462578159.


Trial 17 with params: {'learning_rate': 0.0016934976427839907, 'weight_decay': 0.006, 'warmup_steps': 17, 'lambda_param': 0.2, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5781,1.432875,0.808486,0.817391,0.81,0.807605
2,0.96,1.262233,0.837156,0.841987,0.836049,0.836218
3,0.6134,1.258933,0.84633,0.84925,0.845479,0.845739
4,0.4049,1.177284,0.856651,0.857623,0.856161,0.856393
5,0.2745,1.288816,0.860092,0.86117,0.859582,0.859826
6,0.1896,1.174971,0.865826,0.865827,0.865717,0.865762
7,0.1379,1.27115,0.864679,0.865281,0.864297,0.864496
8,0.1044,1.214602,0.862385,0.862339,0.862339,0.862339
9,0.0846,1.162524,0.869266,0.869232,0.869348,0.869249
10,0.0695,1.224392,0.865826,0.865769,0.865844,0.865796


[I 2025-03-23 08:18:09,325] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.004571777982388411, 'weight_decay': 0.007, 'warmup_steps': 19, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4299,1.172019,0.845183,0.845164,0.845279,0.845167
2,0.7222,1.310848,0.83945,0.846446,0.838133,0.838221
3,0.4104,1.341369,0.840596,0.840945,0.840269,0.84042
4,0.2508,1.478903,0.830275,0.837043,0.828955,0.828976
5,0.1621,1.384158,0.836009,0.836248,0.835722,0.835852


[I 2025-03-23 08:19:42,256] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.002584250012368157, 'weight_decay': 0.002, 'warmup_steps': 5, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.502,1.2746,0.834862,0.835503,0.83527,0.834855
2,0.8478,1.243077,0.836009,0.839917,0.835007,0.835203
3,0.5028,1.361917,0.836009,0.839917,0.835007,0.835203
4,0.3085,1.326264,0.856651,0.856836,0.856414,0.856533
5,0.2006,1.265919,0.864679,0.864644,0.86476,0.864661
6,0.1377,1.359586,0.852064,0.853016,0.851572,0.851797
7,0.1014,1.295664,0.857798,0.857744,0.857834,0.857771
8,0.0781,1.270655,0.864679,0.86466,0.864591,0.864621
9,0.0642,1.204196,0.860092,0.860057,0.860171,0.860073
10,0.055,1.246203,0.864679,0.864729,0.864844,0.864672


[I 2025-03-23 08:24:14,745] Trial 19 finished with value: 0.8611726224074798 and parameters: {'learning_rate': 0.002584250012368157, 'weight_decay': 0.002, 'warmup_steps': 5, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 10 with value: 0.8691972462578159.


Trial 20 with params: {'learning_rate': 0.0018324036589012613, 'weight_decay': 0.003, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5625,1.419693,0.822248,0.827414,0.823388,0.821854
2,0.9405,1.231498,0.855505,0.856561,0.854993,0.85523
3,0.5923,1.344425,0.832569,0.836258,0.831586,0.831772
4,0.385,1.315681,0.838303,0.840451,0.837554,0.837791
5,0.262,1.20858,0.863532,0.865637,0.862834,0.863135
6,0.1767,1.217458,0.860092,0.86117,0.859582,0.859826
7,0.127,1.273724,0.865826,0.865877,0.865675,0.865748
8,0.0976,1.201326,0.858945,0.859025,0.859129,0.85894
9,0.0786,1.193491,0.857798,0.858866,0.857287,0.857528
10,0.0647,1.212508,0.860092,0.860071,0.860003,0.860032


[I 2025-03-23 08:27:15,127] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.002034702167296518, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5235,1.213844,0.837156,0.837094,0.837143,0.837114
2,0.9045,1.28512,0.840596,0.843028,0.839806,0.840049
3,0.5652,1.318827,0.848624,0.84905,0.848278,0.848444
4,0.3648,1.355369,0.841743,0.843795,0.841016,0.841262
5,0.2354,1.432312,0.854358,0.855625,0.854919,0.854325
6,0.1632,1.338837,0.848624,0.850487,0.847941,0.848201
7,0.1187,1.276809,0.858945,0.859686,0.859382,0.858936
8,0.0943,1.314385,0.850917,0.851621,0.850488,0.85069
9,0.074,1.322807,0.84633,0.847529,0.845773,0.846006
10,0.0628,1.27844,0.855505,0.855554,0.855666,0.855498


[I 2025-03-23 08:30:12,741] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.003610630957926916, 'weight_decay': 0.004, 'warmup_steps': 23, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4874,1.253278,0.819954,0.821141,0.820504,0.819914
2,0.7597,1.402428,0.834862,0.835658,0.834386,0.83458
3,0.4344,1.209932,0.849771,0.852292,0.848983,0.849255
4,0.2544,1.406061,0.841743,0.844324,0.840932,0.841178
5,0.1682,1.238469,0.857798,0.858131,0.857498,0.857651
6,0.1195,1.156577,0.863532,0.863823,0.863255,0.863401
7,0.0906,1.21299,0.857798,0.858379,0.857413,0.857606
8,0.0727,1.208942,0.857798,0.857873,0.857624,0.857708
9,0.0582,1.156367,0.865826,0.865849,0.86597,0.865817
10,0.0501,1.155504,0.866972,0.866938,0.867054,0.866955


[I 2025-03-23 08:34:57,478] Trial 22 finished with value: 0.8657322778688039 and parameters: {'learning_rate': 0.003610630957926916, 'weight_decay': 0.004, 'warmup_steps': 23, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 10 with value: 0.8691972462578159.


Trial 23 with params: {'learning_rate': 0.0036178686096916646, 'weight_decay': 0.004, 'warmup_steps': 29, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4859,1.33508,0.830275,0.836151,0.831481,0.829842
2,0.7806,1.393989,0.826835,0.838481,0.825114,0.824755
3,0.4536,1.362171,0.850917,0.85523,0.849899,0.85016
4,0.2713,1.396257,0.84289,0.845079,0.842142,0.842392
5,0.1769,1.285497,0.868119,0.868087,0.868054,0.868069
6,0.1247,1.293021,0.856651,0.8573,0.856245,0.856446
7,0.0933,1.289509,0.857798,0.857873,0.857624,0.857708
8,0.0701,1.23208,0.864679,0.864703,0.864549,0.864608
9,0.0584,1.172878,0.876147,0.876113,0.876231,0.87613
10,0.0519,1.217728,0.861239,0.861834,0.861634,0.861234


[I 2025-03-23 08:39:27,998] Trial 23 finished with value: 0.8691972462578159 and parameters: {'learning_rate': 0.0036178686096916646, 'weight_decay': 0.004, 'warmup_steps': 29, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 10 with value: 0.8691972462578159.


Trial 24 with params: {'learning_rate': 0.002398616287867879, 'weight_decay': 0.002, 'warmup_steps': 36, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5724,1.378121,0.822248,0.826327,0.823261,0.821961
2,0.884,1.16128,0.853211,0.854652,0.852614,0.85287
3,0.5273,1.416376,0.841743,0.847465,0.840553,0.840717
4,0.3342,1.320312,0.840596,0.844233,0.839638,0.839863
5,0.2205,1.28593,0.860092,0.861575,0.859497,0.859766
6,0.1495,1.325942,0.848624,0.851879,0.847731,0.847997
7,0.1112,1.214836,0.865826,0.866505,0.865423,0.865633
8,0.0869,1.259034,0.863532,0.863498,0.863465,0.86348
9,0.0707,1.206537,0.865826,0.866024,0.865591,0.865715
10,0.0595,1.170555,0.864679,0.86466,0.864591,0.864621


[I 2025-03-23 08:42:17,501] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.002224630501428965, 'weight_decay': 0.001, 'warmup_steps': 32, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5704,1.342723,0.821101,0.823022,0.821798,0.821007
2,0.9039,1.242896,0.837156,0.840583,0.836217,0.836432
3,0.5441,1.280527,0.849771,0.853201,0.848857,0.849126
4,0.3419,1.403614,0.840596,0.842283,0.839932,0.840171
5,0.2267,1.226395,0.857798,0.857944,0.857582,0.85769
6,0.1566,1.237394,0.857798,0.85803,0.85754,0.857672
7,0.1132,1.245081,0.855505,0.85547,0.855582,0.855486
8,0.0883,1.190831,0.854358,0.854352,0.854246,0.854289
9,0.0718,1.205347,0.865826,0.865827,0.865717,0.865762
10,0.0602,1.190224,0.860092,0.860045,0.860045,0.860045


[I 2025-03-23 08:46:43,672] Trial 25 finished with value: 0.8599672505752209 and parameters: {'learning_rate': 0.002224630501428965, 'weight_decay': 0.001, 'warmup_steps': 32, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 10 with value: 0.8691972462578159.


Trial 26 with params: {'learning_rate': 0.0009805560050968028, 'weight_decay': 0.004, 'warmup_steps': 23, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6729,1.525058,0.799312,0.806597,0.800697,0.798568
2,1.127,1.220403,0.836009,0.836732,0.835554,0.835745
3,0.794,1.556382,0.830275,0.837043,0.828955,0.828976
4,0.5715,1.305425,0.838303,0.842965,0.837217,0.837399
5,0.4215,1.241983,0.87156,0.872398,0.872022,0.871549
6,0.3138,1.244679,0.855505,0.858227,0.854698,0.854989
7,0.2461,1.343382,0.869266,0.871287,0.868591,0.868901
8,0.1901,1.224401,0.879587,0.879909,0.879315,0.879472
9,0.1525,1.203885,0.873853,0.873903,0.874021,0.873847
10,0.1228,1.251796,0.872706,0.872832,0.872516,0.872618


[I 2025-03-23 08:51:16,852] Trial 26 finished with value: 0.8714453447903668 and parameters: {'learning_rate': 0.0009805560050968028, 'weight_decay': 0.004, 'warmup_steps': 23, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 26 with value: 0.8714453447903668.


Trial 27 with params: {'learning_rate': 0.0008914811069059884, 'weight_decay': 0.005, 'warmup_steps': 18, 'lambda_param': 0.1, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6784,1.524222,0.797018,0.805646,0.798529,0.796085
2,1.1532,1.286328,0.834862,0.837948,0.833965,0.834178
3,0.8327,1.69839,0.827982,0.845448,0.825903,0.825096
4,0.6048,1.424541,0.824541,0.834933,0.822904,0.822609
5,0.4607,1.286003,0.844037,0.845039,0.843521,0.84374
6,0.3578,1.320667,0.855505,0.856561,0.854993,0.85523
7,0.2811,1.441543,0.857798,0.862218,0.856782,0.857076
8,0.2213,1.327131,0.858945,0.859054,0.85875,0.858847
9,0.1798,1.456558,0.848624,0.85019,0.849246,0.848573
10,0.1453,1.369789,0.864679,0.864679,0.864802,0.864668


[I 2025-03-23 08:55:52,908] Trial 27 finished with value: 0.8610629385731009 and parameters: {'learning_rate': 0.0008914811069059884, 'weight_decay': 0.005, 'warmup_steps': 18, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 26 with value: 0.8714453447903668.


Trial 28 with params: {'learning_rate': 0.00021731627366121237, 'weight_decay': 0.003, 'warmup_steps': 39, 'lambda_param': 0.1, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.02,1.628179,0.786697,0.793105,0.788015,0.785994
2,1.4764,1.455371,0.813073,0.815487,0.812232,0.81238
3,1.3198,1.603067,0.799312,0.814708,0.797245,0.796006
4,1.207,1.407549,0.81422,0.817644,0.813231,0.813336
5,1.081,1.412526,0.816514,0.818654,0.817252,0.816397
6,0.9775,1.727266,0.809633,0.816897,0.811,0.808954
7,0.8847,1.463371,0.815367,0.816368,0.815873,0.815338
8,0.8119,1.532888,0.821101,0.821456,0.820746,0.820889
9,0.7541,1.475355,0.844037,0.845864,0.843353,0.843601
10,0.6973,1.620209,0.815367,0.819051,0.816336,0.815102


[I 2025-03-23 08:58:59,703] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 0.004465858399905994, 'weight_decay': 0.002, 'warmup_steps': 40, 'lambda_param': 0.9, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4817,1.340685,0.834862,0.837924,0.835733,0.834692
2,0.7372,1.385906,0.830275,0.834983,0.829166,0.829297
3,0.4254,1.436487,0.831422,0.838447,0.830081,0.830097
4,0.2503,1.447156,0.84289,0.845079,0.842142,0.842392
5,0.1657,1.353863,0.853211,0.856208,0.852362,0.852646


[I 2025-03-23 09:00:27,255] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.000311584806759745, 'weight_decay': 0.008, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8629,1.604764,0.792431,0.797949,0.793645,0.791877
2,1.4168,1.41012,0.817661,0.819867,0.816862,0.817035
3,1.2244,1.416549,0.811927,0.819575,0.810474,0.810249
4,1.0756,1.429133,0.816514,0.821715,0.815315,0.815324
5,0.9293,1.313802,0.830275,0.830212,0.83026,0.830231


[I 2025-03-23 09:01:53,101] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.0022164632395502517, 'weight_decay': 0.005, 'warmup_steps': 22, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5932,1.29415,0.832569,0.832512,0.832512,0.832512
2,0.8962,1.210093,0.844037,0.846931,0.843184,0.843436
3,0.5391,1.224098,0.863532,0.865149,0.862918,0.863199
4,0.3439,1.273458,0.862385,0.866149,0.861455,0.861774
5,0.2238,1.322372,0.864679,0.865598,0.864212,0.864448
6,0.154,1.248489,0.862385,0.86411,0.86175,0.862034
7,0.1122,1.300044,0.858945,0.859601,0.85854,0.858743
8,0.0857,1.200791,0.861239,0.861181,0.861255,0.861208
9,0.0704,1.199283,0.866972,0.866919,0.867012,0.866947
10,0.0601,1.195082,0.866972,0.866955,0.866886,0.866916


[I 2025-03-23 09:06:20,604] Trial 31 finished with value: 0.8645584882612793 and parameters: {'learning_rate': 0.0022164632395502517, 'weight_decay': 0.005, 'warmup_steps': 22, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 26 with value: 0.8714453447903668.


Trial 32 with params: {'learning_rate': 0.0007193454106078407, 'weight_decay': 0.006, 'warmup_steps': 30, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7486,1.712399,0.788991,0.800801,0.790772,0.787542
2,1.2076,1.260421,0.837156,0.83781,0.836722,0.836908
3,0.9028,1.354859,0.83945,0.843945,0.838385,0.83858
4,0.6809,1.385805,0.84289,0.846559,0.841932,0.842167
5,0.5312,1.224528,0.865826,0.865849,0.86597,0.865817
6,0.4209,1.322448,0.855505,0.85783,0.856256,0.855413
7,0.3372,1.310671,0.861239,0.861351,0.861044,0.861142
8,0.2733,1.326293,0.862385,0.862385,0.862507,0.862374
9,0.2258,1.522605,0.860092,0.860617,0.860466,0.860089
10,0.1866,1.353387,0.868119,0.868871,0.868559,0.868111


[I 2025-03-23 09:11:09,274] Trial 32 finished with value: 0.8657322778688039 and parameters: {'learning_rate': 0.0007193454106078407, 'weight_decay': 0.006, 'warmup_steps': 30, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 26 with value: 0.8714453447903668.


Trial 33 with params: {'learning_rate': 0.0036082296418674083, 'weight_decay': 0.004, 'warmup_steps': 33, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.509,1.273009,0.829128,0.831456,0.829892,0.829009
2,0.7718,1.311636,0.833716,0.840343,0.832418,0.832476
3,0.4398,1.180392,0.858945,0.859461,0.858582,0.858766
4,0.2661,1.39836,0.844037,0.847555,0.8431,0.843344
5,0.1737,1.314995,0.848624,0.848673,0.848783,0.848617
6,0.1207,1.232826,0.858945,0.859756,0.858498,0.858717
7,0.0894,1.278357,0.858945,0.858899,0.859003,0.858923
8,0.0731,1.293024,0.852064,0.852334,0.851783,0.851922
9,0.0592,1.24324,0.862385,0.86298,0.862002,0.8622
10,0.0522,1.299178,0.860092,0.860327,0.859834,0.859967


[I 2025-03-23 09:14:33,828] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.004061484546648457, 'weight_decay': 0.004, 'warmup_steps': 18, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4554,1.22285,0.826835,0.828441,0.827471,0.826769
2,0.7496,1.286883,0.838303,0.840204,0.837596,0.837832
3,0.432,1.519081,0.815367,0.829642,0.813431,0.812651
4,0.2586,1.299419,0.853211,0.853923,0.852783,0.852988
5,0.1725,1.290758,0.862385,0.862407,0.862255,0.862313
6,0.1236,1.370097,0.853211,0.85536,0.852488,0.852765
7,0.0917,1.313554,0.858945,0.859461,0.858582,0.858766
8,0.0705,1.306581,0.858945,0.859601,0.85854,0.858743
9,0.06,1.230833,0.856651,0.856614,0.856582,0.856597
10,0.0512,1.306813,0.862385,0.862328,0.862381,0.86235


[I 2025-03-23 09:17:39,850] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.0007383776373507603, 'weight_decay': 0.003, 'warmup_steps': 30, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.744,1.686423,0.797018,0.805646,0.798529,0.796085
2,1.1971,1.258018,0.840596,0.840844,0.840311,0.840443
3,0.8921,1.301282,0.844037,0.848242,0.843016,0.843244
4,0.6707,1.263081,0.844037,0.845864,0.843353,0.843601
5,0.5182,1.28059,0.861239,0.861181,0.861255,0.861208
6,0.4027,1.273255,0.869266,0.869517,0.869012,0.86915
7,0.324,1.266972,0.865826,0.866664,0.865381,0.865609
8,0.2618,1.274591,0.865826,0.866067,0.866096,0.865826
9,0.2174,1.328051,0.866972,0.866972,0.867096,0.866961
10,0.1791,1.333561,0.87156,0.871853,0.871853,0.87156


[I 2025-03-23 09:21:54,472] Trial 35 finished with value: 0.8691669474393531 and parameters: {'learning_rate': 0.0007383776373507603, 'weight_decay': 0.003, 'warmup_steps': 30, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 26 with value: 0.8714453447903668.


Trial 36 with params: {'learning_rate': 0.0008409003188062777, 'weight_decay': 0.004, 'warmup_steps': 26, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7107,1.415103,0.809633,0.810541,0.810116,0.809608
2,1.1785,1.250699,0.833716,0.83608,0.832923,0.833145
3,0.854,1.366233,0.84289,0.846901,0.84189,0.842117
4,0.6293,1.342141,0.836009,0.841012,0.83488,0.835035
5,0.4754,1.268289,0.858945,0.859185,0.859213,0.858945
6,0.366,1.217165,0.872706,0.873015,0.872432,0.872584
7,0.2883,1.237514,0.864679,0.865026,0.864381,0.864539
8,0.2315,1.362743,0.854358,0.854596,0.854624,0.854358
9,0.1892,1.386024,0.862385,0.864228,0.863055,0.862327
10,0.1536,1.316844,0.857798,0.859872,0.858508,0.857723


[I 2025-03-23 09:25:03,307] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.0002483379277236458, 'weight_decay': 0.004, 'warmup_steps': 19, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9418,1.618912,0.78211,0.788843,0.783468,0.781332
2,1.4433,1.42826,0.81422,0.817342,0.813274,0.813395
3,1.2761,1.439414,0.811927,0.819575,0.810474,0.810249
4,1.1518,1.402559,0.813073,0.81696,0.812021,0.812093
5,1.0202,1.338428,0.822248,0.822808,0.82263,0.822242
6,0.9088,1.542491,0.817661,0.823155,0.818841,0.817216
7,0.8199,1.479347,0.831422,0.833268,0.832102,0.831342
8,0.7422,1.481086,0.833716,0.834586,0.833218,0.833416
9,0.6765,1.500813,0.841743,0.844611,0.84089,0.841134
10,0.629,1.623293,0.824541,0.828993,0.825598,0.824225


[I 2025-03-23 09:28:46,741] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.00014198795619548116, 'weight_decay': 0.005, 'warmup_steps': 28, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1342,1.641609,0.786697,0.788927,0.787467,0.786536
2,1.553,1.555364,0.801606,0.803009,0.800928,0.801076
3,1.4553,1.502986,0.803899,0.806471,0.803012,0.803116
4,1.3736,1.42834,0.803899,0.804004,0.803644,0.803738
5,1.2868,1.407581,0.81078,0.810713,0.810737,0.810724
6,1.208,1.576685,0.791284,0.800013,0.792814,0.790292
7,1.1382,1.433954,0.811927,0.814831,0.812789,0.811732
8,1.0793,1.402393,0.81422,0.815245,0.813652,0.813828
9,1.0268,1.407017,0.821101,0.821994,0.820578,0.820761
10,0.9775,1.466899,0.81078,0.813269,0.811579,0.810624


[I 2025-03-23 09:31:37,699] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.0009006181329331087, 'weight_decay': 0.001, 'warmup_steps': 21, 'lambda_param': 0.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.692,1.509317,0.790138,0.796005,0.791393,0.789526
2,1.1497,1.287388,0.831422,0.835605,0.830376,0.830537
3,0.8215,1.557567,0.830275,0.8395,0.828745,0.828608
4,0.6086,1.299151,0.841743,0.845564,0.840764,0.84099
5,0.4634,1.345252,0.852064,0.852561,0.851699,0.851877


[I 2025-03-23 09:33:28,867] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.00038098201755779005, 'weight_decay': 0.002, 'warmup_steps': 27, 'lambda_param': 0.2, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8592,1.680178,0.787844,0.799899,0.789646,0.786347
2,1.3519,1.339805,0.827982,0.828252,0.827671,0.827804
3,1.1193,1.361948,0.827982,0.832285,0.826913,0.82705
4,0.9491,1.361292,0.831422,0.834318,0.830544,0.830749
5,0.806,1.39989,0.837156,0.838687,0.837775,0.837101
6,0.6773,1.36933,0.849771,0.850036,0.849489,0.849626
7,0.5874,1.41768,0.84633,0.846393,0.846152,0.846232
8,0.5133,1.433104,0.849771,0.852021,0.849025,0.849295
9,0.4509,1.485521,0.84633,0.848115,0.846994,0.846265
10,0.4027,1.450639,0.853211,0.855265,0.853919,0.853134


[I 2025-03-23 09:36:24,187] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.001860404176999338, 'weight_decay': 0.003, 'warmup_steps': 30, 'lambda_param': 0.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5933,1.318097,0.816514,0.818192,0.817168,0.816436
2,0.9387,1.200227,0.848624,0.849476,0.848152,0.848365
3,0.5844,1.560225,0.826835,0.835684,0.825324,0.825174
4,0.3801,1.38912,0.829128,0.833275,0.828082,0.828232
5,0.2544,1.360084,0.853211,0.853923,0.852783,0.852988


[I 2025-03-23 09:38:16,637] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.00151310416767446, 'weight_decay': 0.003, 'warmup_steps': 25, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6372,1.425594,0.81078,0.816573,0.812,0.810275
2,1.0014,1.225733,0.858945,0.859601,0.85854,0.858743
3,0.6564,1.399404,0.848624,0.852901,0.847605,0.847855
4,0.4425,1.244462,0.83945,0.841737,0.83868,0.83892
5,0.3074,1.194526,0.862385,0.862351,0.862465,0.862367
6,0.2172,1.232073,0.861239,0.863075,0.860581,0.860868
7,0.1578,1.272349,0.857798,0.858247,0.857456,0.85763
8,0.1208,1.255615,0.862385,0.86313,0.86196,0.862176
9,0.0961,1.226841,0.87156,0.871525,0.871643,0.871543
10,0.0778,1.251178,0.863532,0.863582,0.863381,0.863453


[I 2025-03-23 09:42:56,350] Trial 42 finished with value: 0.8634011492058895 and parameters: {'learning_rate': 0.00151310416767446, 'weight_decay': 0.003, 'warmup_steps': 25, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}. Best is trial 26 with value: 0.8714453447903668.


Trial 43 with params: {'learning_rate': 0.00456494760304353, 'weight_decay': 0.004, 'warmup_steps': 23, 'lambda_param': 0.4, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4426,1.165366,0.840596,0.842723,0.841321,0.840504
2,0.7169,1.195935,0.853211,0.855627,0.852446,0.852727
3,0.4094,1.229221,0.857798,0.861166,0.856908,0.857209
4,0.2445,1.279236,0.84289,0.842831,0.842858,0.842843
5,0.1652,1.206591,0.861239,0.862621,0.860666,0.860931
6,0.1145,1.202101,0.854358,0.854299,0.854372,0.854325
7,0.0913,1.231558,0.856651,0.856605,0.856708,0.856629
8,0.07,1.209659,0.854358,0.854399,0.854204,0.854273
9,0.0585,1.193835,0.858945,0.859227,0.858666,0.85881
10,0.0503,1.188838,0.860092,0.860327,0.859834,0.859967


[I 2025-03-23 09:45:50,748] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.0014691315499909523, 'weight_decay': 0.009000000000000001, 'warmup_steps': 39, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.645,1.553647,0.794725,0.807026,0.796529,0.793276
2,1.0128,1.194316,0.844037,0.844245,0.843774,0.843898
3,0.6688,1.427281,0.844037,0.844715,0.843605,0.843799
4,0.4555,1.246132,0.836009,0.83659,0.835596,0.835774
5,0.3186,1.220976,0.863532,0.863498,0.863465,0.86348
6,0.2249,1.182809,0.863532,0.863727,0.863297,0.86342
7,0.165,1.296269,0.853211,0.853325,0.853414,0.853208
8,0.1293,1.330744,0.858945,0.859054,0.85875,0.858847
9,0.0976,1.243077,0.869266,0.86921,0.869264,0.869232
10,0.0805,1.260071,0.87156,0.871504,0.871558,0.871527


[I 2025-03-23 09:50:15,163] Trial 44 finished with value: 0.8680802305833385 and parameters: {'learning_rate': 0.0014691315499909523, 'weight_decay': 0.009000000000000001, 'warmup_steps': 39, 'lambda_param': 0.9, 'temperature': 6.5}. Best is trial 26 with value: 0.8714453447903668.


Trial 45 with params: {'learning_rate': 0.0016014260749467285, 'weight_decay': 0.009000000000000001, 'warmup_steps': 38, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6267,1.460678,0.809633,0.816025,0.810916,0.809054
2,0.9887,1.19652,0.850917,0.850883,0.850994,0.850898
3,0.6375,1.614628,0.833716,0.841272,0.832334,0.832339
4,0.4325,1.369689,0.832569,0.84083,0.831123,0.831075
5,0.2966,1.196516,0.873853,0.87405,0.874105,0.873853
6,0.2073,1.213578,0.855505,0.85696,0.854909,0.855169
7,0.1502,1.22642,0.858945,0.859227,0.858666,0.85881
8,0.1149,1.212683,0.858945,0.859227,0.858666,0.85881
9,0.0913,1.202924,0.869266,0.869671,0.869601,0.869265
10,0.0755,1.229503,0.860092,0.86068,0.859708,0.859903


[I 2025-03-23 09:53:15,089] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 0.00035209578167894637, 'weight_decay': 0.01, 'warmup_steps': 31, 'lambda_param': 0.8, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8758,1.612287,0.787844,0.79722,0.789435,0.786731
2,1.3875,1.409475,0.821101,0.826796,0.819862,0.819873
3,1.1724,1.357872,0.825688,0.832341,0.824366,0.824354
4,1.0104,1.423427,0.815367,0.823357,0.813895,0.813679
5,0.8572,1.306065,0.841743,0.841688,0.841774,0.841713
6,0.7315,1.321832,0.84633,0.84634,0.846194,0.846249
7,0.6403,1.454062,0.830275,0.830778,0.830639,0.830272
8,0.5631,1.425313,0.845183,0.846903,0.844521,0.84477
9,0.5014,1.472228,0.848624,0.849141,0.848994,0.848621
10,0.4519,1.592742,0.836009,0.84096,0.837112,0.83568


[I 2025-03-23 09:56:31,402] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.00017209337253776082, 'weight_decay': 0.007, 'warmup_steps': 26, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0488,1.622257,0.78211,0.784556,0.782921,0.781916
2,1.5126,1.538412,0.806193,0.809968,0.805138,0.805176
3,1.3968,1.498531,0.800459,0.802862,0.799592,0.799691
4,1.301,1.417349,0.811927,0.813115,0.811316,0.811489
5,1.195,1.421781,0.809633,0.811287,0.810285,0.809552


[I 2025-03-23 09:57:55,986] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.0016030292289701987, 'weight_decay': 0.01, 'warmup_steps': 43, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.644,1.462365,0.806193,0.813615,0.80758,0.805474
2,0.9928,1.163377,0.853211,0.853157,0.853246,0.853183
3,0.6381,1.275449,0.850917,0.850883,0.850994,0.850898
4,0.4289,1.236661,0.84289,0.843801,0.842395,0.842607
5,0.2958,1.264019,0.854358,0.854399,0.854204,0.854273


[I 2025-03-23 09:59:21,275] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.0028348868920629153, 'weight_decay': 0.01, 'warmup_steps': 28, 'lambda_param': 0.5, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5769,1.207665,0.833716,0.834047,0.834017,0.833715
2,0.8265,1.226206,0.841743,0.844914,0.840848,0.841088
3,0.4847,1.195117,0.856651,0.857454,0.856203,0.85642
4,0.3015,1.246064,0.856651,0.857038,0.856329,0.856493
5,0.1946,1.328209,0.853211,0.853435,0.852951,0.85308
6,0.1339,1.300392,0.853211,0.855627,0.852446,0.852727
7,0.1006,1.279385,0.857798,0.85803,0.85754,0.857672
8,0.0778,1.252324,0.849771,0.849869,0.849573,0.849666
9,0.0649,1.248755,0.855505,0.855647,0.855288,0.855395
10,0.0541,1.281946,0.850917,0.850883,0.850994,0.850898


[I 2025-03-23 10:02:20,405] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 0.0021133792752108674, 'weight_decay': 0.005, 'warmup_steps': 20, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5885,1.294278,0.824541,0.82462,0.824714,0.824536
2,0.9031,1.23943,0.850917,0.850931,0.850783,0.850839
3,0.5495,1.287138,0.844037,0.843976,0.844026,0.843996
4,0.353,1.287037,0.838303,0.838761,0.837933,0.838098
5,0.2337,1.256086,0.853211,0.853176,0.853288,0.853192
6,0.1583,1.208097,0.857798,0.85775,0.85775,0.85775
7,0.1196,1.229734,0.857798,0.85775,0.85775,0.85775
8,0.0912,1.220067,0.849771,0.849724,0.849825,0.849747
9,0.0744,1.145986,0.861239,0.861582,0.86155,0.861238
10,0.0626,1.231556,0.860092,0.860492,0.860424,0.860091


[I 2025-03-23 10:06:57,883] Trial 50 finished with value: 0.8589299266213601 and parameters: {'learning_rate': 0.0021133792752108674, 'weight_decay': 0.005, 'warmup_steps': 20, 'lambda_param': 1.0, 'temperature': 6.5}. Best is trial 26 with value: 0.8714453447903668.


Trial 51 with params: {'learning_rate': 0.0009335953268915212, 'weight_decay': 0.007, 'warmup_steps': 41, 'lambda_param': 0.7000000000000001, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7194,1.578621,0.799312,0.807514,0.800781,0.798451
2,1.135,1.272052,0.841743,0.841867,0.841521,0.841623
3,0.8144,1.440114,0.838303,0.845984,0.836922,0.836965
4,0.5887,1.237514,0.850917,0.851477,0.85053,0.850716
5,0.4471,1.260833,0.858945,0.858968,0.859087,0.858936
6,0.3346,1.314203,0.865826,0.865877,0.865675,0.865748
7,0.2638,1.281483,0.87156,0.871589,0.871432,0.871492
8,0.2081,1.333712,0.866972,0.867445,0.866633,0.866815
9,0.1643,1.295043,0.866972,0.866955,0.866886,0.866916
10,0.1347,1.289493,0.872706,0.873312,0.873106,0.872702


[I 2025-03-23 10:11:39,559] Trial 51 finished with value: 0.8680802305833385 and parameters: {'learning_rate': 0.0009335953268915212, 'weight_decay': 0.007, 'warmup_steps': 41, 'lambda_param': 0.7000000000000001, 'temperature': 6.0}. Best is trial 26 with value: 0.8714453447903668.


Trial 52 with params: {'learning_rate': 0.0009102214711612892, 'weight_decay': 0.005, 'warmup_steps': 32, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7307,1.559713,0.793578,0.800109,0.794898,0.792897
2,1.1546,1.310788,0.836009,0.837061,0.83547,0.83568
3,0.8354,1.460322,0.840596,0.84694,0.839343,0.839471
4,0.6088,1.320008,0.845183,0.846903,0.844521,0.84477
5,0.4525,1.414676,0.848624,0.850254,0.847983,0.848237
6,0.3435,1.320485,0.850917,0.851477,0.85053,0.850716
7,0.2696,1.365019,0.858945,0.861013,0.858245,0.858534
8,0.2116,1.446052,0.856651,0.858454,0.855993,0.856269
9,0.1733,1.407682,0.853211,0.853186,0.853119,0.853148
10,0.1388,1.393264,0.858945,0.859538,0.859339,0.85894


[I 2025-03-23 10:14:41,989] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.0025418651451781585, 'weight_decay': 0.008, 'warmup_steps': 43, 'lambda_param': 0.5, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.583,1.316556,0.826835,0.830959,0.82785,0.826555
2,0.8596,1.220161,0.84633,0.849557,0.845437,0.845694
3,0.5042,1.237613,0.836009,0.837061,0.83547,0.83568
4,0.31,1.362232,0.838303,0.843349,0.837175,0.837342
5,0.2026,1.31149,0.847477,0.847842,0.847152,0.847308


[I 2025-03-23 10:16:18,987] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 0.000984917276343171, 'weight_decay': 0.007, 'warmup_steps': 31, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.698,1.457752,0.801606,0.807659,0.802865,0.801027
2,1.1356,1.237669,0.844037,0.844451,0.843689,0.843852
3,0.8128,1.33263,0.848624,0.854487,0.847436,0.847642
4,0.5849,1.171741,0.854358,0.856144,0.853698,0.853969
5,0.4368,1.179721,0.855505,0.855793,0.855793,0.855505
6,0.3267,1.185249,0.865826,0.865979,0.866054,0.865824
7,0.2502,1.16068,0.868119,0.868062,0.868138,0.86809
8,0.1995,1.256654,0.864679,0.864679,0.864802,0.864668
9,0.1603,1.261528,0.870413,0.87088,0.870769,0.870411
10,0.1266,1.299226,0.864679,0.86587,0.865223,0.864653


[I 2025-03-23 10:20:24,434] Trial 54 finished with value: 0.878417341892034 and parameters: {'learning_rate': 0.000984917276343171, 'weight_decay': 0.007, 'warmup_steps': 31, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}. Best is trial 54 with value: 0.878417341892034.


Trial 55 with params: {'learning_rate': 7.242888062473813e-05, 'weight_decay': 0.001, 'warmup_steps': 32, 'lambda_param': 0.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4287,1.690173,0.768349,0.768273,0.768313,0.768289
2,1.6731,1.627411,0.786697,0.788086,0.785994,0.786102
3,1.5647,1.56669,0.792431,0.793768,0.791751,0.791877
4,1.5117,1.53641,0.791284,0.792708,0.790583,0.790702
5,1.4624,1.503315,0.78555,0.785477,0.785499,0.785487


[I 2025-03-23 10:22:05,918] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 0.0008619707247674679, 'weight_decay': 0.008, 'warmup_steps': 28, 'lambda_param': 0.7000000000000001, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7262,1.581981,0.790138,0.797678,0.791561,0.789299
2,1.1741,1.313275,0.836009,0.83867,0.835175,0.835401
3,0.8509,1.551806,0.830275,0.842371,0.828534,0.828193
4,0.623,1.266555,0.849771,0.849711,0.849783,0.849737
5,0.472,1.328285,0.849771,0.850008,0.850036,0.84977
6,0.3636,1.216321,0.87156,0.871648,0.87139,0.871478
7,0.2854,1.271624,0.866972,0.867733,0.866549,0.86677
8,0.2258,1.317849,0.863532,0.863727,0.863297,0.86342
9,0.1834,1.307918,0.858945,0.860223,0.859508,0.858914
10,0.1488,1.232421,0.868119,0.868074,0.86818,0.868098


[I 2025-03-23 10:27:02,276] Trial 56 finished with value: 0.8577075931043558 and parameters: {'learning_rate': 0.0008619707247674679, 'weight_decay': 0.008, 'warmup_steps': 28, 'lambda_param': 0.7000000000000001, 'temperature': 6.5}. Best is trial 54 with value: 0.878417341892034.


Trial 57 with params: {'learning_rate': 0.0005385725149521535, 'weight_decay': 0.008, 'warmup_steps': 33, 'lambda_param': 0.4, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7908,1.538977,0.791284,0.794606,0.792224,0.791003
2,1.2666,1.320048,0.83945,0.840264,0.838974,0.839175
3,0.9954,1.292885,0.841743,0.843326,0.8411,0.841339
4,0.7905,1.371095,0.830275,0.835364,0.829124,0.829237
5,0.6394,1.284328,0.855505,0.855554,0.855666,0.855498
6,0.5213,1.312558,0.860092,0.860991,0.859624,0.859853
7,0.4395,1.493213,0.838303,0.841908,0.837343,0.837559
8,0.3677,1.325235,0.857798,0.857744,0.857834,0.857771
9,0.3126,1.517288,0.850917,0.852491,0.851541,0.850867
10,0.2684,1.400178,0.860092,0.860286,0.860339,0.860091


[I 2025-03-23 10:29:54,229] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.000447152190074509, 'weight_decay': 0.01, 'warmup_steps': 37, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8527,1.524923,0.792431,0.795911,0.793393,0.792133
2,1.3343,1.340671,0.825688,0.826775,0.825124,0.825321
3,1.0734,1.342573,0.830275,0.833931,0.829292,0.829468
4,0.8903,1.335468,0.823394,0.823655,0.823082,0.823212
5,0.7296,1.334307,0.849771,0.850141,0.849446,0.849604


[I 2025-03-23 10:31:39,378] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.003320093416323534, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5982,1.260212,0.830275,0.830911,0.830681,0.830267
2,0.7891,1.24741,0.837156,0.838126,0.836638,0.836846
3,0.453,1.243855,0.857798,0.857944,0.857582,0.85769
4,0.2682,1.306714,0.838303,0.84128,0.837427,0.837657
5,0.1761,1.400627,0.856651,0.856929,0.856372,0.856514
6,0.127,1.29079,0.850917,0.851054,0.850699,0.850804
7,0.0954,1.231865,0.858945,0.859337,0.858624,0.858789
8,0.0746,1.224481,0.855505,0.85547,0.855582,0.855486
9,0.0622,1.13229,0.861239,0.861237,0.861129,0.861173
10,0.0523,1.187331,0.863532,0.863727,0.863297,0.86342


[I 2025-03-23 10:36:30,102] Trial 59 finished with value: 0.8633594189272645 and parameters: {'learning_rate': 0.003320093416323534, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33, 'lambda_param': 1.0, 'temperature': 7.0}. Best is trial 54 with value: 0.878417341892034.


Trial 60 with params: {'learning_rate': 0.00017559280388301614, 'weight_decay': 0.0, 'warmup_steps': 5, 'lambda_param': 1.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0134,1.624749,0.780963,0.782825,0.781668,0.780836
2,1.5279,1.567767,0.800459,0.807325,0.799044,0.798765
3,1.4001,1.493777,0.802752,0.804932,0.801928,0.802048
4,1.2987,1.421613,0.808486,0.809565,0.807896,0.808062
5,1.194,1.39185,0.811927,0.812543,0.812326,0.811918


[I 2025-03-23 10:38:00,133] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.00046487607901924186, 'weight_decay': 0.006, 'warmup_steps': 39, 'lambda_param': 0.8, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8386,1.612434,0.790138,0.796005,0.791393,0.789526
2,1.3363,1.349328,0.829128,0.829825,0.828671,0.828853
3,1.0741,1.316941,0.833716,0.836639,0.832839,0.833051
4,0.8834,1.36522,0.827982,0.829686,0.827292,0.827502
5,0.7237,1.393475,0.841743,0.841712,0.841648,0.841676
6,0.5975,1.296606,0.860092,0.860057,0.860171,0.860073
7,0.5131,1.436032,0.840596,0.841337,0.840143,0.840339
8,0.4415,1.460814,0.837156,0.837674,0.836764,0.836936
9,0.3782,1.500772,0.854358,0.855832,0.854961,0.854315
10,0.3335,1.513036,0.854358,0.856292,0.855045,0.854289


[I 2025-03-23 10:41:11,867] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.0005655053834503238, 'weight_decay': 0.007, 'warmup_steps': 43, 'lambda_param': 0.8, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8134,1.526581,0.797018,0.799428,0.797813,0.796851
2,1.2607,1.30961,0.825688,0.826171,0.825293,0.825453
3,0.9813,1.329286,0.840596,0.844928,0.839553,0.83976
4,0.7741,1.391457,0.827982,0.829081,0.827419,0.827619
5,0.6184,1.377949,0.852064,0.852698,0.851657,0.851852


[I 2025-03-23 10:42:41,874] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 0.0010968419589109076, 'weight_decay': 0.007, 'warmup_steps': 32, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6758,1.671089,0.78211,0.797937,0.784184,0.77997
2,1.0957,1.219467,0.841743,0.842042,0.841437,0.84158
3,0.7592,1.460424,0.825688,0.828962,0.824745,0.824913
4,0.5428,1.197163,0.845183,0.845277,0.844984,0.845076
5,0.3961,1.264151,0.850917,0.850863,0.850951,0.850889
6,0.2934,1.243935,0.857798,0.857764,0.857877,0.857779
7,0.2223,1.323835,0.864679,0.867805,0.863833,0.864158
8,0.1728,1.268171,0.866972,0.869239,0.866254,0.866568
9,0.1372,1.158241,0.864679,0.864679,0.864802,0.864668
10,0.1096,1.190434,0.869266,0.869222,0.869222,0.869222


[I 2025-03-23 10:47:35,470] Trial 63 finished with value: 0.8772081515552309 and parameters: {'learning_rate': 0.0010968419589109076, 'weight_decay': 0.007, 'warmup_steps': 32, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}. Best is trial 54 with value: 0.878417341892034.


Trial 64 with params: {'learning_rate': 0.0008694255528803575, 'weight_decay': 0.004, 'warmup_steps': 29, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7178,1.586896,0.795872,0.803763,0.797318,0.795026
2,1.1616,1.376129,0.830275,0.833305,0.829376,0.829572
3,0.845,1.436935,0.829128,0.835211,0.827871,0.827922
4,0.6178,1.230137,0.850917,0.853593,0.850109,0.850385
5,0.4672,1.255163,0.849771,0.850036,0.849489,0.849626


[I 2025-03-23 10:49:03,642] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.001600342932754346, 'weight_decay': 0.005, 'warmup_steps': 26, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6127,1.414183,0.81078,0.816981,0.812042,0.810229
2,0.9834,1.174002,0.855505,0.855948,0.855161,0.855333
3,0.625,1.347215,0.837156,0.840911,0.836175,0.836381
4,0.4185,1.227529,0.84633,0.84646,0.84611,0.846214
5,0.2869,1.269041,0.850917,0.851621,0.850488,0.85069
6,0.2035,1.195365,0.87156,0.871589,0.871432,0.871492
7,0.1492,1.30448,0.853211,0.853534,0.852909,0.85306
8,0.1137,1.269166,0.865826,0.866233,0.865507,0.865677
9,0.0905,1.242854,0.862385,0.862328,0.862381,0.86235
10,0.0728,1.295366,0.862385,0.862846,0.862044,0.862222


[I 2025-03-23 10:53:51,002] Trial 65 finished with value: 0.8599473407056346 and parameters: {'learning_rate': 0.001600342932754346, 'weight_decay': 0.005, 'warmup_steps': 26, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}. Best is trial 54 with value: 0.878417341892034.


Trial 66 with params: {'learning_rate': 0.000647415663556665, 'weight_decay': 0.006, 'warmup_steps': 30, 'lambda_param': 0.5, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7611,1.519292,0.791284,0.793786,0.792098,0.791099
2,1.2225,1.315135,0.836009,0.837662,0.835344,0.835571
3,0.9309,1.304929,0.84289,0.848021,0.841763,0.841957
4,0.7174,1.453425,0.830275,0.833611,0.829334,0.829521
5,0.5677,1.325385,0.854358,0.854381,0.854498,0.854348
6,0.4508,1.250218,0.868119,0.868067,0.868096,0.86808
7,0.3681,1.350271,0.855505,0.856385,0.855035,0.855258
8,0.3033,1.288797,0.857798,0.857912,0.858003,0.857795
9,0.2531,1.437593,0.852064,0.853326,0.852625,0.852031
10,0.2126,1.380902,0.860092,0.861273,0.860634,0.860065


[I 2025-03-23 10:56:50,099] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.0007547621663272352, 'weight_decay': 0.008, 'warmup_steps': 27, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.751,1.667629,0.783257,0.797527,0.785226,0.78137
2,1.1904,1.322185,0.826835,0.827152,0.826503,0.826643
3,0.8893,1.365589,0.838303,0.843748,0.837133,0.837284
4,0.6618,1.355145,0.829128,0.831715,0.828292,0.828495
5,0.5092,1.27241,0.849771,0.849724,0.849825,0.849747
6,0.3947,1.304573,0.853211,0.854082,0.852741,0.85296
7,0.3175,1.437291,0.84633,0.847945,0.845689,0.845938
8,0.2544,1.332402,0.861239,0.861351,0.861044,0.861142
9,0.2126,1.347402,0.857798,0.857873,0.857624,0.857708
10,0.173,1.288323,0.868119,0.868362,0.868391,0.868119


[I 2025-03-23 11:01:20,687] Trial 67 finished with value: 0.8553760128891628 and parameters: {'learning_rate': 0.0007547621663272352, 'weight_decay': 0.008, 'warmup_steps': 27, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}. Best is trial 54 with value: 0.878417341892034.


Trial 68 with params: {'learning_rate': 0.0003413970368636955, 'weight_decay': 0.001, 'warmup_steps': 30, 'lambda_param': 0.2, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8855,1.55228,0.792431,0.795617,0.793351,0.792169
2,1.4005,1.417761,0.821101,0.825643,0.819988,0.82007
3,1.186,1.374494,0.822248,0.829037,0.820904,0.820851
4,1.0272,1.46469,0.813073,0.823034,0.811432,0.811015
5,0.8763,1.365501,0.837156,0.837346,0.837396,0.837155
6,0.7471,1.384357,0.837156,0.837544,0.83748,0.837155
7,0.6527,1.493006,0.831422,0.832131,0.831849,0.831411
8,0.5768,1.497814,0.838303,0.839035,0.837848,0.838042
9,0.5138,1.489144,0.849771,0.850837,0.850288,0.849747
10,0.4641,1.510633,0.852064,0.856436,0.853088,0.851826


[I 2025-03-23 11:04:16,735] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 7.808255793137976e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 28, 'lambda_param': 0.8, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3652,1.691832,0.774083,0.774301,0.774322,0.774082
2,1.6431,1.62345,0.787844,0.788651,0.787288,0.787419
3,1.5564,1.571733,0.791284,0.792521,0.790625,0.790752
4,1.5014,1.523885,0.794725,0.796078,0.794045,0.794177
5,1.4508,1.493986,0.790138,0.790091,0.790004,0.790038


[I 2025-03-23 11:05:55,918] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.0033167095265479833, 'weight_decay': 0.007, 'warmup_steps': 31, 'lambda_param': 0.7000000000000001, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5228,1.363171,0.826835,0.829696,0.827682,0.826669
2,0.7904,1.317434,0.834862,0.837948,0.833965,0.834178
3,0.4523,1.235515,0.845183,0.845351,0.844942,0.845056
4,0.2739,1.30621,0.849771,0.853201,0.848857,0.849126
5,0.1738,1.319507,0.847477,0.848244,0.847026,0.847231


[I 2025-03-23 11:07:25,479] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.0007072122019403161, 'weight_decay': 0.004, 'warmup_steps': 41, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7731,1.624895,0.793578,0.803889,0.795234,0.792389
2,1.2032,1.28293,0.833716,0.83443,0.83326,0.833447
3,0.8994,1.276843,0.84633,0.848682,0.845563,0.845823
4,0.6752,1.437952,0.831422,0.837557,0.830165,0.830232
5,0.5232,1.297595,0.852064,0.852561,0.851699,0.851877
6,0.4131,1.213321,0.869266,0.869427,0.869054,0.869167
7,0.3359,1.341014,0.857798,0.859492,0.857161,0.857435
8,0.2726,1.285615,0.863532,0.864535,0.863044,0.863286
9,0.2261,1.370683,0.856651,0.858356,0.857298,0.856597
10,0.1866,1.316186,0.866972,0.866955,0.866886,0.866916


[I 2025-03-23 11:12:19,807] Trial 71 finished with value: 0.8622628694182501 and parameters: {'learning_rate': 0.0007072122019403161, 'weight_decay': 0.004, 'warmup_steps': 41, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 54 with value: 0.878417341892034.


Trial 72 with params: {'learning_rate': 0.0007326143061430178, 'weight_decay': 0.006, 'warmup_steps': 42, 'lambda_param': 0.4, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7544,1.595801,0.795872,0.802444,0.797192,0.795198
2,1.2,1.30208,0.837156,0.839972,0.836301,0.836529
3,0.8917,1.333251,0.838303,0.841908,0.837343,0.837559
4,0.6594,1.43821,0.833716,0.838298,0.832628,0.832786
5,0.5161,1.340449,0.861239,0.861219,0.861339,0.861224
6,0.4002,1.309375,0.861239,0.861193,0.861297,0.861216
7,0.3286,1.338954,0.865826,0.867029,0.865296,0.865557
8,0.2622,1.262602,0.875,0.875213,0.874768,0.874897
9,0.2169,1.409239,0.856651,0.856674,0.856793,0.856642
10,0.1798,1.327104,0.864679,0.864644,0.86476,0.864661


[I 2025-03-23 11:17:01,784] Trial 72 finished with value: 0.8600026319252534 and parameters: {'learning_rate': 0.0007326143061430178, 'weight_decay': 0.006, 'warmup_steps': 42, 'lambda_param': 0.4, 'temperature': 5.5}. Best is trial 54 with value: 0.878417341892034.


Trial 73 with params: {'learning_rate': 0.0011884846122620625, 'weight_decay': 0.006, 'warmup_steps': 43, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.707,1.744373,0.788991,0.805838,0.791109,0.78682
2,1.077,1.255452,0.850917,0.850863,0.850951,0.850889
3,0.7377,1.483666,0.832569,0.839394,0.831249,0.831287
4,0.5186,1.285721,0.845183,0.847943,0.844353,0.844609
5,0.3708,1.252769,0.855505,0.856385,0.855035,0.855258
6,0.2717,1.161904,0.862385,0.862537,0.862171,0.862281
7,0.2012,1.233524,0.872706,0.872655,0.872685,0.872669
8,0.1552,1.332707,0.853211,0.856522,0.85232,0.852603
9,0.1219,1.176423,0.868119,0.868173,0.86797,0.868043
10,0.0981,1.185049,0.865826,0.866171,0.866138,0.865826


[I 2025-03-23 11:21:20,116] Trial 73 finished with value: 0.8760816017179309 and parameters: {'learning_rate': 0.0011884846122620625, 'weight_decay': 0.006, 'warmup_steps': 43, 'lambda_param': 0.9, 'temperature': 7.0}. Best is trial 54 with value: 0.878417341892034.


Trial 74 with params: {'learning_rate': 0.000791582291044892, 'weight_decay': 0.005, 'warmup_steps': 39, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7409,1.58545,0.788991,0.798149,0.790562,0.787919
2,1.1671,1.274163,0.836009,0.836462,0.835638,0.835802
3,0.8559,1.304488,0.837156,0.842783,0.835964,0.8361
4,0.6308,1.317854,0.830275,0.833611,0.829334,0.829521
5,0.4857,1.244973,0.862385,0.862537,0.862171,0.862281
6,0.374,1.208326,0.876147,0.876197,0.876316,0.876141
7,0.3012,1.316768,0.857798,0.859267,0.857203,0.857468
8,0.2384,1.307257,0.864679,0.865081,0.865012,0.864678
9,0.1942,1.338471,0.868119,0.870368,0.868854,0.868043
10,0.1589,1.260167,0.873853,0.87405,0.874105,0.873853


[I 2025-03-23 11:25:53,518] Trial 74 finished with value: 0.8726687443021789 and parameters: {'learning_rate': 0.000791582291044892, 'weight_decay': 0.005, 'warmup_steps': 39, 'lambda_param': 0.9, 'temperature': 6.5}. Best is trial 54 with value: 0.878417341892034.


Trial 75 with params: {'learning_rate': 0.0007995609239192307, 'weight_decay': 0.004, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7581,1.719129,0.774083,0.787954,0.776048,0.772116
2,1.1675,1.3285,0.837156,0.837352,0.836891,0.837011
3,0.8486,1.464126,0.832569,0.839856,0.831207,0.831218
4,0.6238,1.389759,0.826835,0.829677,0.825956,0.826143
5,0.4747,1.293777,0.860092,0.86024,0.859876,0.859986
6,0.3666,1.398247,0.853211,0.854652,0.852614,0.85287
7,0.2916,1.204705,0.866972,0.866955,0.866886,0.866916
8,0.2344,1.251326,0.872706,0.872916,0.872474,0.872602
9,0.1912,1.322873,0.869266,0.869462,0.869517,0.869265
10,0.1563,1.331928,0.869266,0.871657,0.870022,0.869183


[I 2025-03-23 11:30:22,938] Trial 75 finished with value: 0.8703376437443334 and parameters: {'learning_rate': 0.0007995609239192307, 'weight_decay': 0.004, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 54 with value: 0.878417341892034.


Trial 76 with params: {'learning_rate': 0.00027193193206210794, 'weight_decay': 0.006, 'warmup_steps': 40, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9522,1.639808,0.780963,0.790647,0.782595,0.779739
2,1.4352,1.429696,0.823394,0.82835,0.82224,0.822314
3,1.2565,1.411462,0.816514,0.823376,0.815147,0.815034
4,1.1143,1.433052,0.819954,0.824658,0.81882,0.818885
5,0.9794,1.336032,0.829128,0.829079,0.82905,0.829063


[I 2025-03-23 11:31:56,879] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.0005111495151492374, 'weight_decay': 0.001, 'warmup_steps': 34, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8017,1.580313,0.791284,0.795881,0.792393,0.790844
2,1.2935,1.349339,0.825688,0.826301,0.82525,0.825423
3,1.0202,1.285811,0.840596,0.840945,0.840269,0.84042
4,0.8216,1.322842,0.834862,0.83662,0.834175,0.834402
5,0.6662,1.396694,0.848624,0.85019,0.849246,0.848573
6,0.5485,1.320291,0.860092,0.861365,0.859539,0.859797
7,0.4615,1.428234,0.849771,0.850141,0.849446,0.849604
8,0.3941,1.395768,0.84633,0.847945,0.845689,0.845938
9,0.3334,1.456542,0.857798,0.857992,0.858045,0.857797
10,0.2897,1.439714,0.857798,0.859392,0.858424,0.85775


[I 2025-03-23 11:35:04,480] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 0.0014695836063348643, 'weight_decay': 0.003, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6563,1.394254,0.809633,0.81174,0.810369,0.809512
2,1.0262,1.144608,0.854358,0.854319,0.854288,0.854302
3,0.6653,1.409974,0.841743,0.848336,0.840469,0.840595
4,0.4568,1.23579,0.841743,0.842414,0.841311,0.841502
5,0.3239,1.133841,0.87156,0.871589,0.871432,0.871492
6,0.2329,1.198749,0.868119,0.868966,0.867675,0.867906
7,0.1735,1.26077,0.863532,0.867487,0.862581,0.862904
8,0.1294,1.23755,0.866972,0.867901,0.866507,0.866745
9,0.1017,1.186582,0.862385,0.862537,0.862171,0.862281
10,0.0825,1.235303,0.869266,0.869559,0.869559,0.869266


[I 2025-03-23 11:39:35,405] Trial 78 finished with value: 0.8667930889169828 and parameters: {'learning_rate': 0.0014695836063348643, 'weight_decay': 0.003, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 7.0}. Best is trial 54 with value: 0.878417341892034.


Trial 79 with params: {'learning_rate': 0.0007545608510372824, 'weight_decay': 0.004, 'warmup_steps': 42, 'lambda_param': 0.8, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7479,1.535388,0.795872,0.800881,0.797024,0.795397
2,1.1875,1.340395,0.825688,0.829608,0.824661,0.824802
3,0.881,1.380251,0.840596,0.847857,0.839259,0.839344
4,0.6606,1.347886,0.831422,0.835261,0.830418,0.830593
5,0.5076,1.313234,0.857798,0.857776,0.857708,0.857738
6,0.3914,1.288518,0.865826,0.865773,0.865802,0.865786
7,0.3157,1.386603,0.854358,0.857523,0.853488,0.853776
8,0.2525,1.249993,0.87156,0.871545,0.871474,0.871505
9,0.2089,1.457615,0.855505,0.855554,0.855666,0.855498
10,0.1723,1.322275,0.873853,0.873853,0.873979,0.873843


[I 2025-03-23 11:44:07,185] Trial 79 finished with value: 0.8611419283942331 and parameters: {'learning_rate': 0.0007545608510372824, 'weight_decay': 0.004, 'warmup_steps': 42, 'lambda_param': 0.8, 'temperature': 6.5}. Best is trial 54 with value: 0.878417341892034.


Trial 80 with params: {'learning_rate': 0.004649502709493366, 'weight_decay': 0.005, 'warmup_steps': 40, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5004,1.205401,0.832569,0.833207,0.832975,0.832561
2,0.7411,1.257427,0.83945,0.843945,0.838385,0.83858
3,0.4222,1.47525,0.824541,0.835504,0.822861,0.822523
4,0.2545,1.555567,0.829128,0.838052,0.827619,0.827489
5,0.1689,1.315115,0.84289,0.844593,0.842227,0.84247
6,0.12,1.240778,0.861239,0.861525,0.86096,0.861105
7,0.0875,1.355029,0.850917,0.850892,0.850825,0.850854
8,0.0684,1.312027,0.854358,0.855152,0.853909,0.854123
9,0.0579,1.310605,0.848624,0.848635,0.848489,0.848544
10,0.0497,1.291991,0.857798,0.857873,0.857624,0.857708


[I 2025-03-23 11:47:06,403] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.0021539060030542584, 'weight_decay': 0.005, 'warmup_steps': 43, 'lambda_param': 0.9, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6,1.388854,0.808486,0.813854,0.809664,0.808019
2,0.8998,1.175535,0.841743,0.844611,0.84089,0.841134
3,0.5423,1.353442,0.844037,0.845864,0.843353,0.843601
4,0.3455,1.218444,0.848624,0.852544,0.847647,0.847904
5,0.2303,1.193514,0.869266,0.869427,0.869054,0.869167
6,0.1568,1.194204,0.863532,0.863995,0.863886,0.86353
7,0.1173,1.20021,0.865826,0.867235,0.865254,0.865528
8,0.0898,1.202078,0.858945,0.858942,0.858834,0.858878
9,0.0717,1.173432,0.865826,0.865792,0.865759,0.865775
10,0.0608,1.180135,0.869266,0.869266,0.86939,0.869255


[I 2025-03-23 11:51:39,149] Trial 81 finished with value: 0.8645927095670483 and parameters: {'learning_rate': 0.0021539060030542584, 'weight_decay': 0.005, 'warmup_steps': 43, 'lambda_param': 0.9, 'temperature': 5.0}. Best is trial 54 with value: 0.878417341892034.


Trial 82 with params: {'learning_rate': 0.0002936482808609518, 'weight_decay': 0.003, 'warmup_steps': 36, 'lambda_param': 0.9, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9198,1.639908,0.78555,0.796962,0.78731,0.784119
2,1.4154,1.410681,0.822248,0.826631,0.821156,0.821255
3,1.2249,1.390956,0.81422,0.821472,0.81281,0.812643
4,1.0708,1.417919,0.813073,0.818396,0.811853,0.811826
5,0.9311,1.346914,0.831422,0.831499,0.831218,0.831305
6,0.8116,1.52121,0.831422,0.836706,0.832565,0.831049
7,0.7131,1.478013,0.830275,0.830464,0.830513,0.830274
8,0.638,1.506802,0.841743,0.841867,0.841521,0.841623
9,0.5754,1.451098,0.845183,0.845794,0.844774,0.844961
10,0.5272,1.585398,0.831422,0.834922,0.832355,0.831209


[I 2025-03-23 11:54:37,261] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 0.001650654935063365, 'weight_decay': 0.006, 'warmup_steps': 35, 'lambda_param': 0.9, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6103,1.379487,0.81422,0.817736,0.815168,0.81397
2,0.9534,1.174614,0.856651,0.856596,0.856624,0.856609
3,0.6092,1.436135,0.841743,0.844324,0.840932,0.841178
4,0.4027,1.412997,0.827982,0.835619,0.826577,0.826522
5,0.2806,1.206388,0.855505,0.856027,0.855877,0.855502
6,0.1935,1.189163,0.864679,0.865026,0.864381,0.864539
7,0.1429,1.245974,0.857798,0.859059,0.857245,0.857498
8,0.1094,1.239639,0.861239,0.861351,0.861044,0.861142
9,0.088,1.227389,0.865826,0.865773,0.865802,0.865786
10,0.0726,1.269502,0.865826,0.865906,0.866012,0.865821


[I 2025-03-23 11:59:21,125] Trial 83 finished with value: 0.8633810067369458 and parameters: {'learning_rate': 0.001650654935063365, 'weight_decay': 0.006, 'warmup_steps': 35, 'lambda_param': 0.9, 'temperature': 7.0}. Best is trial 54 with value: 0.878417341892034.


Trial 84 with params: {'learning_rate': 0.0048160834038781805, 'weight_decay': 0.002, 'warmup_steps': 30, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4564,1.277476,0.847477,0.849385,0.848162,0.847405
2,0.7251,1.213601,0.852064,0.854335,0.85132,0.851596
3,0.4038,1.194655,0.853211,0.855109,0.85253,0.852801
4,0.2487,1.292742,0.837156,0.841613,0.836091,0.836274
5,0.1628,1.271537,0.865826,0.866838,0.865338,0.865584
6,0.1143,1.229367,0.858945,0.858942,0.858834,0.858878
7,0.0856,1.241244,0.87156,0.871516,0.871516,0.871516
8,0.0671,1.252536,0.858945,0.859227,0.858666,0.85881
9,0.0578,1.247731,0.868119,0.868062,0.868138,0.86809
10,0.0494,1.256271,0.865826,0.865827,0.865717,0.865762


[I 2025-03-23 12:03:50,087] Trial 84 finished with value: 0.8634672898057032 and parameters: {'learning_rate': 0.0048160834038781805, 'weight_decay': 0.002, 'warmup_steps': 30, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 54 with value: 0.878417341892034.


Trial 85 with params: {'learning_rate': 0.0008152751398082852, 'weight_decay': 0.003, 'warmup_steps': 32, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7323,1.694582,0.770642,0.783481,0.772544,0.768792
2,1.1839,1.326151,0.827982,0.830695,0.827124,0.827319
3,0.8647,1.353783,0.84633,0.852147,0.845142,0.845334
4,0.6349,1.268947,0.841743,0.843795,0.841016,0.841262
5,0.4867,1.256805,0.850917,0.850867,0.850867,0.850867


[I 2025-03-23 12:05:16,323] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 0.004254920491036483, 'weight_decay': 0.006, 'warmup_steps': 36, 'lambda_param': 0.4, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4903,1.238121,0.830275,0.830464,0.830513,0.830274
2,0.7504,1.316113,0.83945,0.843945,0.838385,0.83858
3,0.4267,1.377117,0.838303,0.843349,0.837175,0.837342
4,0.2586,1.364232,0.841743,0.843795,0.841016,0.841262
5,0.169,1.440924,0.849771,0.850547,0.84932,0.849528
6,0.1181,1.336871,0.855505,0.856561,0.854993,0.85523
7,0.0893,1.411646,0.850917,0.851137,0.850657,0.850785
8,0.0705,1.363862,0.848624,0.848624,0.848741,0.848611
9,0.0582,1.356211,0.849771,0.850356,0.850162,0.849766
10,0.0516,1.386142,0.84633,0.846302,0.846236,0.846265


[I 2025-03-23 12:08:32,257] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.00025335316923329827, 'weight_decay': 0.004, 'warmup_steps': 11, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9264,1.665745,0.780963,0.789202,0.782468,0.779956
2,1.4548,1.526412,0.811927,0.821052,0.810348,0.809991
3,1.2818,1.475283,0.807339,0.81682,0.805717,0.805265
4,1.1533,1.408831,0.809633,0.815886,0.808306,0.808176
5,1.022,1.342546,0.821101,0.821871,0.821546,0.821086


[I 2025-03-23 12:09:55,465] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 0.00014231419310604566, 'weight_decay': 0.007, 'warmup_steps': 37, 'lambda_param': 0.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1347,1.640271,0.790138,0.790797,0.790551,0.790124
2,1.5551,1.580746,0.797018,0.800051,0.796045,0.796085
3,1.4482,1.507256,0.805046,0.806786,0.804307,0.804453
4,1.3653,1.432287,0.807339,0.807362,0.807148,0.807217
5,1.281,1.412559,0.809633,0.809575,0.809653,0.809597


[I 2025-03-23 12:11:29,602] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 0.003944572654555935, 'weight_decay': 0.005, 'warmup_steps': 27, 'lambda_param': 0.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4785,1.169513,0.832569,0.834086,0.833186,0.832512
2,0.7487,1.307933,0.83945,0.840807,0.838848,0.839076
3,0.4323,1.220113,0.845183,0.847393,0.844437,0.844693
4,0.2557,1.370595,0.84289,0.842843,0.842942,0.842865
5,0.1661,1.297426,0.852064,0.852849,0.851614,0.851826
6,0.1211,1.230906,0.869266,0.870035,0.868843,0.869067
7,0.0941,1.304408,0.866972,0.867168,0.867222,0.866972
8,0.073,1.230546,0.863532,0.863498,0.863465,0.86348
9,0.0596,1.208559,0.866972,0.866972,0.867096,0.866961
10,0.0509,1.24422,0.863532,0.863475,0.863549,0.863502


[I 2025-03-23 12:16:19,295] Trial 89 finished with value: 0.8622809973045822 and parameters: {'learning_rate': 0.003944572654555935, 'weight_decay': 0.005, 'warmup_steps': 27, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 54 with value: 0.878417341892034.


Trial 90 with params: {'learning_rate': 0.0011948822556739503, 'weight_decay': 0.004, 'warmup_steps': 43, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7026,1.618718,0.798165,0.809719,0.799907,0.796856
2,1.0651,1.24918,0.850917,0.852141,0.850362,0.850603
3,0.7251,1.694828,0.822248,0.839742,0.820146,0.819212
4,0.5085,1.238849,0.853211,0.854257,0.852698,0.852932
5,0.3682,1.188206,0.856651,0.856647,0.85654,0.856583
6,0.2745,1.180642,0.863532,0.864061,0.86317,0.863359
7,0.2078,1.302037,0.861239,0.862326,0.86176,0.861216
8,0.1562,1.258141,0.862385,0.86313,0.86196,0.862176
9,0.1225,1.229235,0.862385,0.862332,0.862423,0.862359
10,0.0978,1.238652,0.866972,0.867168,0.867222,0.866972


[I 2025-03-23 12:21:08,239] Trial 90 finished with value: 0.8749524730461324 and parameters: {'learning_rate': 0.0011948822556739503, 'weight_decay': 0.004, 'warmup_steps': 43, 'lambda_param': 1.0, 'temperature': 5.0}. Best is trial 54 with value: 0.878417341892034.


Trial 91 with params: {'learning_rate': 0.0009859179051415225, 'weight_decay': 0.004, 'warmup_steps': 35, 'lambda_param': 0.9, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7312,1.645066,0.784404,0.798412,0.786352,0.782574
2,1.1257,1.256412,0.84289,0.843054,0.842648,0.842761
3,0.8019,1.605777,0.824541,0.837321,0.822735,0.82225
4,0.5788,1.305759,0.837156,0.841613,0.836091,0.836274
5,0.4272,1.221855,0.858945,0.859337,0.858624,0.858789
6,0.3222,1.349914,0.841743,0.843114,0.841143,0.841375
7,0.2449,1.242283,0.868119,0.868966,0.867675,0.867906
8,0.1913,1.32264,0.865826,0.866121,0.865549,0.865697
9,0.1521,1.359963,0.863532,0.863475,0.863549,0.863502
10,0.1232,1.288778,0.866972,0.867264,0.867264,0.866972


[I 2025-03-23 12:25:52,361] Trial 91 finished with value: 0.8669157698076467 and parameters: {'learning_rate': 0.0009859179051415225, 'weight_decay': 0.004, 'warmup_steps': 35, 'lambda_param': 0.9, 'temperature': 4.5}. Best is trial 54 with value: 0.878417341892034.


Trial 92 with params: {'learning_rate': 0.0006625525893228407, 'weight_decay': 0.006, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7718,1.631017,0.799312,0.807995,0.800823,0.798389
2,1.2256,1.298592,0.823394,0.823387,0.823251,0.823302
3,0.9265,1.327063,0.840596,0.842516,0.83989,0.840132
4,0.7081,1.369981,0.832569,0.835627,0.83167,0.831875
5,0.5616,1.300104,0.858945,0.859097,0.859171,0.858943
6,0.444,1.291189,0.863532,0.864129,0.863928,0.863528
7,0.364,1.46946,0.845183,0.846104,0.844689,0.844904
8,0.2977,1.33883,0.862385,0.863054,0.862802,0.862379
9,0.2492,1.479856,0.849771,0.851027,0.85033,0.849737
10,0.2097,1.341674,0.863532,0.863513,0.863633,0.863518


[I 2025-03-23 12:30:40,043] Trial 92 finished with value: 0.8565968727593261 and parameters: {'learning_rate': 0.0006625525893228407, 'weight_decay': 0.006, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 5.5}. Best is trial 54 with value: 0.878417341892034.


Trial 93 with params: {'learning_rate': 0.0007169580914232985, 'weight_decay': 0.004, 'warmup_steps': 36, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7468,1.50002,0.797018,0.800849,0.798023,0.796691
2,1.1969,1.269661,0.840596,0.840945,0.840269,0.84042
3,0.8915,1.485337,0.827982,0.83713,0.82645,0.826292
4,0.6798,1.308222,0.845183,0.846681,0.844563,0.844806
5,0.5283,1.290577,0.861239,0.861902,0.860834,0.86104
6,0.4125,1.245455,0.87156,0.871504,0.871558,0.871527
7,0.3347,1.324099,0.863532,0.863935,0.863213,0.863381
8,0.27,1.332305,0.858945,0.858942,0.858834,0.858878
9,0.2244,1.344182,0.865826,0.866575,0.866265,0.865817
10,0.1866,1.288785,0.868119,0.868142,0.868264,0.868111


[I 2025-03-23 12:35:12,118] Trial 93 finished with value: 0.8726459593985971 and parameters: {'learning_rate': 0.0007169580914232985, 'weight_decay': 0.004, 'warmup_steps': 36, 'lambda_param': 0.9, 'temperature': 5.5}. Best is trial 54 with value: 0.878417341892034.


Trial 94 with params: {'learning_rate': 0.0011457862795882525, 'weight_decay': 0.004, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7263,1.721533,0.794725,0.805338,0.796403,0.793506
2,1.0905,1.285239,0.847477,0.848408,0.846984,0.847202
3,0.7509,1.440956,0.836009,0.842692,0.834712,0.834787
4,0.5256,1.286019,0.849771,0.849945,0.849531,0.849647
5,0.3801,1.403378,0.845183,0.84544,0.8449,0.845035
6,0.2802,1.263945,0.857798,0.857992,0.858045,0.857797
7,0.2095,1.322345,0.863532,0.863582,0.863381,0.863453
8,0.1586,1.299632,0.857798,0.858526,0.857371,0.857582
9,0.1274,1.268833,0.87156,0.871921,0.871264,0.871427
10,0.102,1.327939,0.865826,0.866171,0.866138,0.865826


[I 2025-03-23 12:40:13,085] Trial 94 finished with value: 0.864644009113535 and parameters: {'learning_rate': 0.0011457862795882525, 'weight_decay': 0.004, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 5.5}. Best is trial 54 with value: 0.878417341892034.


Trial 95 with params: {'learning_rate': 0.003304115005208811, 'weight_decay': 0.003, 'warmup_steps': 31, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5142,1.344285,0.834862,0.842613,0.836238,0.834273
2,0.7902,1.227367,0.847477,0.850878,0.846563,0.846823
3,0.4534,1.168497,0.857798,0.857744,0.857834,0.857771
4,0.2751,1.259383,0.855505,0.859176,0.854572,0.854862
5,0.177,1.311448,0.860092,0.860112,0.85996,0.860018
6,0.1225,1.240025,0.870413,0.870536,0.870222,0.870323
7,0.0928,1.314206,0.862385,0.862435,0.862549,0.862379
8,0.071,1.191013,0.873853,0.874112,0.8736,0.873741
9,0.0593,1.206056,0.872706,0.872676,0.872643,0.872658
10,0.0515,1.22344,0.879587,0.87961,0.879736,0.879579


[I 2025-03-23 12:44:56,033] Trial 95 finished with value: 0.8726178020806601 and parameters: {'learning_rate': 0.003304115005208811, 'weight_decay': 0.003, 'warmup_steps': 31, 'lambda_param': 1.0, 'temperature': 4.5}. Best is trial 54 with value: 0.878417341892034.


Trial 96 with params: {'learning_rate': 0.004407788348570435, 'weight_decay': 0.004, 'warmup_steps': 36, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4714,1.381452,0.826835,0.830959,0.82785,0.826555
2,0.7407,1.37788,0.832569,0.835935,0.831628,0.831825
3,0.4085,1.314218,0.834862,0.841294,0.833586,0.833664
4,0.2455,1.456939,0.84289,0.843977,0.842353,0.842575
5,0.166,1.364434,0.83945,0.839975,0.839059,0.839233
6,0.1138,1.435508,0.838303,0.841908,0.837343,0.837559
7,0.0865,1.289588,0.848624,0.848689,0.848447,0.848527
8,0.0676,1.325343,0.841743,0.842042,0.841437,0.84158
9,0.0578,1.30274,0.845183,0.845137,0.845237,0.845159
10,0.0496,1.297555,0.841743,0.841749,0.841606,0.84166


[I 2025-03-23 12:48:03,409] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.002345295876379191, 'weight_decay': 0.001, 'warmup_steps': 29, 'lambda_param': 0.8, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5644,1.323582,0.824541,0.827986,0.825471,0.824319
2,0.8758,1.169472,0.849771,0.851298,0.849152,0.849404
3,0.5316,1.202678,0.850917,0.852564,0.850278,0.850537
4,0.3336,1.146514,0.855505,0.855577,0.85533,0.855413
5,0.2166,1.196769,0.865826,0.866838,0.865338,0.865584
6,0.1535,1.079553,0.869266,0.869293,0.869138,0.869197
7,0.1077,1.106616,0.870413,0.870469,0.870264,0.870338
8,0.0843,1.14541,0.870413,0.870381,0.870348,0.870364
9,0.0691,1.192481,0.864679,0.865146,0.864339,0.864519
10,0.0583,1.148792,0.869266,0.869517,0.869012,0.86915


[I 2025-03-23 12:52:27,799] Trial 97 finished with value: 0.8679732418046114 and parameters: {'learning_rate': 0.002345295876379191, 'weight_decay': 0.001, 'warmup_steps': 29, 'lambda_param': 0.8, 'temperature': 5.5}. Best is trial 54 with value: 0.878417341892034.


Trial 98 with params: {'learning_rate': 0.001361852545897912, 'weight_decay': 0.006, 'warmup_steps': 42, 'lambda_param': 0.9, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6664,1.424592,0.805046,0.808484,0.805991,0.804783
2,1.0292,1.241117,0.848624,0.848635,0.848489,0.848544
3,0.6816,1.444617,0.838303,0.843349,0.837175,0.837342
4,0.4694,1.271616,0.84289,0.846901,0.84189,0.842117
5,0.332,1.293192,0.84289,0.843493,0.842479,0.842665


[I 2025-03-23 12:54:01,392] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 8.710007471084877e-05, 'weight_decay': 0.01, 'warmup_steps': 23, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3038,1.662037,0.772936,0.773512,0.773322,0.772925
2,1.6192,1.604335,0.791284,0.792521,0.790625,0.790752
3,1.5345,1.55095,0.792431,0.79417,0.791667,0.791774
4,1.4808,1.518308,0.794725,0.796275,0.794003,0.794127
5,1.4279,1.476015,0.798165,0.798295,0.797887,0.797986
6,1.3791,1.520156,0.801606,0.806508,0.802738,0.801166
7,1.3364,1.485762,0.807339,0.81111,0.808327,0.807046
8,1.2999,1.442203,0.807339,0.807275,0.807275,0.807275
9,1.2569,1.454277,0.806193,0.807437,0.805559,0.80572
10,1.2232,1.480879,0.818807,0.821211,0.819588,0.81867


[I 2025-03-23 12:57:12,375] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.0008284065855808625, 'weight_decay': 0.002, 'warmup_steps': 42, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7346,1.533355,0.790138,0.795621,0.791351,0.789577
2,1.1591,1.290315,0.834862,0.836856,0.834133,0.83436
3,0.846,1.457905,0.833716,0.838676,0.832586,0.832728
4,0.6192,1.304707,0.83945,0.842007,0.838638,0.838877
5,0.4763,1.243524,0.858945,0.859405,0.859297,0.858943
6,0.3654,1.283729,0.864679,0.86497,0.86497,0.864679
7,0.2917,1.272379,0.877294,0.877275,0.8774,0.877281
8,0.2303,1.268187,0.869266,0.869517,0.869012,0.86915
9,0.1872,1.284614,0.87156,0.87156,0.871685,0.871549
10,0.1529,1.246232,0.875,0.87547,0.875358,0.874999


[I 2025-03-23 13:01:53,085] Trial 100 finished with value: 0.881824628991491 and parameters: {'learning_rate': 0.0008284065855808625, 'weight_decay': 0.002, 'warmup_steps': 42, 'lambda_param': 0.9, 'temperature': 5.5}. Best is trial 100 with value: 0.881824628991491.


Trial 101 with params: {'learning_rate': 0.0012341656493684006, 'weight_decay': 0.002, 'warmup_steps': 42, 'lambda_param': 0.7000000000000001, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6938,1.428434,0.794725,0.798855,0.795771,0.794355
2,1.0725,1.242946,0.845183,0.84544,0.8449,0.845035
3,0.7337,1.451814,0.841743,0.849766,0.840343,0.8404
4,0.5085,1.238266,0.84289,0.846901,0.84189,0.842117
5,0.3598,1.256525,0.856651,0.85711,0.857003,0.85665
6,0.2638,1.305371,0.860092,0.860381,0.860381,0.860092
7,0.2017,1.217003,0.853211,0.853152,0.853204,0.853173
8,0.1518,1.224568,0.870413,0.870619,0.87018,0.870306
9,0.1209,1.20681,0.87156,0.871504,0.871558,0.871527
10,0.0953,1.275793,0.864679,0.865208,0.865054,0.864676


[I 2025-03-23 13:05:07,624] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.0006658580160669897, 'weight_decay': 0.004, 'warmup_steps': 32, 'lambda_param': 0.8, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7577,1.510435,0.795872,0.798665,0.796729,0.795661
2,1.2216,1.308132,0.834862,0.83662,0.834175,0.834402
3,0.9283,1.314918,0.850917,0.852798,0.850236,0.850501
4,0.7059,1.336488,0.83945,0.842007,0.838638,0.838877
5,0.5562,1.26422,0.853211,0.853261,0.853372,0.853204
6,0.446,1.248611,0.868119,0.868532,0.867801,0.867973
7,0.3661,1.281914,0.866972,0.866916,0.86697,0.866938
8,0.2966,1.241534,0.873853,0.87405,0.874105,0.873853
9,0.2482,1.43944,0.860092,0.861086,0.860592,0.860073
10,0.2059,1.306444,0.870413,0.871168,0.870853,0.870404


[I 2025-03-23 13:10:04,791] Trial 102 finished with value: 0.8726580230102816 and parameters: {'learning_rate': 0.0006658580160669897, 'weight_decay': 0.004, 'warmup_steps': 32, 'lambda_param': 0.8, 'temperature': 6.0}. Best is trial 100 with value: 0.881824628991491.


Trial 103 with params: {'learning_rate': 0.00027009583847554473, 'weight_decay': 0.005, 'warmup_steps': 21, 'lambda_param': 0.9, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9201,1.612992,0.78555,0.792142,0.786889,0.784814
2,1.43,1.442428,0.818807,0.823308,0.817694,0.817763
3,1.252,1.433086,0.816514,0.824783,0.815021,0.814795
4,1.1185,1.368029,0.813073,0.814996,0.812316,0.812481
5,0.9822,1.322772,0.827982,0.828031,0.828134,0.827974


[I 2025-03-23 13:11:44,759] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.0032091538411893345, 'weight_decay': 0.001, 'warmup_steps': 40, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5307,1.358603,0.825688,0.828401,0.826513,0.825533
2,0.7998,1.229884,0.83945,0.842592,0.838554,0.838785
3,0.4641,1.250968,0.847477,0.851209,0.846521,0.846776
4,0.2818,1.272113,0.848624,0.851879,0.847731,0.847997
5,0.1846,1.266355,0.854358,0.856649,0.853614,0.853896
6,0.1236,1.2461,0.852064,0.855524,0.851151,0.851429
7,0.0925,1.207708,0.856651,0.856929,0.856372,0.856514
8,0.0747,1.199353,0.857798,0.857776,0.857708,0.857738
9,0.0615,1.116852,0.857798,0.857873,0.857624,0.857708
10,0.0537,1.180837,0.857798,0.857817,0.857666,0.857723


[I 2025-03-23 13:14:42,105] Trial 104 pruned. 


Trial 105 with params: {'learning_rate': 0.0005385042465587333, 'weight_decay': 0.001, 'warmup_steps': 41, 'lambda_param': 0.9, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.804,1.623813,0.786697,0.792704,0.787973,0.786049
2,1.2827,1.294761,0.831422,0.83283,0.830797,0.831011
3,1.0103,1.321977,0.84289,0.846232,0.841974,0.842216
4,0.8121,1.338438,0.830275,0.830654,0.829923,0.830074
5,0.6462,1.337238,0.847477,0.847573,0.847278,0.847371


[I 2025-03-23 13:16:11,422] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 0.000925573060565197, 'weight_decay': 0.003, 'warmup_steps': 29, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7033,1.586984,0.794725,0.803785,0.796276,0.793715
2,1.1442,1.290482,0.831422,0.835261,0.830418,0.830593
3,0.8241,1.518253,0.838303,0.844594,0.837048,0.837162
4,0.5914,1.256016,0.844037,0.845636,0.843395,0.843639
5,0.4501,1.304944,0.856651,0.856632,0.85675,0.856636
6,0.3404,1.26403,0.860092,0.86117,0.859582,0.859826
7,0.2685,1.310092,0.856651,0.856758,0.856456,0.856552
8,0.2102,1.323417,0.869266,0.869517,0.869012,0.86915
9,0.1707,1.334152,0.864679,0.866074,0.865265,0.864644
10,0.1365,1.329615,0.870413,0.870567,0.870643,0.870411


[I 2025-03-23 13:21:07,654] Trial 106 finished with value: 0.8623498397764764 and parameters: {'learning_rate': 0.000925573060565197, 'weight_decay': 0.003, 'warmup_steps': 29, 'lambda_param': 0.9, 'temperature': 5.5}. Best is trial 100 with value: 0.881824628991491.


Trial 107 with params: {'learning_rate': 0.0002759444112521778, 'weight_decay': 0.004, 'warmup_steps': 42, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9434,1.571161,0.786697,0.790275,0.787678,0.786372
2,1.4467,1.407418,0.819954,0.822442,0.819115,0.819287
3,1.2606,1.43597,0.805046,0.814443,0.803423,0.802947
4,1.1159,1.410947,0.818807,0.821696,0.817904,0.818057
5,0.9754,1.32152,0.834862,0.834806,0.834891,0.834831


[I 2025-03-23 13:22:29,074] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 0.004186759877239569, 'weight_decay': 0.005, 'warmup_steps': 22, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4835,1.301946,0.833716,0.836621,0.834565,0.833556
2,0.7424,1.294725,0.829128,0.832294,0.828208,0.828395
3,0.42,1.260868,0.847477,0.848408,0.846984,0.847202
4,0.2527,1.392244,0.84289,0.845921,0.842016,0.842262
5,0.1619,1.24411,0.856651,0.856596,0.856624,0.856609
6,0.1132,1.316333,0.848624,0.84905,0.848278,0.848444
7,0.0837,1.277533,0.866972,0.867581,0.866591,0.866793
8,0.0679,1.2564,0.862385,0.863671,0.861834,0.862095
9,0.0575,1.217621,0.864679,0.865026,0.864381,0.864539
10,0.0488,1.259625,0.861239,0.861902,0.860834,0.86104


[I 2025-03-23 13:25:36,450] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.00046978105272097034, 'weight_decay': 0.003, 'warmup_steps': 42, 'lambda_param': 0.7000000000000001, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8404,1.609097,0.788991,0.795439,0.790309,0.788295
2,1.3224,1.325609,0.825688,0.826171,0.825293,0.825453
3,1.0622,1.328401,0.836009,0.83867,0.835175,0.835401
4,0.8716,1.290733,0.831422,0.831652,0.831134,0.83126
5,0.7122,1.342394,0.855505,0.856079,0.855119,0.85531
6,0.5892,1.334023,0.862385,0.862351,0.862465,0.862367
7,0.5029,1.398952,0.844037,0.845864,0.843353,0.843601
8,0.4335,1.421436,0.848624,0.849319,0.848194,0.848393
9,0.375,1.448328,0.853211,0.854193,0.853709,0.853192
10,0.3278,1.491797,0.856651,0.859405,0.857466,0.856533


[I 2025-03-23 13:28:40,750] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.0003413091540356496, 'weight_decay': 0.004, 'warmup_steps': 38, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8772,1.589204,0.791284,0.796232,0.792435,0.790799
2,1.389,1.363304,0.827982,0.829686,0.827292,0.827502
3,1.1817,1.363447,0.826835,0.833296,0.825535,0.825544
4,1.0161,1.428742,0.819954,0.825828,0.818694,0.818683
5,0.8707,1.337855,0.84289,0.842921,0.842732,0.842799
6,0.7418,1.423338,0.83945,0.839732,0.839732,0.83945
7,0.6474,1.502769,0.83945,0.839839,0.839774,0.839449
8,0.5723,1.510081,0.849771,0.849808,0.849615,0.849683
9,0.508,1.576531,0.845183,0.845909,0.845615,0.845174
10,0.459,1.458484,0.852064,0.852651,0.852456,0.852059


[I 2025-03-23 13:31:43,909] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 0.00041846537118887567, 'weight_decay': 0.005, 'warmup_steps': 32, 'lambda_param': 0.8, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8474,1.70248,0.788991,0.799166,0.790646,0.787775
2,1.3378,1.353289,0.829128,0.829451,0.828797,0.828939
3,1.0927,1.332447,0.827982,0.831937,0.826955,0.827108
4,0.9137,1.329115,0.825688,0.827847,0.824914,0.825113
5,0.7572,1.404964,0.841743,0.842134,0.842069,0.841742


[I 2025-03-23 13:33:09,725] Trial 111 pruned. 


Trial 112 with params: {'learning_rate': 0.0008436878218394436, 'weight_decay': 0.003, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.733,1.640732,0.788991,0.800801,0.790772,0.787542
2,1.1548,1.29565,0.834862,0.835999,0.834302,0.834514
3,0.8425,1.495546,0.837156,0.843204,0.835922,0.836038
4,0.6158,1.298626,0.84289,0.843801,0.842395,0.842607
5,0.4648,1.209856,0.870413,0.870394,0.870517,0.870399
6,0.3573,1.190341,0.875,0.874949,0.874979,0.874963
7,0.2826,1.241289,0.870413,0.870831,0.870096,0.870269
8,0.2235,1.227586,0.875,0.875429,0.874684,0.874862
9,0.1827,1.232053,0.863532,0.864203,0.863128,0.863336
10,0.1483,1.283467,0.870413,0.870656,0.870685,0.870413


[I 2025-03-23 13:38:02,234] Trial 112 finished with value: 0.8806831039654763 and parameters: {'learning_rate': 0.0008436878218394436, 'weight_decay': 0.003, 'warmup_steps': 42, 'lambda_param': 1.0, 'temperature': 5.0}. Best is trial 100 with value: 0.881824628991491.


Trial 113 with params: {'learning_rate': 0.0007206785562237838, 'weight_decay': 0.002, 'warmup_steps': 38, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7497,1.571528,0.792431,0.798336,0.793687,0.791826
2,1.1886,1.262947,0.838303,0.838646,0.837975,0.838124
3,0.8833,1.346166,0.844037,0.846641,0.843226,0.84348
4,0.6697,1.309446,0.834862,0.836398,0.834217,0.834441
5,0.516,1.236841,0.866972,0.866919,0.867012,0.866947
6,0.4042,1.264352,0.864679,0.864679,0.864802,0.864668
7,0.328,1.408348,0.858945,0.861853,0.858119,0.858422
8,0.2651,1.297527,0.862385,0.86298,0.862002,0.8622
9,0.2185,1.386372,0.860092,0.860092,0.860213,0.86008
10,0.1804,1.307653,0.870413,0.870567,0.870643,0.870411


[I 2025-03-23 13:42:53,150] Trial 113 finished with value: 0.8737994368865616 and parameters: {'learning_rate': 0.0007206785562237838, 'weight_decay': 0.002, 'warmup_steps': 38, 'lambda_param': 0.9, 'temperature': 5.5}. Best is trial 100 with value: 0.881824628991491.


Trial 114 with params: {'learning_rate': 0.0008768171427676919, 'weight_decay': 0.001, 'warmup_steps': 32, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7265,1.609304,0.792431,0.801433,0.793982,0.79141
2,1.1586,1.293468,0.832569,0.834309,0.831881,0.832102
3,0.8425,1.391796,0.837156,0.842783,0.835964,0.8361
4,0.6152,1.323534,0.845183,0.846474,0.844605,0.84484
5,0.4617,1.333771,0.840596,0.840757,0.840353,0.840465
6,0.3481,1.297183,0.857798,0.857944,0.857582,0.85769
7,0.2757,1.290248,0.861239,0.861636,0.860918,0.861085
8,0.2168,1.352214,0.862385,0.863294,0.861918,0.86215
9,0.176,1.322809,0.855505,0.85547,0.855582,0.855486
10,0.1415,1.287584,0.865826,0.86578,0.865886,0.865804


[I 2025-03-23 13:47:28,870] Trial 114 finished with value: 0.8669024611044442 and parameters: {'learning_rate': 0.0008768171427676919, 'weight_decay': 0.001, 'warmup_steps': 32, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}. Best is trial 100 with value: 0.881824628991491.


Trial 115 with params: {'learning_rate': 0.0014155659845173267, 'weight_decay': 0.004, 'warmup_steps': 34, 'lambda_param': 0.8, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6339,1.520995,0.806193,0.813615,0.80758,0.805474
2,1.0178,1.247666,0.841743,0.842151,0.841395,0.841556
3,0.6706,1.633717,0.827982,0.837667,0.826408,0.826212
4,0.4611,1.322008,0.848624,0.850734,0.847899,0.848164
5,0.3247,1.329352,0.852064,0.853606,0.851446,0.851704


[I 2025-03-23 13:49:04,375] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.0007005749684251881, 'weight_decay': 0.003, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7554,1.685859,0.795872,0.805724,0.797487,0.794766
2,1.2032,1.299096,0.833716,0.833697,0.833596,0.833637
3,0.9038,1.323072,0.83945,0.843945,0.838385,0.83858
4,0.6837,1.246358,0.837156,0.838126,0.836638,0.836846
5,0.5354,1.317117,0.857798,0.857744,0.857834,0.857771
6,0.4194,1.235455,0.870413,0.870367,0.870474,0.870392
7,0.3421,1.362454,0.857798,0.857944,0.857582,0.85769
8,0.2774,1.30298,0.866972,0.866919,0.867012,0.866947
9,0.2307,1.607801,0.845183,0.848164,0.846036,0.845035
10,0.1932,1.392663,0.861239,0.862521,0.861802,0.861208


[I 2025-03-23 13:53:48,780] Trial 116 finished with value: 0.863491716864498 and parameters: {'learning_rate': 0.0007005749684251881, 'weight_decay': 0.003, 'warmup_steps': 41, 'lambda_param': 1.0, 'temperature': 4.5}. Best is trial 100 with value: 0.881824628991491.


Trial 117 with params: {'learning_rate': 0.001048739893936699, 'weight_decay': 0.004, 'warmup_steps': 43, 'lambda_param': 0.9, 'temperature': 3.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7126,1.490797,0.802752,0.809449,0.804075,0.802102
2,1.1046,1.28885,0.847477,0.849976,0.846689,0.846954
3,0.7668,1.425462,0.845183,0.854083,0.843721,0.843768
4,0.5444,1.281952,0.850917,0.851952,0.850404,0.850634
5,0.4002,1.279012,0.863532,0.86482,0.864097,0.863502
6,0.2967,1.257343,0.866972,0.86722,0.866717,0.866854
7,0.2284,1.359573,0.857798,0.85774,0.857792,0.857762
8,0.1763,1.272956,0.87156,0.872507,0.871095,0.87134
9,0.1412,1.339724,0.860092,0.860429,0.859792,0.859947
10,0.1133,1.347124,0.858945,0.859097,0.859171,0.858943


[I 2025-03-23 13:56:48,801] Trial 117 pruned. 


Trial 118 with params: {'learning_rate': 0.000992458491267661, 'weight_decay': 0.006, 'warmup_steps': 33, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7019,1.640382,0.798165,0.810295,0.799949,0.79678
2,1.1299,1.325056,0.834862,0.835374,0.83447,0.83464
3,0.7972,1.609748,0.832569,0.841869,0.831039,0.830924
4,0.5711,1.228018,0.852064,0.852849,0.851614,0.851826
5,0.4269,1.290709,0.850917,0.850867,0.850867,0.850867


[I 2025-03-23 13:58:29,127] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.00319625284827361, 'weight_decay': 0.002, 'warmup_steps': 30, 'lambda_param': 1.0, 'temperature': 3.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5314,1.22892,0.832569,0.832513,0.832597,0.832537
2,0.8114,1.257753,0.841743,0.847892,0.840511,0.840657
3,0.4602,1.401994,0.845183,0.847393,0.844437,0.844693
4,0.2769,1.350444,0.841743,0.846277,0.840679,0.840886
5,0.1796,1.266306,0.862385,0.863571,0.862928,0.862359
6,0.1271,1.204669,0.861239,0.86284,0.860623,0.8609
7,0.0951,1.156378,0.870413,0.870494,0.870601,0.870409
8,0.0741,1.251377,0.863532,0.864928,0.86296,0.86323
9,0.0627,1.197532,0.862385,0.863383,0.862886,0.862367
10,0.0525,1.172909,0.864679,0.864622,0.864675,0.864644


[I 2025-03-23 14:03:01,873] Trial 119 finished with value: 0.859985680592992 and parameters: {'learning_rate': 0.00319625284827361, 'weight_decay': 0.002, 'warmup_steps': 30, 'lambda_param': 1.0, 'temperature': 3.5}. Best is trial 100 with value: 0.881824628991491.


Trial 120 with params: {'learning_rate': 0.00016104904333464902, 'weight_decay': 0.009000000000000001, 'warmup_steps': 21, 'lambda_param': 0.2, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0677,1.638042,0.786697,0.790275,0.787678,0.786372
2,1.5282,1.588257,0.801606,0.808724,0.80017,0.799879
3,1.4121,1.494233,0.801606,0.804419,0.800676,0.800754
4,1.3197,1.433826,0.809633,0.810633,0.809064,0.809232
5,1.2211,1.389068,0.817661,0.817739,0.817831,0.817655


[I 2025-03-23 14:04:30,603] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 8.532115701682182e-05, 'weight_decay': 0.003, 'warmup_steps': 28, 'lambda_param': 1.0, 'temperature': 2.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3247,1.670693,0.777523,0.77853,0.778037,0.777481
2,1.6362,1.612935,0.794725,0.795895,0.794087,0.794224
3,1.5389,1.544391,0.794725,0.795724,0.794129,0.79427
4,1.4795,1.51159,0.792431,0.793587,0.791793,0.791925
5,1.4286,1.473584,0.799312,0.799409,0.799055,0.799147
6,1.3841,1.520964,0.806193,0.811154,0.807327,0.805763
7,1.3402,1.465778,0.811927,0.814045,0.812663,0.811807
8,1.3064,1.437672,0.808486,0.808422,0.808485,0.808444
9,1.2648,1.450612,0.81078,0.811406,0.810316,0.810474
10,1.2298,1.469744,0.813073,0.81532,0.813831,0.812943


[I 2025-03-23 14:07:33,379] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 0.001079998155617523, 'weight_decay': 0.001, 'warmup_steps': 43, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6978,1.594667,0.799312,0.807995,0.800823,0.798389
2,1.0988,1.250069,0.847477,0.847961,0.84711,0.847284
3,0.7658,1.438573,0.840596,0.846087,0.839427,0.839592
4,0.5362,1.350989,0.84633,0.847729,0.845731,0.845973
5,0.3917,1.301129,0.849771,0.850141,0.849446,0.849604


[I 2025-03-23 14:09:09,281] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.0007461973088531268, 'weight_decay': 0.006, 'warmup_steps': 18, 'lambda_param': 0.1, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7137,1.582642,0.795872,0.802033,0.79715,0.795251
2,1.2073,1.284876,0.836009,0.836732,0.835554,0.835745
3,0.9054,1.328814,0.832569,0.836949,0.831502,0.831662
4,0.679,1.337218,0.833716,0.836639,0.832839,0.833051
5,0.5242,1.274519,0.862385,0.862625,0.862128,0.862263
6,0.4105,1.300053,0.866972,0.868084,0.866465,0.866719
7,0.3332,1.284057,0.861239,0.861351,0.861044,0.861142
8,0.2687,1.264986,0.876147,0.876317,0.875937,0.876053
9,0.2216,1.354055,0.866972,0.867168,0.867222,0.866972
10,0.1809,1.361829,0.864679,0.865681,0.865181,0.864661


[I 2025-03-23 14:13:47,470] Trial 123 finished with value: 0.8657322778688039 and parameters: {'learning_rate': 0.0007461973088531268, 'weight_decay': 0.006, 'warmup_steps': 18, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 100 with value: 0.881824628991491.


Trial 124 with params: {'learning_rate': 0.0007159773089454643, 'weight_decay': 0.004, 'warmup_steps': 35, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7506,1.866576,0.770642,0.799116,0.773428,0.766215
2,1.2114,1.268818,0.831422,0.833042,0.830755,0.830972
3,0.8999,1.262051,0.855505,0.85742,0.854824,0.855101
4,0.6755,1.230291,0.849771,0.850547,0.84932,0.849528
5,0.5281,1.271004,0.854358,0.854399,0.854204,0.854273
6,0.4169,1.304015,0.866972,0.86713,0.866759,0.866872
7,0.3392,1.292337,0.861239,0.862231,0.86075,0.860988
8,0.2726,1.210213,0.876147,0.876105,0.876105,0.876105
9,0.2247,1.337751,0.870413,0.871168,0.870853,0.870404
10,0.1871,1.303185,0.873853,0.874389,0.874232,0.873851


[I 2025-03-23 14:18:54,405] Trial 124 finished with value: 0.876052897574124 and parameters: {'learning_rate': 0.0007159773089454643, 'weight_decay': 0.004, 'warmup_steps': 35, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 100 with value: 0.881824628991491.


Trial 125 with params: {'learning_rate': 0.0003203028928719547, 'weight_decay': 0.002, 'warmup_steps': 19, 'lambda_param': 0.8, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8661,1.563282,0.787844,0.792229,0.78893,0.787419
2,1.4092,1.408138,0.825688,0.829278,0.824703,0.824859
3,1.211,1.359689,0.821101,0.826396,0.819904,0.819941
4,1.0483,1.428404,0.816514,0.822515,0.815231,0.815183
5,0.9002,1.339074,0.827982,0.827925,0.828008,0.827949
6,0.7803,1.304658,0.844037,0.844002,0.84411,0.844016
7,0.6827,1.400739,0.844037,0.844428,0.844363,0.844036
8,0.6103,1.423601,0.849771,0.849945,0.849531,0.849647
9,0.5468,1.432263,0.857798,0.858462,0.858213,0.857791
10,0.4955,1.535037,0.849771,0.853419,0.850709,0.849581


[I 2025-03-23 14:21:55,782] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.0013113286710708246, 'weight_decay': 0.006, 'warmup_steps': 31, 'lambda_param': 0.9, 'temperature': 5.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6495,1.506864,0.793578,0.802364,0.795108,0.792596
2,1.0417,1.232914,0.844037,0.84487,0.843563,0.84377
3,0.6896,1.556762,0.833716,0.837936,0.83267,0.832843
4,0.4753,1.174043,0.856651,0.856929,0.856372,0.856514
5,0.3405,1.158327,0.870413,0.870394,0.870517,0.870399
6,0.2478,1.228429,0.866972,0.867022,0.867138,0.866966
7,0.185,1.261479,0.862385,0.863671,0.861834,0.862095
8,0.1422,1.276558,0.864679,0.864729,0.864844,0.864672
9,0.1118,1.213349,0.863532,0.863685,0.86376,0.86353
10,0.0897,1.304442,0.863532,0.863773,0.863802,0.863532


[I 2025-03-23 14:25:02,943] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.0013516298784623038, 'weight_decay': 0.004, 'warmup_steps': 30, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6438,1.488263,0.797018,0.803403,0.798318,0.796375
2,1.0362,1.15516,0.84633,0.847301,0.846826,0.84631
3,0.688,1.529303,0.829128,0.835211,0.827871,0.827922
4,0.4747,1.203393,0.854358,0.855701,0.853783,0.854035
5,0.3386,1.176994,0.861239,0.861184,0.861213,0.861197
6,0.2407,1.39077,0.852064,0.852849,0.851614,0.851826
7,0.1794,1.200469,0.862385,0.862351,0.862465,0.862367
8,0.1362,1.343721,0.856651,0.857808,0.856119,0.856364
9,0.1076,1.257591,0.868119,0.8681,0.868222,0.868105
10,0.0872,1.289346,0.857798,0.857776,0.857708,0.857738


[I 2025-03-23 14:28:07,481] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 0.0008109285936582722, 'weight_decay': 0.002, 'warmup_steps': 32, 'lambda_param': 1.0, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.736,1.577879,0.793578,0.801884,0.795066,0.792661
2,1.1736,1.290172,0.841743,0.842151,0.841395,0.841556
3,0.862,1.304972,0.831422,0.833268,0.830713,0.830931
4,0.6348,1.309806,0.841743,0.843795,0.841016,0.841262
5,0.4868,1.329221,0.853211,0.853157,0.853246,0.853183
6,0.3735,1.292578,0.863532,0.864203,0.863128,0.863336
7,0.2978,1.323726,0.862385,0.862846,0.862044,0.862222
8,0.2381,1.295564,0.87156,0.872337,0.871137,0.871364
9,0.1962,1.351011,0.858945,0.859287,0.859255,0.858945
10,0.1585,1.352522,0.868119,0.868585,0.868475,0.868118


[I 2025-03-23 14:32:43,902] Trial 128 finished with value: 0.8692103255006183 and parameters: {'learning_rate': 0.0008109285936582722, 'weight_decay': 0.002, 'warmup_steps': 32, 'lambda_param': 1.0, 'temperature': 6.0}. Best is trial 100 with value: 0.881824628991491.


Trial 129 with params: {'learning_rate': 0.0012246212026127275, 'weight_decay': 0.002, 'warmup_steps': 43, 'lambda_param': 1.0, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6975,1.544565,0.793578,0.801419,0.795024,0.792723
2,1.0775,1.27707,0.838303,0.84128,0.837427,0.837657
3,0.732,1.620553,0.827982,0.83822,0.826366,0.826129
4,0.5053,1.371436,0.841743,0.842567,0.841269,0.841473
5,0.3586,1.287465,0.856651,0.856674,0.856793,0.856642
6,0.2677,1.246651,0.862385,0.862366,0.862297,0.862327
7,0.2017,1.325389,0.861239,0.862621,0.860666,0.860931
8,0.1548,1.309668,0.863532,0.865149,0.862918,0.863199
9,0.1225,1.240037,0.862385,0.86258,0.862634,0.862385
10,0.0988,1.305866,0.860092,0.860286,0.860339,0.860091


[I 2025-03-23 14:35:47,583] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.0007431562036212198, 'weight_decay': 0.002, 'warmup_steps': 37, 'lambda_param': 0.9, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7413,1.706432,0.77867,0.79274,0.780637,0.776743
2,1.1932,1.332055,0.838303,0.839366,0.837764,0.837979
3,0.894,1.382528,0.834862,0.842693,0.83346,0.833461
4,0.6714,1.37947,0.837156,0.842377,0.836007,0.83616
5,0.5154,1.282312,0.862385,0.862407,0.862255,0.862313
6,0.3994,1.337669,0.863532,0.863555,0.863676,0.863523
7,0.325,1.332458,0.869266,0.870204,0.868801,0.869043
8,0.2603,1.270702,0.872706,0.873408,0.872306,0.872524
9,0.2131,1.387587,0.862385,0.863993,0.863013,0.862339
10,0.1762,1.266548,0.872706,0.872687,0.872811,0.872693


[I 2025-03-23 14:40:28,519] Trial 130 finished with value: 0.8669024611044442 and parameters: {'learning_rate': 0.0007431562036212198, 'weight_decay': 0.002, 'warmup_steps': 37, 'lambda_param': 0.9, 'temperature': 5.0}. Best is trial 100 with value: 0.881824628991491.


Trial 131 with params: {'learning_rate': 0.0007635029043475869, 'weight_decay': 0.004, 'warmup_steps': 39, 'lambda_param': 0.9, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7509,1.562831,0.795872,0.802444,0.797192,0.795198
2,1.1891,1.298216,0.83945,0.841017,0.838806,0.83904
3,0.8746,1.404648,0.845183,0.851631,0.843932,0.844091
4,0.6505,1.344667,0.832569,0.834543,0.831839,0.83206
5,0.4977,1.330866,0.849771,0.849945,0.849531,0.849647
6,0.386,1.276576,0.865826,0.866233,0.865507,0.865677
7,0.3115,1.402639,0.855505,0.858227,0.854698,0.854989
8,0.2492,1.343039,0.869266,0.869266,0.86939,0.869255
9,0.207,1.335884,0.862385,0.862385,0.862507,0.862374
10,0.1695,1.30304,0.872706,0.873175,0.873063,0.872705


[I 2025-03-23 14:45:09,117] Trial 131 finished with value: 0.8714920314111876 and parameters: {'learning_rate': 0.0007635029043475869, 'weight_decay': 0.004, 'warmup_steps': 39, 'lambda_param': 0.9, 'temperature': 6.0}. Best is trial 100 with value: 0.881824628991491.


Trial 132 with params: {'learning_rate': 0.002174644406261954, 'weight_decay': 0.002, 'warmup_steps': 22, 'lambda_param': 1.0, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5529,1.30278,0.822248,0.824059,0.822925,0.822163
2,0.8881,1.283901,0.845183,0.846903,0.844521,0.84477
3,0.5442,1.279857,0.836009,0.84412,0.834586,0.834582
4,0.3418,1.236156,0.845183,0.845942,0.844731,0.844934
5,0.2209,1.241794,0.853211,0.853211,0.85333,0.853199


[I 2025-03-23 14:46:37,137] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 0.00041197077291861046, 'weight_decay': 0.003, 'warmup_steps': 33, 'lambda_param': 0.8, 'temperature': 6.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8437,1.638548,0.788991,0.795856,0.790351,0.788238
2,1.3538,1.380768,0.823394,0.826638,0.822451,0.82261
3,1.1121,1.357884,0.830275,0.834983,0.829166,0.829297
4,0.9334,1.416447,0.827982,0.834252,0.826703,0.826734
5,0.7727,1.344751,0.850917,0.850858,0.850909,0.850879
6,0.6434,1.309143,0.855505,0.855554,0.855666,0.855498
7,0.5543,1.417593,0.847477,0.8475,0.847615,0.847467
8,0.4837,1.406443,0.852064,0.852008,0.852035,0.85202
9,0.4248,1.424888,0.852064,0.852959,0.852541,0.852048
10,0.3768,1.503909,0.848624,0.85178,0.849499,0.848468


[I 2025-03-23 14:49:55,571] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 6.558978114640059e-05, 'weight_decay': 0.0, 'warmup_steps': 19, 'lambda_param': 0.1, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4542,1.703442,0.775229,0.775904,0.774691,0.774802
2,1.7126,1.618026,0.784404,0.78487,0.783952,0.784075
3,1.5875,1.569369,0.794725,0.795062,0.79434,0.794465
4,1.5291,1.5541,0.793578,0.795436,0.792793,0.792897
5,1.48,1.511362,0.788991,0.788932,0.788878,0.788901


[I 2025-03-23 14:51:26,400] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.0009543631883571671, 'weight_decay': 0.008, 'warmup_steps': 34, 'lambda_param': 0.5, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7289,1.446161,0.81078,0.815083,0.811832,0.810439
2,1.1412,1.305116,0.840596,0.841337,0.840143,0.840339
3,0.8063,1.392996,0.832569,0.839856,0.831207,0.831218
4,0.5804,1.364842,0.840596,0.842283,0.839932,0.840171
5,0.4329,1.323741,0.841743,0.842414,0.841311,0.841502
6,0.3248,1.262786,0.866972,0.868976,0.866296,0.866601
7,0.2477,1.251228,0.862385,0.862625,0.862128,0.862263
8,0.1959,1.259684,0.869266,0.869745,0.868927,0.869111
9,0.1555,1.214447,0.865826,0.865773,0.865802,0.865786
10,0.1267,1.200117,0.869266,0.869352,0.869096,0.869183


[I 2025-03-23 14:56:18,295] Trial 135 finished with value: 0.8645185415360955 and parameters: {'learning_rate': 0.0009543631883571671, 'weight_decay': 0.008, 'warmup_steps': 34, 'lambda_param': 0.5, 'temperature': 6.5}. Best is trial 100 with value: 0.881824628991491.


Trial 136 with params: {'learning_rate': 0.0006976315589789534, 'weight_decay': 0.004, 'warmup_steps': 38, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7607,1.730368,0.793578,0.807345,0.795487,0.791913
2,1.2036,1.308853,0.836009,0.838959,0.835133,0.835354
3,0.9062,1.307049,0.836009,0.836889,0.835512,0.835713
4,0.6821,1.289795,0.84289,0.844167,0.842311,0.842542
5,0.5296,1.29085,0.856651,0.856836,0.856414,0.856533
6,0.4192,1.293462,0.865826,0.866067,0.866096,0.865826
7,0.3402,1.324197,0.862385,0.862407,0.862255,0.862313
8,0.2754,1.327379,0.865826,0.867695,0.86517,0.865467
9,0.2292,1.48752,0.864679,0.866529,0.865349,0.864621
10,0.1901,1.24957,0.866972,0.867168,0.867222,0.866972


[I 2025-03-23 15:00:59,516] Trial 136 finished with value: 0.8749406267968206 and parameters: {'learning_rate': 0.0006976315589789534, 'weight_decay': 0.004, 'warmup_steps': 38, 'lambda_param': 1.0, 'temperature': 6.5}. Best is trial 100 with value: 0.881824628991491.


Trial 137 with params: {'learning_rate': 0.0007665379253614984, 'weight_decay': 0.004, 'warmup_steps': 37, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7437,1.486921,0.797018,0.799686,0.797855,0.796824
2,1.1849,1.305347,0.841743,0.843326,0.8411,0.841339
3,0.88,1.433111,0.836009,0.844629,0.834544,0.83451
4,0.6529,1.286779,0.841743,0.842917,0.841185,0.841409
5,0.4989,1.250071,0.861239,0.861203,0.861171,0.861186
6,0.3878,1.317272,0.860092,0.861801,0.859455,0.859735
7,0.3118,1.293985,0.858945,0.858909,0.858876,0.858891
8,0.2509,1.241584,0.87156,0.872507,0.871095,0.87134
9,0.2057,1.410353,0.860092,0.861926,0.86076,0.860032
10,0.1683,1.343227,0.863532,0.865032,0.864139,0.863492


[I 2025-03-23 15:05:36,560] Trial 137 finished with value: 0.8749919443403237 and parameters: {'learning_rate': 0.0007665379253614984, 'weight_decay': 0.004, 'warmup_steps': 37, 'lambda_param': 0.8, 'temperature': 7.0}. Best is trial 100 with value: 0.881824628991491.


Trial 138 with params: {'learning_rate': 0.0010816547768586666, 'weight_decay': 0.007, 'warmup_steps': 43, 'lambda_param': 1.0, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7013,1.643714,0.788991,0.804484,0.791025,0.787014
2,1.0935,1.262036,0.853211,0.85335,0.852993,0.8531
3,0.7567,1.523916,0.837156,0.843642,0.83588,0.835975
4,0.5304,1.258415,0.845183,0.846903,0.844521,0.84477
5,0.3916,1.259483,0.852064,0.852334,0.851783,0.851922
6,0.2838,1.308003,0.864679,0.86476,0.864507,0.864593
7,0.218,1.286518,0.857798,0.85803,0.85754,0.857672
8,0.1681,1.287269,0.866972,0.867581,0.866591,0.866793
9,0.1343,1.297234,0.860092,0.860381,0.860381,0.860092
10,0.1081,1.283281,0.857798,0.857912,0.858003,0.857795


[I 2025-03-23 15:08:37,095] Trial 138 pruned. 


Trial 139 with params: {'learning_rate': 0.0006217182657224724, 'weight_decay': 0.005, 'warmup_steps': 39, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7713,1.669341,0.797018,0.805646,0.798529,0.796085
2,1.2381,1.302775,0.829128,0.829109,0.829218,0.82911
3,0.945,1.306175,0.83945,0.840807,0.838848,0.839076
4,0.7334,1.346607,0.836009,0.839263,0.835091,0.835305
5,0.5797,1.300088,0.854358,0.854352,0.854246,0.854289
6,0.4633,1.283943,0.856651,0.856695,0.856498,0.856568
7,0.3832,1.305316,0.854358,0.854299,0.854372,0.854325
8,0.3143,1.292796,0.869266,0.869352,0.869096,0.869183
9,0.2628,1.43563,0.854358,0.855433,0.854877,0.854335
10,0.2214,1.371912,0.861239,0.861834,0.861634,0.861234


[I 2025-03-23 15:11:40,667] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.0003691394502842829, 'weight_decay': 0.003, 'warmup_steps': 36, 'lambda_param': 0.8, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8832,1.695774,0.771789,0.786794,0.773838,0.7696
2,1.3792,1.353235,0.826835,0.827262,0.826461,0.826616
3,1.1483,1.3973,0.817661,0.824779,0.816273,0.816152
4,0.9798,1.505777,0.815367,0.825413,0.813726,0.813334
5,0.8337,1.327723,0.844037,0.844007,0.843942,0.84397
6,0.7056,1.412248,0.84289,0.84304,0.843111,0.842888
7,0.6077,1.487511,0.836009,0.836455,0.836354,0.836007
8,0.5363,1.532472,0.830275,0.834617,0.829208,0.829356
9,0.4713,1.54495,0.838303,0.840677,0.839069,0.83819
10,0.4227,1.491362,0.84289,0.84478,0.843574,0.842815


[I 2025-03-23 15:14:44,488] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 0.0010387939056753382, 'weight_decay': 0.004, 'warmup_steps': 39, 'lambda_param': 0.8, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7158,1.487013,0.794725,0.802823,0.796192,0.793844
2,1.1092,1.296057,0.834862,0.837107,0.834091,0.834318
3,0.777,1.436824,0.84289,0.846559,0.841932,0.842167
4,0.5543,1.28989,0.844037,0.845636,0.843395,0.843639
5,0.4098,1.353923,0.858945,0.859538,0.859339,0.85894
6,0.3051,1.313269,0.863532,0.865149,0.862918,0.863199
7,0.2343,1.306784,0.857798,0.85803,0.85754,0.857672
8,0.1841,1.334655,0.864679,0.864625,0.864718,0.864653
9,0.1451,1.3085,0.870413,0.870656,0.870685,0.870413
10,0.1164,1.327052,0.865826,0.865807,0.865928,0.865811


[I 2025-03-23 15:17:46,270] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 0.0021145470495806643, 'weight_decay': 0.007, 'warmup_steps': 34, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5826,1.32705,0.818807,0.820719,0.819504,0.818712
2,0.9002,1.227156,0.849771,0.850892,0.849236,0.84947
3,0.5504,1.394263,0.838303,0.842245,0.837301,0.837508
4,0.3493,1.336993,0.841743,0.845913,0.840722,0.840939
5,0.2283,1.368886,0.853211,0.853607,0.85354,0.85321
6,0.1545,1.245059,0.863532,0.863685,0.86376,0.86353
7,0.1166,1.235145,0.866972,0.867901,0.866507,0.866745
8,0.0878,1.16087,0.875,0.87497,0.874937,0.874952
9,0.0709,1.143235,0.87156,0.871589,0.871432,0.871492
10,0.0593,1.249934,0.858945,0.859927,0.858455,0.858691


[I 2025-03-23 15:20:46,665] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 0.0010435362147662318, 'weight_decay': 0.006, 'warmup_steps': 37, 'lambda_param': 1.0, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7302,1.731168,0.766055,0.786613,0.76846,0.76281
2,1.1218,1.283621,0.84633,0.846542,0.846068,0.846194
3,0.7805,1.48987,0.831422,0.842072,0.829787,0.829566
4,0.5492,1.178175,0.849771,0.850892,0.849236,0.84947
5,0.4051,1.219607,0.858945,0.858909,0.858876,0.858891
6,0.3043,1.33385,0.83945,0.842292,0.838596,0.838831
7,0.2319,1.21328,0.862385,0.862339,0.862339,0.862339
8,0.178,1.286937,0.865826,0.866233,0.865507,0.865677
9,0.1431,1.190892,0.865826,0.866121,0.865549,0.865697
10,0.1145,1.277595,0.866972,0.866928,0.866928,0.866928


[I 2025-03-23 15:25:39,687] Trial 143 finished with value: 0.8645185415360955 and parameters: {'learning_rate': 0.0010435362147662318, 'weight_decay': 0.006, 'warmup_steps': 37, 'lambda_param': 1.0, 'temperature': 4.0}. Best is trial 100 with value: 0.881824628991491.


Trial 144 with params: {'learning_rate': 0.0012094596394122523, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 4.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6068,1.582834,0.802752,0.811767,0.804286,0.801814
2,1.0553,1.185428,0.852064,0.852018,0.85212,0.852041
3,0.7118,1.523023,0.824541,0.835504,0.822861,0.822523
4,0.4909,1.324903,0.837156,0.838499,0.836554,0.836777
5,0.3537,1.293568,0.848624,0.849319,0.848194,0.848393
6,0.2529,1.223029,0.854358,0.854399,0.854204,0.854273
7,0.1915,1.398895,0.854358,0.856389,0.853656,0.853933
8,0.1479,1.342676,0.850917,0.852345,0.85032,0.850571
9,0.1177,1.291592,0.855505,0.855793,0.855793,0.855505
10,0.0947,1.285968,0.862385,0.862728,0.862086,0.862243


[I 2025-03-23 15:30:54,088] Trial 144 finished with value: 0.8633594189272645 and parameters: {'learning_rate': 0.0012094596394122523, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 4.5}. Best is trial 100 with value: 0.881824628991491.


Trial 145 with params: {'learning_rate': 0.00041319815503314906, 'weight_decay': 0.005, 'warmup_steps': 26, 'lambda_param': 1.0, 'temperature': 6.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8297,1.566006,0.791284,0.795542,0.792351,0.790887
2,1.3409,1.354267,0.824541,0.824853,0.824209,0.824347
3,1.0981,1.373812,0.819954,0.825033,0.818778,0.81882
4,0.9144,1.428763,0.822248,0.827765,0.82103,0.821061
5,0.7575,1.344978,0.840596,0.840582,0.840479,0.840521


[I 2025-03-23 15:32:33,296] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.0011607614784531854, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.618,1.585494,0.799312,0.809,0.800907,0.798259
2,1.0697,1.234152,0.847477,0.847419,0.847447,0.847432
3,0.7324,1.406377,0.837156,0.841613,0.836091,0.836274
4,0.5074,1.283897,0.838303,0.84128,0.837427,0.837657
5,0.37,1.317016,0.84633,0.846876,0.845942,0.846123
6,0.2681,1.33339,0.861239,0.862418,0.860708,0.86096
7,0.2069,1.343971,0.855505,0.855732,0.855245,0.855376
8,0.158,1.343201,0.861239,0.861237,0.861129,0.861173
9,0.1276,1.289908,0.860092,0.860547,0.85975,0.859926
10,0.1007,1.280889,0.865826,0.865773,0.865802,0.865786


[I 2025-03-23 15:37:51,490] Trial 146 finished with value: 0.8736831015592077 and parameters: {'learning_rate': 0.0011607614784531854, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 2.5}. Best is trial 100 with value: 0.881824628991491.


Trial 147 with params: {'learning_rate': 0.0013725464708098966, 'weight_decay': 0.0, 'warmup_steps': 9, 'lambda_param': 1.0, 'temperature': 2.5}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6048,1.454405,0.809633,0.812809,0.810537,0.809407
2,1.0376,1.212108,0.850917,0.851236,0.850615,0.850764
3,0.693,1.519691,0.836009,0.84182,0.834796,0.834915
4,0.4753,1.287065,0.849771,0.851524,0.84911,0.849369
5,0.3379,1.188909,0.869266,0.869623,0.868969,0.869131
6,0.238,1.254845,0.860092,0.86068,0.859708,0.859903
7,0.1776,1.317344,0.855505,0.855577,0.85533,0.855413
8,0.1328,1.229555,0.868119,0.8682,0.868306,0.868115
9,0.1065,1.177033,0.870413,0.870656,0.870685,0.870413
10,0.0865,1.25204,0.855505,0.855619,0.855709,0.855502


[I 2025-03-23 15:41:19,441] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.003199645143713299, 'weight_decay': 0.007, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 5.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4524,1.282989,0.826835,0.830298,0.827766,0.826616
2,0.7844,1.278039,0.841743,0.844052,0.840974,0.841221
3,0.4459,1.228873,0.850917,0.851349,0.850573,0.850741
4,0.2652,1.370055,0.84289,0.848427,0.841721,0.8419
5,0.173,1.403558,0.840596,0.843909,0.83968,0.839912


[I 2025-03-23 15:42:54,745] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.0006627853536758961, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.9, 'temperature': 4.0}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7281,1.679653,0.795872,0.805724,0.797487,0.794766
2,1.2296,1.278475,0.830275,0.830239,0.830176,0.830203
3,0.9441,1.382887,0.833716,0.840343,0.832418,0.832476
4,0.7258,1.369946,0.837156,0.840583,0.836217,0.836432
5,0.5722,1.327224,0.856651,0.85711,0.857003,0.85665
6,0.4557,1.389737,0.853211,0.853162,0.853162,0.853162
7,0.3743,1.450515,0.84289,0.844167,0.842311,0.842542
8,0.3029,1.389971,0.864679,0.86587,0.865223,0.864653
9,0.2531,1.570562,0.848624,0.850917,0.849373,0.848527
10,0.2126,1.394859,0.863532,0.86482,0.864097,0.863502


[I 2025-03-23 15:48:07,837] Trial 149 finished with value: 0.8703062418477439 and parameters: {'learning_rate': 0.0006627853536758961, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.9, 'temperature': 4.0}. Best is trial 100 with value: 0.881824628991491.


In [None]:
print(best_trial2)

BestRun(run_id='100', objective=0.881824628991491, hyperparameters={'learning_rate': 0.0008284065855808625, 'weight_decay': 0.002, 'warmup_steps': 42, 'lambda_param': 0.9, 'temperature': 5.5}, run_summary=None)


In [None]:
#Nápočet epoch na steps
data_length = len(all_train_data)
min_r = math.ceil(data_length/batch_size)*5
max_r = math.ceil(data_length/batch_size)*num_epochs
warm_up = math.ceil(data_length/batch_size/10)

In [None]:
base.reset_seed()

In [35]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base_aug_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-base_aug_hp-search", epochs=num_epochs, batch_size=batch_size)

In [36]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 5e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [37]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [38]:
trainer = Trainer(
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
)
  

In [None]:
best_trial3 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Base-aug",
    n_trials=150
)

[I 2025-03-23 15:48:08,187] A new study created in memory with name: Base-aug


Trial 0 with params: {'learning_rate': 0.0002805758207667253, 'weight_decay': 0.01, 'warmup_steps': 305}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3011,0.412027,0.833716,0.838676,0.832586,0.832728
2,0.1684,0.438308,0.847477,0.847738,0.847194,0.847331
3,0.1074,0.616542,0.831422,0.833508,0.830671,0.830888
4,0.0733,0.642944,0.83945,0.839395,0.839395,0.839395
5,0.053,0.727546,0.852064,0.852087,0.852204,0.852055


[I 2025-03-23 15:59:40,554] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.0007875660249889869, 'weight_decay': 0.001, 'warmup_steps': 65}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2273,0.453908,0.854358,0.85532,0.853867,0.854095
2,0.0842,0.606551,0.858945,0.858887,0.858961,0.858914
3,0.047,0.776588,0.850917,0.850931,0.850783,0.850839
4,0.0306,0.867838,0.844037,0.843976,0.844026,0.843996
5,0.0216,1.039238,0.852064,0.853753,0.852709,0.852008
6,0.0163,1.193835,0.848624,0.848589,0.848699,0.848604
7,0.0124,1.30184,0.84633,0.846846,0.8467,0.846327
8,0.0097,1.546766,0.857798,0.857873,0.857624,0.857708
9,0.0077,1.498287,0.858945,0.858991,0.858792,0.858863
10,0.0061,1.719121,0.84633,0.846279,0.846279,0.846279


[I 2025-03-23 16:23:00,482] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 6.533369619026643e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 251}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3703,0.423267,0.801606,0.801566,0.801476,0.801511
2,0.2906,0.413528,0.826835,0.827538,0.827261,0.826824
3,0.2465,0.486361,0.806193,0.82131,0.80417,0.803116
4,0.2138,0.437856,0.833716,0.83558,0.833007,0.833231
5,0.1872,0.466014,0.831422,0.837557,0.830165,0.830232


[I 2025-03-23 16:34:24,029] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0013035123791853842, 'weight_decay': 0.0, 'warmup_steps': 405}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2173,0.484506,0.84633,0.847343,0.845815,0.846038
2,0.072,0.605693,0.84289,0.842831,0.842858,0.842843
3,0.0403,0.74934,0.855505,0.855793,0.855793,0.855505
4,0.0265,0.888367,0.848624,0.848597,0.848531,0.848559
5,0.0193,1.171406,0.848624,0.849141,0.848994,0.848621
6,0.0147,1.07717,0.850917,0.850858,0.850909,0.850879
7,0.0113,1.220805,0.850917,0.850967,0.851078,0.85091
8,0.0092,1.276455,0.845183,0.845125,0.845152,0.845138
9,0.0073,1.549389,0.853211,0.853498,0.853498,0.853211
10,0.006,1.658733,0.841743,0.841934,0.841985,0.841742


[I 2025-03-23 16:57:27,906] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.002311294500510415, 'weight_decay': 0.002, 'warmup_steps': 76}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1828,0.43749,0.853211,0.853226,0.853077,0.853134
2,0.063,0.573769,0.850917,0.850867,0.850867,0.850867
3,0.0393,0.678781,0.857798,0.857744,0.857834,0.857771
4,0.0284,0.875998,0.852064,0.852087,0.852204,0.852055
5,0.022,0.873097,0.848624,0.849781,0.849162,0.848595


[I 2025-03-23 17:08:58,400] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.00011635338541918901, 'weight_decay': 0.003, 'warmup_steps': 219}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.343,0.399861,0.819954,0.820101,0.820167,0.819952
2,0.2426,0.403824,0.832569,0.832533,0.83247,0.832497
3,0.1916,0.459668,0.824541,0.824853,0.824209,0.824347
4,0.1557,0.460531,0.844037,0.84434,0.843732,0.843876
5,0.1282,0.500603,0.827982,0.828162,0.827713,0.827829
6,0.1073,0.575816,0.84633,0.846639,0.846026,0.846172
7,0.0907,0.662587,0.826835,0.827399,0.827219,0.826829
8,0.0775,0.690337,0.840596,0.841061,0.840227,0.840395
9,0.0663,0.791153,0.83945,0.840244,0.839901,0.839436
10,0.0578,0.79112,0.841743,0.841801,0.841564,0.841642


[I 2025-03-23 17:31:52,762] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.0003654769917956456, 'weight_decay': 0.003, 'warmup_steps': 255}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2815,0.401447,0.853211,0.856852,0.852278,0.852559
2,0.1409,0.454217,0.853211,0.853534,0.852909,0.85306
3,0.0831,0.609988,0.838303,0.838891,0.83789,0.838071
4,0.0549,0.635425,0.855505,0.855647,0.855288,0.855395
5,0.0389,0.715678,0.84633,0.847301,0.846826,0.84631


[I 2025-03-23 17:43:07,707] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 9.505122659935192e-05, 'weight_decay': 0.003, 'warmup_steps': 153}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3494,0.408116,0.817661,0.818217,0.818041,0.817655
2,0.2581,0.401724,0.832569,0.832533,0.83247,0.832497
3,0.2085,0.452596,0.823394,0.82835,0.82224,0.822314
4,0.1746,0.456872,0.834862,0.836398,0.834217,0.834441
5,0.1483,0.454725,0.84633,0.846279,0.846279,0.846279
6,0.1277,0.522993,0.83945,0.83965,0.839185,0.839307
7,0.1108,0.565484,0.833716,0.835344,0.834354,0.833652
8,0.0971,0.609358,0.836009,0.836163,0.835765,0.835874
9,0.0847,0.672183,0.838303,0.838283,0.838396,0.838286
10,0.0758,0.676302,0.84633,0.846542,0.846068,0.846194


[I 2025-03-23 18:17:32,657] Trial 7 finished with value: 0.8382548914108758 and parameters: {'learning_rate': 9.505122659935192e-05, 'weight_decay': 0.003, 'warmup_steps': 153}. Best is trial 7 with value: 0.8382548914108758.


Trial 8 with params: {'learning_rate': 0.00040842279473800845, 'weight_decay': 0.008, 'warmup_steps': 83}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2699,0.430878,0.836009,0.843628,0.834628,0.834652
2,0.1299,0.505233,0.858945,0.858991,0.858792,0.858863
3,0.0747,0.60418,0.84633,0.846542,0.846068,0.846194
4,0.0492,0.741194,0.844037,0.847235,0.843142,0.843391
5,0.0347,0.758176,0.855505,0.856321,0.855961,0.855492
6,0.0254,1.111431,0.834862,0.834807,0.834807,0.834807
7,0.0189,1.080665,0.84633,0.846522,0.846573,0.846329
8,0.0145,1.339333,0.836009,0.835962,0.836059,0.835983
9,0.0113,1.417455,0.852064,0.852302,0.85233,0.852064
10,0.0086,1.733423,0.849771,0.850397,0.849362,0.849555


[I 2025-03-23 18:52:00,348] Trial 8 finished with value: 0.8416756571849591 and parameters: {'learning_rate': 0.00040842279473800845, 'weight_decay': 0.008, 'warmup_steps': 83}. Best is trial 8 with value: 0.8416756571849591.


Trial 9 with params: {'learning_rate': 0.0005338741354740678, 'weight_decay': 0.006, 'warmup_steps': 19}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.249,0.465995,0.84289,0.849739,0.841595,0.841719
2,0.1067,0.541262,0.870413,0.870469,0.870264,0.870338
3,0.0596,0.673062,0.854358,0.854438,0.85454,0.854353
4,0.0388,0.754466,0.854358,0.854739,0.854035,0.854197
5,0.0271,0.85292,0.855505,0.85547,0.855582,0.855486
6,0.0197,1.192678,0.845183,0.845141,0.84511,0.845125
7,0.0152,1.147074,0.860092,0.860206,0.860297,0.860089
8,0.0115,1.327165,0.84289,0.844167,0.842311,0.842542
9,0.0089,1.586243,0.847477,0.847738,0.847194,0.847331
10,0.0074,1.52781,0.844037,0.843981,0.844068,0.844007


[I 2025-03-23 19:14:59,776] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 6.888788881730778e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 39}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3611,0.424359,0.798165,0.798295,0.797887,0.797986
2,0.2861,0.407897,0.836009,0.835963,0.835933,0.835947
3,0.2415,0.408982,0.838303,0.838761,0.837933,0.838098
4,0.2083,0.422065,0.837156,0.837552,0.836806,0.836963
5,0.1821,0.460848,0.838303,0.842245,0.837301,0.837508
6,0.1626,0.484712,0.834862,0.839993,0.835986,0.834514
7,0.146,0.580986,0.825688,0.829608,0.824661,0.824802
8,0.1336,0.525179,0.840596,0.841337,0.840143,0.840339
9,0.121,0.55671,0.838303,0.83833,0.838143,0.838209
10,0.1114,0.577026,0.837156,0.837273,0.836933,0.837033


[I 2025-03-23 19:38:15,884] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 8.238154754398708e-05, 'weight_decay': 0.003, 'warmup_steps': 29}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3521,0.418676,0.809633,0.809782,0.809358,0.809464
2,0.2699,0.409421,0.831422,0.831865,0.831765,0.83142
3,0.2221,0.46288,0.822248,0.832002,0.820651,0.820377
4,0.1891,0.450557,0.833716,0.83608,0.832923,0.833145
5,0.1631,0.44379,0.856651,0.857808,0.856119,0.856364
6,0.1425,0.470629,0.840596,0.840684,0.840395,0.840485
7,0.1261,0.559589,0.836009,0.838137,0.835259,0.83549
8,0.1131,0.556765,0.845183,0.845794,0.844774,0.844961
9,0.1006,0.607954,0.84633,0.84634,0.846194,0.846249
10,0.0911,0.610498,0.83945,0.83957,0.839227,0.839328


[I 2025-03-23 20:01:37,080] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.0004229895735463087, 'weight_decay': 0.009000000000000001, 'warmup_steps': 123}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2682,0.418231,0.840596,0.84834,0.839217,0.839277
2,0.1261,0.487402,0.84633,0.847729,0.845731,0.845973
3,0.0718,0.601733,0.84289,0.84283,0.8429,0.842855
4,0.0468,0.654091,0.849771,0.851765,0.849068,0.849333
5,0.0329,0.79094,0.848624,0.852781,0.849625,0.848393
6,0.024,0.912897,0.860092,0.860429,0.859792,0.859947
7,0.0181,0.984824,0.860092,0.860112,0.85996,0.860018
8,0.0139,1.141404,0.854358,0.855915,0.85374,0.854003
9,0.0107,1.358162,0.849771,0.849922,0.849994,0.849769
10,0.0083,1.539561,0.845183,0.845277,0.844984,0.845076


[I 2025-03-23 20:25:07,071] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.002704032693225816, 'weight_decay': 0.008, 'warmup_steps': 159}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1835,0.526417,0.853211,0.854257,0.852698,0.852932
2,0.0651,0.6223,0.858945,0.859405,0.859297,0.858943
3,0.0431,0.743694,0.84633,0.846723,0.846657,0.846329
4,0.0311,0.818214,0.850917,0.850883,0.850994,0.850898
5,0.0246,1.07538,0.847477,0.849385,0.848162,0.847405


[I 2025-03-23 20:36:49,947] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 5.716528877895461e-05, 'weight_decay': 0.0, 'warmup_steps': 131}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3709,0.424261,0.797018,0.796956,0.796929,0.796941
2,0.2988,0.417664,0.829128,0.829458,0.829429,0.829128
3,0.2573,0.414749,0.838303,0.838636,0.838606,0.838303
4,0.2262,0.426852,0.837156,0.837961,0.83668,0.836878
5,0.2004,0.441822,0.832569,0.835334,0.831713,0.831924
6,0.1814,0.465714,0.836009,0.836455,0.836354,0.836007
7,0.1648,0.564814,0.823394,0.829991,0.822072,0.822043
8,0.1522,0.504069,0.838303,0.83846,0.838059,0.83817
9,0.1403,0.533546,0.836009,0.836248,0.835722,0.835852
10,0.1308,0.546352,0.837156,0.837205,0.837312,0.837148


[I 2025-03-23 21:00:45,489] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.00012116148911900525, 'weight_decay': 0.006, 'warmup_steps': 166}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3379,0.396329,0.816514,0.816563,0.816662,0.816505
2,0.2376,0.423357,0.821101,0.824364,0.822009,0.820889
3,0.1863,0.446558,0.845183,0.845172,0.845068,0.84511
4,0.1502,0.457614,0.847477,0.847961,0.84711,0.847284
5,0.123,0.511934,0.832569,0.832954,0.832218,0.83237


[I 2025-03-23 21:12:35,434] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.0003247175832033686, 'weight_decay': 0.004, 'warmup_steps': 149}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2853,0.415061,0.840596,0.84694,0.839343,0.839471
2,0.1518,0.473009,0.844037,0.844007,0.843942,0.84397
3,0.0926,0.626867,0.834862,0.837107,0.834091,0.834318
4,0.0621,0.676571,0.844037,0.846931,0.843184,0.843436
5,0.0439,0.708323,0.850917,0.851204,0.851204,0.850917
6,0.0323,0.940145,0.848624,0.848673,0.848783,0.848617
7,0.0246,1.102128,0.852064,0.852018,0.85212,0.852041
8,0.0188,1.161475,0.849771,0.850661,0.850246,0.849755
9,0.0146,1.446864,0.852064,0.852025,0.851993,0.852008
10,0.0114,1.545099,0.847477,0.847436,0.847405,0.847419


[I 2025-03-23 21:47:21,027] Trial 16 finished with value: 0.850916646939688 and parameters: {'learning_rate': 0.0003247175832033686, 'weight_decay': 0.004, 'warmup_steps': 149}. Best is trial 16 with value: 0.850916646939688.


Trial 17 with params: {'learning_rate': 0.0020085822314002493, 'weight_decay': 0.008, 'warmup_steps': 337}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1988,0.489523,0.854358,0.86056,0.853151,0.853386
2,0.0671,0.552435,0.860092,0.860206,0.860297,0.860089
3,0.0402,0.764533,0.856651,0.856695,0.856498,0.856568
4,0.0283,0.850545,0.84633,0.847945,0.845689,0.845938
5,0.0212,0.911578,0.860092,0.860141,0.860255,0.860085
6,0.0168,1.065297,0.84633,0.846522,0.846573,0.846329
7,0.013,1.033969,0.84633,0.84675,0.845984,0.846148
8,0.0109,1.080167,0.864679,0.864729,0.864844,0.864672
9,0.0084,1.368441,0.850917,0.850863,0.850951,0.850889
10,0.0067,1.515102,0.861239,0.861582,0.86155,0.861238


[I 2025-03-23 22:22:23,553] Trial 17 finished with value: 0.8566285614320308 and parameters: {'learning_rate': 0.0020085822314002493, 'weight_decay': 0.008, 'warmup_steps': 337}. Best is trial 17 with value: 0.8566285614320308.


Trial 18 with params: {'learning_rate': 0.001536807544028234, 'weight_decay': 0.007, 'warmup_steps': 335}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2082,0.482685,0.84633,0.848682,0.845563,0.845823
2,0.0688,0.573689,0.848624,0.848816,0.848868,0.848623
3,0.0397,0.794391,0.853211,0.854791,0.853835,0.853162
4,0.0271,0.851177,0.837156,0.837674,0.836764,0.836936
5,0.0196,1.068714,0.849771,0.84985,0.849952,0.849766
6,0.0158,1.212833,0.849771,0.852203,0.850541,0.849666
7,0.012,1.400576,0.855505,0.855505,0.855624,0.855492
8,0.0096,1.363823,0.854358,0.854339,0.854456,0.854342
9,0.0077,1.451606,0.849771,0.849808,0.849615,0.849683
10,0.0062,1.669625,0.854358,0.856054,0.855003,0.854302


[I 2025-03-23 22:57:30,509] Trial 18 finished with value: 0.8589447686231806 and parameters: {'learning_rate': 0.001536807544028234, 'weight_decay': 0.007, 'warmup_steps': 335}. Best is trial 18 with value: 0.8589447686231806.


Trial 19 with params: {'learning_rate': 0.0018637145116967417, 'weight_decay': 0.008, 'warmup_steps': 361}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2024,0.477638,0.849771,0.850036,0.849489,0.849626
2,0.0675,0.536432,0.844037,0.844838,0.844489,0.844024
3,0.0394,0.663816,0.840596,0.840552,0.840522,0.840536
4,0.0274,0.853674,0.847477,0.847648,0.847236,0.847352
5,0.0204,0.946349,0.861239,0.861582,0.86155,0.861238
6,0.0163,1.027634,0.847477,0.847467,0.847363,0.847405
7,0.0129,1.266555,0.834862,0.8348,0.834849,0.83482
8,0.0101,1.411528,0.841743,0.844554,0.842574,0.841602
9,0.0084,1.438645,0.848624,0.848624,0.848741,0.848611
10,0.0066,1.580447,0.847477,0.847458,0.847573,0.847461


[I 2025-03-23 23:32:38,127] Trial 19 finished with value: 0.8520313332412541 and parameters: {'learning_rate': 0.0018637145116967417, 'weight_decay': 0.008, 'warmup_steps': 361}. Best is trial 18 with value: 0.8589447686231806.


Trial 20 with params: {'learning_rate': 0.0022626597514685, 'weight_decay': 0.004, 'warmup_steps': 238}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1888,0.484486,0.855505,0.855948,0.855161,0.855333
2,0.0644,0.667862,0.857798,0.85803,0.85754,0.857672
3,0.0401,0.762785,0.844037,0.84455,0.844405,0.844033
4,0.0287,0.938694,0.844037,0.844576,0.843647,0.843826
5,0.0217,1.001187,0.849771,0.851027,0.85033,0.849737


[I 2025-03-23 23:44:25,433] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.0017470216654098148, 'weight_decay': 0.008, 'warmup_steps': 357}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2051,0.509573,0.852064,0.853016,0.851572,0.851797
2,0.0674,0.567801,0.848624,0.848816,0.848868,0.848623
3,0.0394,0.828458,0.847477,0.848728,0.848036,0.847443
4,0.0275,0.792328,0.847477,0.847436,0.847405,0.847419
5,0.0201,0.907612,0.844037,0.845589,0.844658,0.843984
6,0.0154,1.158066,0.836009,0.836342,0.836312,0.836009
7,0.0125,1.182162,0.852064,0.852698,0.851657,0.851852
8,0.0098,1.375394,0.837156,0.837156,0.83727,0.837142
9,0.008,1.421026,0.840596,0.841045,0.840943,0.840594
10,0.0063,1.564664,0.84289,0.842913,0.843026,0.84288


[I 2025-03-24 00:07:49,774] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.004172059637224691, 'weight_decay': 0.004, 'warmup_steps': 417}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1873,0.463321,0.847477,0.847458,0.847573,0.847461
2,0.0754,0.595621,0.856651,0.856804,0.856877,0.85665
3,0.0546,0.722703,0.845183,0.845141,0.84511,0.845125
4,0.0428,0.783014,0.848624,0.848737,0.848825,0.848621
5,0.0349,0.929581,0.850917,0.851204,0.851204,0.850917
6,0.0288,0.999352,0.854358,0.854311,0.854414,0.854335
7,0.0232,1.074214,0.863532,0.863727,0.863297,0.86342
8,0.0193,1.288836,0.84633,0.84675,0.845984,0.846148
9,0.0158,1.349314,0.856651,0.857243,0.857045,0.856647
10,0.0121,1.486138,0.850917,0.851031,0.85112,0.850914


[I 2025-03-24 00:30:54,074] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.00045839085396423074, 'weight_decay': 0.005, 'warmup_steps': 354}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.275,0.405808,0.847477,0.852299,0.846394,0.846625
2,0.1255,0.50104,0.862385,0.863294,0.861918,0.86215
3,0.0703,0.629219,0.844037,0.84434,0.843732,0.843876
4,0.0455,0.67526,0.854358,0.854352,0.854246,0.854289
5,0.0317,0.78966,0.861239,0.862732,0.861844,0.861197
6,0.0231,0.934196,0.841743,0.842917,0.841185,0.841409
7,0.0169,1.122997,0.848624,0.849279,0.849036,0.848617
8,0.013,1.246649,0.837156,0.837674,0.836764,0.836936
9,0.0104,1.312271,0.850917,0.850967,0.851078,0.85091
10,0.0079,1.460496,0.855505,0.855554,0.855666,0.855498


[I 2025-03-24 01:05:51,828] Trial 23 finished with value: 0.8542886202128093 and parameters: {'learning_rate': 0.00045839085396423074, 'weight_decay': 0.005, 'warmup_steps': 354}. Best is trial 18 with value: 0.8589447686231806.


Trial 24 with params: {'learning_rate': 0.00025554160100023593, 'weight_decay': 0.005, 'warmup_steps': 381}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3085,0.391468,0.830275,0.834266,0.82925,0.829413
2,0.1773,0.42527,0.848624,0.84884,0.848362,0.848489
3,0.1163,0.55188,0.831422,0.835605,0.830376,0.830537
4,0.0807,0.606256,0.84289,0.843977,0.842353,0.842575
5,0.0588,0.800828,0.832569,0.834785,0.833312,0.832462


[I 2025-03-24 01:17:29,052] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.0009339869757242528, 'weight_decay': 0.006, 'warmup_steps': 296}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2309,0.451634,0.855505,0.85696,0.854909,0.855169
2,0.0812,0.563411,0.852064,0.852087,0.852204,0.852055
3,0.0445,0.71671,0.840596,0.840536,0.840606,0.840561
4,0.0289,0.915492,0.841743,0.841682,0.841732,0.841702
5,0.0203,1.031881,0.856651,0.857243,0.857045,0.856647
6,0.0152,1.249191,0.857798,0.85774,0.857792,0.857762
7,0.0117,1.201629,0.854358,0.854861,0.853993,0.854173
8,0.0091,1.375276,0.857798,0.857764,0.857877,0.857779
9,0.0074,1.34348,0.857798,0.857798,0.857919,0.857786
10,0.0058,1.552459,0.858945,0.860432,0.85955,0.858903


[I 2025-03-24 01:52:42,578] Trial 25 finished with value: 0.8635319306191749 and parameters: {'learning_rate': 0.0009339869757242528, 'weight_decay': 0.006, 'warmup_steps': 296}. Best is trial 25 with value: 0.8635319306191749.


Trial 26 with params: {'learning_rate': 0.0005251348606743107, 'weight_decay': 0.007, 'warmup_steps': 269}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2619,0.417219,0.84633,0.851729,0.845184,0.84539
2,0.1118,0.514068,0.844037,0.844045,0.8439,0.843955
3,0.0619,0.678694,0.853211,0.853162,0.853162,0.853162
4,0.04,0.747362,0.848624,0.853662,0.84752,0.847752
5,0.0277,0.812538,0.853211,0.853404,0.853456,0.85321
6,0.0202,1.122997,0.853211,0.854873,0.852572,0.852836
7,0.0151,0.97258,0.861239,0.863075,0.860581,0.860868
8,0.0117,1.072332,0.850917,0.850867,0.850867,0.850867
9,0.0093,1.367591,0.852064,0.852045,0.852162,0.852048
10,0.0071,1.47813,0.847477,0.847815,0.847784,0.847477


[I 2025-03-24 02:15:54,698] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.0030210510640078995, 'weight_decay': 0.01, 'warmup_steps': 289}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1854,0.502358,0.849771,0.851524,0.84911,0.849369
2,0.0668,0.585169,0.853211,0.853435,0.852951,0.85308
3,0.0436,0.68195,0.850917,0.851204,0.851204,0.850917
4,0.0327,0.746634,0.844037,0.844321,0.844321,0.844037
5,0.0262,1.010276,0.855505,0.856491,0.856003,0.855486
6,0.0203,0.99317,0.84633,0.846639,0.846026,0.846172
7,0.0171,0.933498,0.840596,0.840684,0.840395,0.840485
8,0.0138,1.191192,0.84633,0.846522,0.846573,0.846329
9,0.0109,1.030241,0.849771,0.849711,0.849783,0.849737
10,0.0083,1.285456,0.848624,0.848589,0.848699,0.848604


[I 2025-03-24 02:50:32,208] Trial 27 finished with value: 0.8566511876251837 and parameters: {'learning_rate': 0.0030210510640078995, 'weight_decay': 0.01, 'warmup_steps': 289}. Best is trial 25 with value: 0.8635319306191749.


Trial 28 with params: {'learning_rate': 0.0012578976339183787, 'weight_decay': 0.01, 'warmup_steps': 270}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2141,0.485913,0.852064,0.853394,0.851488,0.851736
2,0.0722,0.550958,0.852064,0.852144,0.852246,0.852059
3,0.0412,0.698518,0.850917,0.850867,0.850867,0.850867
4,0.0271,0.835273,0.863532,0.863935,0.863213,0.863381
5,0.0195,0.979895,0.83945,0.83996,0.839816,0.839446
6,0.0149,1.077523,0.856651,0.856804,0.856877,0.85665
7,0.0113,1.253386,0.853211,0.853731,0.853583,0.853208
8,0.0091,1.300222,0.847477,0.847419,0.847447,0.847432
9,0.0073,1.380441,0.866972,0.867376,0.867307,0.866972
10,0.006,1.383827,0.856651,0.856804,0.856877,0.85665


[I 2025-03-24 03:25:59,888] Trial 28 finished with value: 0.8623737373737375 and parameters: {'learning_rate': 0.0012578976339183787, 'weight_decay': 0.01, 'warmup_steps': 270}. Best is trial 25 with value: 0.8635319306191749.


Trial 29 with params: {'learning_rate': 0.0004918225770884542, 'weight_decay': 0.01, 'warmup_steps': 246}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2649,0.424941,0.84633,0.852147,0.845142,0.845334
2,0.117,0.510605,0.856651,0.857038,0.856329,0.856493
3,0.0649,0.62519,0.847477,0.847467,0.847363,0.847405
4,0.0422,0.67916,0.854358,0.855915,0.85374,0.854003
5,0.0292,0.798154,0.856651,0.858356,0.857298,0.856597
6,0.0213,0.877427,0.852064,0.852025,0.851993,0.852008
7,0.0161,0.915427,0.855505,0.855456,0.855456,0.855456
8,0.0123,1.283575,0.855505,0.855832,0.855203,0.855355
9,0.0096,1.210062,0.848624,0.848757,0.848404,0.848509
10,0.0076,1.463187,0.847477,0.847557,0.847657,0.847472


[I 2025-03-24 03:49:15,010] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.0011899829840926566, 'weight_decay': 0.004, 'warmup_steps': 321}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2176,0.475148,0.855505,0.856079,0.855119,0.85531
2,0.0727,0.54457,0.84289,0.842871,0.842984,0.842873
3,0.0406,0.718996,0.854358,0.854352,0.854246,0.854289
4,0.027,0.87258,0.84289,0.843054,0.842648,0.842761
5,0.0193,0.912697,0.858945,0.859097,0.859171,0.858943
6,0.0145,1.194339,0.854358,0.855433,0.854877,0.854335
7,0.0113,1.062308,0.865826,0.865906,0.866012,0.865821
8,0.0089,1.177232,0.849771,0.849945,0.849531,0.849647
9,0.0073,1.242993,0.853211,0.853162,0.853162,0.853162
10,0.0057,1.423479,0.862385,0.86258,0.862634,0.862385


[I 2025-03-24 04:12:57,536] Trial 30 finished with value: 0.8428897016372012 and parameters: {'learning_rate': 0.0011899829840926566, 'weight_decay': 0.004, 'warmup_steps': 321}. Best is trial 25 with value: 0.8635319306191749.


Trial 31 with params: {'learning_rate': 0.001614523497915837, 'weight_decay': 0.01, 'warmup_steps': 262}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2041,0.489247,0.852064,0.853394,0.851488,0.851736
2,0.0684,0.502157,0.870413,0.870436,0.870559,0.870404
3,0.0393,0.790436,0.844037,0.845636,0.843395,0.843639
4,0.0267,0.809398,0.853211,0.853186,0.853119,0.853148
5,0.0195,0.909417,0.850917,0.851896,0.851414,0.850898
6,0.0153,1.006977,0.84289,0.844549,0.843532,0.84283
7,0.012,1.183423,0.852064,0.852165,0.851867,0.851961
8,0.0097,1.008534,0.858945,0.858968,0.859087,0.858936
9,0.0077,1.398166,0.860092,0.860034,0.860087,0.860056
10,0.0062,1.441199,0.855505,0.855832,0.855203,0.855355


[I 2025-03-24 04:33:20,538] Trial 31 finished with value: 0.8577615011023589 and parameters: {'learning_rate': 0.001614523497915837, 'weight_decay': 0.01, 'warmup_steps': 262}. Best is trial 25 with value: 0.8635319306191749.


Trial 32 with params: {'learning_rate': 0.001403122459779143, 'weight_decay': 0.01, 'warmup_steps': 257}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2089,0.481572,0.858945,0.859227,0.858666,0.85881
2,0.0694,0.511717,0.854358,0.854311,0.854414,0.854335
3,0.0394,0.702197,0.853211,0.853325,0.853414,0.853208
4,0.0265,0.821205,0.850917,0.850892,0.850825,0.850854
5,0.0194,1.020348,0.860092,0.860286,0.860339,0.860091
6,0.0146,1.054736,0.852064,0.852104,0.851909,0.851978
7,0.0118,1.228997,0.848624,0.848673,0.848783,0.848617
8,0.0094,1.219429,0.834862,0.834827,0.834933,0.834841
9,0.0073,1.54456,0.848624,0.848816,0.848868,0.848623
10,0.006,1.499814,0.862385,0.862332,0.862423,0.862359


[I 2025-03-24 04:53:39,890] Trial 32 finished with value: 0.853204059193948 and parameters: {'learning_rate': 0.001403122459779143, 'weight_decay': 0.01, 'warmup_steps': 257}. Best is trial 25 with value: 0.8635319306191749.


Trial 33 with params: {'learning_rate': 0.0005301889096853833, 'weight_decay': 0.01, 'warmup_steps': 344}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2662,0.403714,0.855505,0.859176,0.854572,0.854862
2,0.1142,0.497607,0.858945,0.859054,0.85875,0.858847
3,0.0627,0.578123,0.852064,0.852018,0.85212,0.852041
4,0.0405,0.737818,0.857798,0.858379,0.857413,0.857606
5,0.028,0.851355,0.850917,0.851204,0.851204,0.850917
6,0.0205,1.03427,0.849771,0.853535,0.848815,0.84908
7,0.0152,0.912089,0.864679,0.865977,0.864128,0.864394
8,0.012,1.164971,0.850917,0.851621,0.850488,0.85069
9,0.0092,1.235885,0.860092,0.86024,0.859876,0.859986
10,0.0072,1.373455,0.84633,0.846275,0.846363,0.846301


[I 2025-03-24 05:07:11,296] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.0009140737525485632, 'weight_decay': 0.007, 'warmup_steps': 278}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2305,0.477387,0.853211,0.854257,0.852698,0.852932
2,0.0809,0.527098,0.850917,0.851031,0.85112,0.850914
3,0.0444,0.735089,0.845183,0.845217,0.845026,0.845094
4,0.0287,0.889165,0.83945,0.839744,0.839143,0.839284
5,0.0201,1.143577,0.844037,0.848151,0.845037,0.843799


[I 2025-03-24 05:13:57,305] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.001857468898001412, 'weight_decay': 0.007, 'warmup_steps': 219}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1951,0.532461,0.848624,0.848937,0.84832,0.848468
2,0.0647,0.62965,0.850917,0.850867,0.850867,0.850867
3,0.039,0.744352,0.84633,0.846279,0.846279,0.846279
4,0.0266,0.843836,0.845183,0.845277,0.844984,0.845076
5,0.0201,1.067148,0.841743,0.842027,0.842027,0.841743
6,0.0157,1.287766,0.834862,0.835651,0.835312,0.834848
7,0.0128,1.199635,0.837156,0.837159,0.837017,0.83707
8,0.0098,1.537463,0.833716,0.834751,0.834228,0.833689
9,0.0082,1.570045,0.845183,0.845277,0.844984,0.845076
10,0.0063,1.834253,0.848624,0.848589,0.848699,0.848604


[I 2025-03-24 05:34:53,293] Trial 35 finished with value: 0.8382770172953307 and parameters: {'learning_rate': 0.001857468898001412, 'weight_decay': 0.007, 'warmup_steps': 219}. Best is trial 25 with value: 0.8635319306191749.


Trial 36 with params: {'learning_rate': 0.0017539795315990006, 'weight_decay': 0.01, 'warmup_steps': 175}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1929,0.465736,0.865826,0.865827,0.865717,0.865762
2,0.0642,0.657306,0.850917,0.851575,0.85133,0.85091
3,0.0383,0.733988,0.857798,0.858974,0.85834,0.857771
4,0.0262,0.927822,0.844037,0.844007,0.843942,0.84397
5,0.0199,0.973231,0.853211,0.854791,0.853835,0.853162
6,0.0153,1.221262,0.84289,0.842843,0.842942,0.842865
7,0.0122,1.117538,0.836009,0.83659,0.835596,0.835774
8,0.0099,1.179109,0.84289,0.843054,0.842648,0.842761
9,0.0076,1.403614,0.866972,0.867376,0.867307,0.866972
10,0.0063,1.491482,0.848624,0.848737,0.848825,0.848621


[I 2025-03-24 05:48:44,264] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.0010588544048457534, 'weight_decay': 0.01, 'warmup_steps': 286}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2215,0.444243,0.865826,0.867457,0.865212,0.865499
2,0.0757,0.580387,0.853211,0.855801,0.854004,0.8531
3,0.042,0.674379,0.849771,0.850356,0.850162,0.849766
4,0.0275,0.787193,0.853211,0.853498,0.853498,0.853211
5,0.0194,1.122688,0.847477,0.852563,0.848583,0.847171
6,0.0146,1.318191,0.844037,0.843976,0.844026,0.843996
7,0.0114,1.24352,0.844037,0.844715,0.843605,0.843799
8,0.0092,1.394368,0.845183,0.845351,0.844942,0.845056
9,0.0072,1.538113,0.831422,0.83175,0.831092,0.831235
10,0.0059,1.797936,0.836009,0.836032,0.836143,0.835999


[I 2025-03-24 06:02:28,672] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.00012508880048385813, 'weight_decay': 0.009000000000000001, 'warmup_steps': 382}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3461,0.397097,0.816514,0.816789,0.816789,0.816514
2,0.2391,0.411084,0.837156,0.837156,0.83727,0.837142
3,0.1868,0.460971,0.832569,0.835055,0.831755,0.831971
4,0.1502,0.4642,0.854358,0.854299,0.854372,0.854325
5,0.1222,0.503334,0.834862,0.835144,0.835144,0.834862
6,0.1001,0.604562,0.850917,0.851349,0.850573,0.850741
7,0.0833,0.697903,0.829128,0.829834,0.829555,0.829117
8,0.071,0.744712,0.833716,0.833739,0.833554,0.833619
9,0.0604,0.876613,0.829128,0.8323,0.830018,0.828939
10,0.0517,0.806266,0.841743,0.841712,0.841648,0.841676


[I 2025-03-24 06:16:03,158] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 5.7801019639330395e-05, 'weight_decay': 0.002, 'warmup_steps': 381}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3795,0.424681,0.797018,0.796952,0.797013,0.796973
2,0.2994,0.417476,0.819954,0.8208,0.82042,0.819935
3,0.2576,0.417903,0.836009,0.836342,0.836312,0.836009
4,0.2263,0.429174,0.837156,0.838708,0.836512,0.83674
5,0.2005,0.44787,0.827982,0.831937,0.826955,0.827108


[I 2025-03-24 06:22:50,362] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.00442309608382424, 'weight_decay': 0.006, 'warmup_steps': 304}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1868,0.511163,0.856651,0.859534,0.855824,0.85612
2,0.0782,0.565971,0.847477,0.84793,0.847826,0.847475
3,0.0569,0.686761,0.837156,0.837156,0.83727,0.837142
4,0.0457,0.691942,0.841743,0.845231,0.840806,0.84104
5,0.0376,0.827626,0.823394,0.823443,0.823546,0.823386
6,0.0313,1.046726,0.837156,0.837273,0.836933,0.837033
7,0.0258,1.031305,0.838303,0.83833,0.838143,0.838209
8,0.0214,1.281939,0.834862,0.834864,0.834723,0.834775
9,0.0169,1.386388,0.832569,0.832569,0.832681,0.832555
10,0.0136,1.352874,0.836009,0.836159,0.836228,0.836007


[I 2025-03-24 06:36:15,131] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.003453814366154446, 'weight_decay': 0.01, 'warmup_steps': 205}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.187,0.419403,0.864679,0.86476,0.864507,0.864593
2,0.07,0.667692,0.847477,0.848781,0.846899,0.847139
3,0.0483,0.66301,0.850917,0.850931,0.850783,0.850839
4,0.0368,0.662214,0.857798,0.857817,0.857666,0.857723
5,0.0301,0.863357,0.850917,0.850858,0.850909,0.850879


[I 2025-03-24 06:42:58,353] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.0017810999010165504, 'weight_decay': 0.01, 'warmup_steps': 357}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2032,0.536371,0.856651,0.857808,0.856119,0.856364
2,0.0672,0.592254,0.858945,0.85985,0.859424,0.85893
3,0.0394,0.759232,0.848624,0.848816,0.848868,0.848623
4,0.0272,0.753726,0.847477,0.8475,0.847615,0.847467
5,0.02,1.051513,0.853211,0.853435,0.852951,0.85308
6,0.0156,1.075753,0.841743,0.841947,0.841479,0.841602
7,0.0122,1.135131,0.844037,0.844576,0.843647,0.843826
8,0.0096,1.400039,0.847477,0.848244,0.847026,0.847231
9,0.008,1.532936,0.84289,0.843142,0.842605,0.842739
10,0.0062,1.508242,0.854358,0.854302,0.85433,0.854315


[I 2025-03-24 07:03:10,942] Trial 42 finished with value: 0.8496470591332251 and parameters: {'learning_rate': 0.0017810999010165504, 'weight_decay': 0.01, 'warmup_steps': 357}. Best is trial 25 with value: 0.8635319306191749.


Trial 43 with params: {'learning_rate': 0.0030668738210386758, 'weight_decay': 0.01, 'warmup_steps': 239}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1857,0.468279,0.84633,0.84646,0.84611,0.846214
2,0.0686,0.577533,0.84633,0.847729,0.845731,0.845973
3,0.0457,0.577888,0.862385,0.862351,0.862465,0.862367
4,0.034,0.661172,0.858945,0.859337,0.858624,0.858789
5,0.0268,0.978853,0.847477,0.847512,0.84732,0.847389


[I 2025-03-24 07:09:51,377] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.0034104651800238926, 'weight_decay': 0.01, 'warmup_steps': 297}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1846,0.486978,0.860092,0.860206,0.860297,0.860089
2,0.0705,0.638008,0.850917,0.850931,0.850783,0.850839
3,0.0481,0.720432,0.853211,0.853325,0.853414,0.853208
4,0.0371,0.723554,0.857798,0.858131,0.857498,0.857651
5,0.0293,0.884021,0.854358,0.854698,0.854667,0.854358
6,0.024,0.964237,0.857798,0.857798,0.857919,0.857786
7,0.0192,1.030546,0.856651,0.857161,0.856287,0.85647
8,0.0162,1.154444,0.84289,0.843054,0.842648,0.842761
9,0.0127,1.22675,0.856651,0.856891,0.856919,0.856651
10,0.0098,1.583062,0.853211,0.853157,0.853246,0.853183


[I 2025-03-24 07:30:06,344] Trial 44 finished with value: 0.8496660547078061 and parameters: {'learning_rate': 0.0034104651800238926, 'weight_decay': 0.01, 'warmup_steps': 297}. Best is trial 25 with value: 0.8635319306191749.


Trial 45 with params: {'learning_rate': 0.001112407136080856, 'weight_decay': 0.009000000000000001, 'warmup_steps': 287}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2194,0.462439,0.858945,0.859227,0.858666,0.85881


In [43]:
print(best_trial3)

BestRun(run_id='88', objective=0.783874708715165, hyperparameters={'learning_rate': 0.0034351095517178363, 'weight_decay': 0.009000000000000001, 'warmup_steps': 15}, run_summary=None)


In [31]:
base.reset_seed()

In [32]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_aug_hp-search", logging_dir=f"~/logs/{DATASET}/bilstm-distill_aug_hp-search", remove_unused_columns=False, epochs=num_epochs, batch_size=batch_size)

In [33]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 5e-5, 5e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
        "lambda_param": trial.suggest_float("lambda_param",0,1,step=.1),
        "temperature": trial.suggest_float("temperature", 2,7, step=.5)
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [34]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [35]:
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_BiLSTM(),
)
  

In [36]:
best_trial4 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Distill-aug",
    n_trials=150
)

In [37]:
print(best_trial4)

BestRun(run_id='69', objective=0.8232465493473291, hyperparameters={'learning_rate': 0.003663666314607629, 'weight_decay': 0.0, 'warmup_steps': 21, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}, run_summary=None)


In [38]:
print("Best normal training score: ", best_trial)
print("Best distilation trianing score: ", best_trial2)
print("Best normal training score with augmentations: ", best_trial3)
print("Best distilation trianing score with augmentations: ",best_trial4)

NameError: name 'best_trial' is not defined