In [1]:
from transformers import Trainer, BertTokenizer, BertForSequenceClassification
from datasets import load_from_disk
import optuna
import torch
import math
import base

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Select the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Report the choice so the notebook output records which hardware was used.
if use_gpu:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 1g.10gb


In [3]:
# Dataset identifier; it is interpolated into the data, results and logs paths used below.
DATASET = "sst2"

In [4]:
def _load_split(split_name):
    """Load the saved split `split_name` of the current DATASET from under ~/data."""
    return load_from_disk(f"~/data/{DATASET}/{split_name}")


# Regular train / eval / test splits.
train_data = _load_split("train-logits")
eval_data = _load_split("eval-logits")
test_data = _load_split("test-logits")

# Augmented variant of the training split.
all_train_data = _load_split("train-logits-augmented")

# NOTE(review): the tokenizer is taken from a different checkpoint than the model built later
# (google/bert_uncased_L-2_H-128_A-2) — presumably both share the same vocabulary; confirm.
tokenizer = BertTokenizer.from_pretrained("ndavid/autotrain-trec-fine-bert-739422530")

In [5]:
# Fixed sequence length: every example is truncated / padded to this many tokens.
MAX_LENGTH = 300


def tokenize_split(dataset, desc):
    """Tokenize the "sentence" column of one dataset split.

    All splits go through exactly the same tokenizer settings, so the call is
    defined once here instead of being repeated per split.

    Args:
        dataset: A split exposing `.map(fn, batched=True, desc=...)` whose
            batches contain a "sentence" field.
        desc: Label passed to `.map` as its progress description.

    Returns:
        The result of `dataset.map(...)` with the tokenizer applied batch-wise.
    """
    return dataset.map(
        lambda e: tokenizer(
            e["sentence"],
            truncation=True,
            padding="max_length",
            return_tensors="pt",
            max_length=MAX_LENGTH,
        ),
        batched=True,
        desc=desc,
    )


train = tokenize_split(train_data, "Tokenizing the train dataset")
# NOTE: `eval` shadows the Python builtin; the name is kept because the Trainer cell refers to it.
eval = tokenize_split(eval_data, "Tokenizing the eval dataset")
test = tokenize_split(test_data, "Tokenizing the test dataset")

train_aug = tokenize_split(all_train_data, "Tokenizing the augmented dataset")

In [6]:
# Training budget: both values are passed to base.get_training_args and are also
# used to convert epochs into steps for the pruner and the warm-up search range.
num_epochs = 15
batch_size = 128

In [7]:
# Convert epochs into training steps.
# Assumes one step per batch of `batch_size` examples (no gradient accumulation) — TODO confirm
# against base.get_training_args.
data_length = len(train_data)
steps_per_epoch = math.ceil(data_length/batch_size)

# Resource bounds handed to the pruner: 5 epochs' worth of steps up to the full run.
min_r = steps_per_epoch*5
max_r = steps_per_epoch*num_epochs

# Upper bound of the warm-up search range: about a tenth of one epoch, rounded up.
warm_up = math.ceil(data_length/batch_size/10)

In [8]:
def hp_space(trial):
    """Sample one hyperparameter configuration from `trial`.

    Args:
        trial: Object providing `suggest_float`, `suggest_int` and a `number`
            attribute (the Optuna trial supplied by the search backend).

    Returns:
        dict with the keys "learning_rate" (log-scale float in [1e-5, 5e-4]),
        "weight_decay" (float in [0, 1e-2] in steps of 1e-3) and
        "warmup_steps" (int in [0, warm_up]).
    """
    # The suggestions are drawn in a fixed order: learning rate, weight decay, warm-up.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, warm_up)

    params = {
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "warmup_steps": warmup_steps,
    }
    # Log the sampled configuration so it appears next to the trial's training output.
    print(f"Trial {trial.number} with params: {params}")
    return params

In [9]:
# Hyperband pruner; its resource bounds are the step counts computed above (min_r .. max_r).
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)

# Multivariate TPE sampler with a fixed seed so the sequence of suggestions is repeatable.
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)



In [10]:
def get_Bert():
    """Build a fresh two-label BERT sequence classifier from the pretrained checkpoint.

    Returns:
        The `BertForSequenceClassification` instance loaded from
        "google/bert_uncased_L-2_H-128_A-2" with `num_labels=2`.
    """
    checkpoint = "google/bert_uncased_L-2_H-128_A-2"
    model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    return model

In [11]:
# Reset the random seed(s) through the project helper before the trainer is built.
# NOTE(review): `base.reset_seed` is defined outside this notebook — presumably it seeds
# Python / NumPy / PyTorch; confirm.
base.reset_seed()

In [12]:
# Training arguments come from the project helper; results and logs go under
# per-dataset directories.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base_hp-search",
    logging_dir=f"~/logs/{DATASET}/bert-base_hp-search",
    epochs=num_epochs,
    batch_size=batch_size,
)

In [13]:
# Trainer used for the hyperparameter search.
# `model_init` receives the zero-argument factory itself (no wrapping lambda needed), so the
# Trainer can build a new model from the pretrained checkpoint whenever it needs one.
trainer = Trainer(
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    model_init=get_Bert,
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def _eval_f1(metrics):
    """Objective for the search: the "eval_f1" entry of the evaluation metrics."""
    return metrics["eval_f1"]


# Run the Optuna-backed search, maximizing evaluation F1 over 150 trials.
# NOTE(review): no storage is configured, so the study lives only in memory (see the
# "A new study created in memory" log line) — results are lost if the kernel dies.
best_trial = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=_eval_f1,
    n_trials=150,
    direction="maximize",
    backend="optuna",
    study_name="Test-base",
    sampler=sampler,
    pruner=pruner,
)

[I 2025-03-28 11:31:45,321] A new study created in memory with name: Test-base


Trial 0 with params: {'learning_rate': 4.3284502212938785e-05, 'weight_decay': 0.01, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5189,0.461005,0.793578,0.793637,0.79334,0.793421
2,0.3628,0.434238,0.809633,0.809585,0.809527,0.809552
3,0.3057,0.43637,0.815367,0.815301,0.815326,0.815312
4,0.2721,0.454227,0.822248,0.82238,0.821998,0.822101
5,0.2462,0.441469,0.81078,0.810802,0.810906,0.810768
6,0.2274,0.461721,0.81422,0.814494,0.814494,0.81422
7,0.214,0.481988,0.813073,0.813301,0.813326,0.813073
8,0.2026,0.489547,0.81422,0.814175,0.814116,0.814141
9,0.1922,0.500466,0.808486,0.809646,0.809032,0.808444
10,0.1852,0.505199,0.816514,0.816456,0.816536,0.816479


[I 2025-03-28 11:38:30,533] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.00010401663679887307, 'weight_decay': 0.001, 'warmup_steps': 6}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.437,0.433008,0.808486,0.809646,0.809032,0.808444
2,0.2841,0.453273,0.803899,0.804788,0.803349,0.803506
3,0.2277,0.459349,0.819954,0.820032,0.820125,0.819948
4,0.1948,0.503375,0.825688,0.827847,0.824914,0.825113
5,0.1712,0.515211,0.813073,0.813504,0.81341,0.813071


[I 2025-03-28 11:41:59,601] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 1.2551115172973821e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6328,0.560277,0.739679,0.739647,0.739444,0.739498
2,0.4937,0.477547,0.779817,0.780281,0.780163,0.779812
3,0.4296,0.461764,0.791284,0.791225,0.791298,0.791245
4,0.3976,0.459046,0.78555,0.785693,0.785247,0.785345
5,0.3757,0.452762,0.793578,0.793775,0.793256,0.793365
6,0.3572,0.443788,0.81078,0.810802,0.810906,0.810768
7,0.3454,0.44071,0.813073,0.81322,0.813284,0.813071
8,0.3334,0.440655,0.809633,0.809597,0.809695,0.809608
9,0.3244,0.439242,0.811927,0.811927,0.812032,0.811911
10,0.3178,0.442916,0.811927,0.81201,0.811695,0.811784


[I 2025-03-28 11:48:38,556] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.00015958573588141273, 'weight_decay': 0.0, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4159,0.4161,0.806193,0.80892,0.807032,0.806007
2,0.2525,0.477402,0.808486,0.809103,0.808022,0.808177
3,0.1966,0.518501,0.815367,0.816057,0.815789,0.815355
4,0.1642,0.537852,0.811927,0.812776,0.8114,0.811569
5,0.1407,0.580719,0.802752,0.803122,0.803065,0.802751


[I 2025-03-28 11:52:00,636] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00025959425503112657, 'weight_decay': 0.002, 'warmup_steps': 8}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.376,0.430053,0.798165,0.80185,0.79915,0.797858
2,0.2235,0.529651,0.808486,0.809398,0.807938,0.808102
3,0.1681,0.592634,0.81422,0.814269,0.814368,0.814211
4,0.1348,0.574313,0.811927,0.812368,0.811526,0.811673
5,0.1111,0.705184,0.802752,0.802694,0.80277,0.802715
6,0.0925,0.78037,0.797018,0.797242,0.797266,0.797018
7,0.0778,0.869577,0.795872,0.796054,0.796097,0.79587
8,0.0689,0.83061,0.793578,0.793964,0.793172,0.7933
9,0.0607,0.955186,0.78555,0.789908,0.786636,0.785121
10,0.0526,0.97384,0.78555,0.785862,0.785836,0.78555


[I 2025-03-28 11:58:42,774] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5888,0.494544,0.770642,0.771171,0.770144,0.770251
2,0.4342,0.457863,0.793578,0.793519,0.793593,0.793539
3,0.38,0.444194,0.803899,0.803977,0.804065,0.803893
4,0.3481,0.448086,0.800459,0.801577,0.799844,0.799995
5,0.3245,0.440928,0.813073,0.813269,0.812779,0.812894


[I 2025-03-28 12:02:04,453] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 5.4182823195332406e-05, 'weight_decay': 0.003, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4971,0.451238,0.795872,0.795817,0.795761,0.795785
2,0.3409,0.430616,0.809633,0.809682,0.809779,0.809624
3,0.2834,0.438365,0.817661,0.817607,0.817578,0.817591
4,0.2499,0.46381,0.821101,0.821197,0.820872,0.820965
5,0.2243,0.453397,0.811927,0.812112,0.812158,0.811926
6,0.2066,0.479937,0.811927,0.812416,0.812284,0.811923
7,0.1929,0.501806,0.81422,0.814331,0.81441,0.814216
8,0.1821,0.520086,0.815367,0.815489,0.815115,0.815215
9,0.1719,0.532604,0.809633,0.810887,0.8102,0.809584
10,0.1649,0.539663,0.815367,0.815319,0.81541,0.815338


[I 2025-03-28 12:12:05,814] Trial 6 finished with value: 0.8164983164983165 and parameters: {'learning_rate': 5.4182823195332406e-05, 'weight_decay': 0.003, 'warmup_steps': 26}. Best is trial 6 with value: 0.8164983164983165.


Trial 7 with params: {'learning_rate': 1.7258215396625005e-05, 'weight_decay': 0.003, 'warmup_steps': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6046,0.512565,0.762615,0.763293,0.762051,0.762139
2,0.4522,0.463292,0.786697,0.786637,0.78671,0.786657
3,0.3971,0.450236,0.795872,0.795836,0.795929,0.795845
4,0.3653,0.451116,0.795872,0.796378,0.795424,0.795561
5,0.3425,0.444939,0.806193,0.806462,0.805854,0.805978


[I 2025-03-28 12:15:27,944] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 5.954553793888986e-05, 'weight_decay': 0.008, 'warmup_steps': 8}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4843,0.447157,0.802752,0.802686,0.802686,0.802686
2,0.3316,0.430202,0.811927,0.811891,0.81199,0.811902
3,0.2744,0.437524,0.817661,0.817631,0.817536,0.817574
4,0.2411,0.469189,0.823394,0.823871,0.822998,0.823156
5,0.2156,0.46056,0.81078,0.811102,0.811074,0.81078


[I 2025-03-28 12:18:50,743] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 7.475992999956501e-05, 'weight_decay': 0.006, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4626,0.443429,0.801606,0.801545,0.801518,0.80153
2,0.3112,0.436101,0.813073,0.813041,0.812947,0.812985
3,0.254,0.442275,0.813073,0.813078,0.812905,0.812965
4,0.2211,0.482657,0.826835,0.82784,0.826292,0.826488
5,0.1965,0.478751,0.811927,0.812543,0.812326,0.811918
6,0.1797,0.508953,0.81078,0.81133,0.811158,0.810774
7,0.1652,0.546123,0.811927,0.811927,0.812032,0.811911
8,0.1556,0.57079,0.816514,0.816969,0.816115,0.816266
9,0.1452,0.589826,0.806193,0.807532,0.80678,0.806135
10,0.138,0.597323,0.813073,0.813009,0.813074,0.813032


[I 2025-03-28 12:25:31,512] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.000247535485253281, 'weight_decay': 0.005, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3861,0.421908,0.803899,0.806888,0.80478,0.803682
2,0.2268,0.521161,0.809633,0.809782,0.809358,0.809464
3,0.1714,0.564505,0.815367,0.815799,0.815705,0.815365
4,0.137,0.598941,0.81422,0.814185,0.814284,0.814196
5,0.1143,0.716075,0.797018,0.797556,0.797392,0.797012


[I 2025-03-28 12:28:54,548] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 4.640705568040428e-05, 'weight_decay': 0.007, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.515,0.458557,0.793578,0.793637,0.79334,0.793421
2,0.3564,0.432715,0.808486,0.808438,0.808527,0.808456
3,0.2989,0.437386,0.817661,0.817607,0.817578,0.817591
4,0.2651,0.456671,0.821101,0.821197,0.820872,0.820965
5,0.2392,0.444662,0.81422,0.814269,0.814368,0.814211
6,0.2207,0.467614,0.81422,0.814494,0.814494,0.81422
7,0.2073,0.488377,0.809633,0.809906,0.809906,0.809633
8,0.196,0.498985,0.81422,0.814175,0.814116,0.814141
9,0.1857,0.510463,0.811927,0.813006,0.812453,0.811891
10,0.1787,0.516075,0.816514,0.816456,0.816536,0.816479


[I 2025-03-28 12:39:03,289] Trial 11 finished with value: 0.8141957479047304 and parameters: {'learning_rate': 4.640705568040428e-05, 'weight_decay': 0.007, 'warmup_steps': 42}. Best is trial 6 with value: 0.8164983164983165.


Trial 12 with params: {'learning_rate': 6.735226471879416e-05, 'weight_decay': 0.006, 'warmup_steps': 39}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4812,0.444684,0.805046,0.80501,0.805107,0.80502
2,0.3208,0.432085,0.811927,0.811891,0.81199,0.811902
3,0.2629,0.439497,0.816514,0.816603,0.816284,0.816375
4,0.2298,0.477914,0.821101,0.821839,0.82062,0.820795
5,0.205,0.47226,0.81078,0.811209,0.811116,0.810778


[I 2025-03-28 12:42:26,096] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 1.3343699108210729e-05, 'weight_decay': 0.007, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6306,0.553083,0.738532,0.73856,0.738234,0.738299
2,0.4861,0.474523,0.784404,0.784668,0.784668,0.784404
3,0.4235,0.459808,0.793578,0.793542,0.793635,0.793551
4,0.3915,0.457392,0.786697,0.78688,0.786373,0.786477
5,0.3693,0.451159,0.793578,0.793775,0.793256,0.793365
6,0.3507,0.442237,0.81078,0.810858,0.810948,0.810774
7,0.3387,0.439642,0.808486,0.808509,0.808611,0.808474
8,0.3265,0.440044,0.81078,0.810715,0.810779,0.810738
9,0.3175,0.438952,0.813073,0.813096,0.8132,0.813061
10,0.3108,0.442824,0.815367,0.815425,0.815157,0.815238


[I 2025-03-28 12:49:10,952] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.00010904880653392973, 'weight_decay': 0.003, 'warmup_steps': 25}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4384,0.431211,0.802752,0.805069,0.803528,0.802603
2,0.2808,0.456231,0.806193,0.80763,0.805517,0.805675
3,0.2238,0.463777,0.821101,0.82115,0.821251,0.821092
4,0.1911,0.511,0.823394,0.825064,0.822704,0.822902
5,0.1676,0.526253,0.811927,0.8122,0.8122,0.811927
6,0.1503,0.566655,0.807339,0.808244,0.807822,0.807314
7,0.1352,0.636703,0.805046,0.805156,0.805233,0.805042
8,0.1252,0.647055,0.811927,0.813305,0.811274,0.811447
9,0.1142,0.695792,0.797018,0.799183,0.797771,0.796877
10,0.1072,0.695591,0.807339,0.807304,0.807401,0.807314


[I 2025-03-28 12:55:49,186] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 4.211177033787934e-05, 'weight_decay': 0.002, 'warmup_steps': 40}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5233,0.462096,0.793578,0.793586,0.793382,0.793447
2,0.3657,0.434939,0.809633,0.809615,0.809485,0.809533
3,0.3084,0.435718,0.81422,0.814158,0.814158,0.814158
4,0.2749,0.453082,0.819954,0.820083,0.819704,0.819806
5,0.249,0.440047,0.81422,0.81422,0.814326,0.814205
6,0.23,0.459618,0.813073,0.813396,0.813368,0.813073
7,0.2167,0.479535,0.813073,0.81322,0.813284,0.813071
8,0.2052,0.485872,0.815367,0.815336,0.815242,0.815279
9,0.1948,0.496923,0.808486,0.809646,0.809032,0.808444
10,0.1877,0.501373,0.815367,0.815303,0.815368,0.815326


[I 2025-03-28 13:05:55,755] Trial 15 finished with value: 0.8164790066294854 and parameters: {'learning_rate': 4.211177033787934e-05, 'weight_decay': 0.002, 'warmup_steps': 40}. Best is trial 6 with value: 0.8164983164983165.


Trial 16 with params: {'learning_rate': 2.253617142285837e-05, 'weight_decay': 0.002, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5822,0.487769,0.78211,0.782283,0.781784,0.781885
2,0.4253,0.455102,0.793578,0.793507,0.793551,0.793525
3,0.3707,0.4414,0.807339,0.807524,0.807569,0.807338
4,0.3385,0.446829,0.802752,0.803715,0.802181,0.802336
5,0.3145,0.438808,0.815367,0.815566,0.815073,0.81519
6,0.2949,0.436247,0.81422,0.814185,0.814284,0.814196
7,0.2817,0.439375,0.809633,0.81012,0.80999,0.809629
8,0.2693,0.443794,0.81422,0.814153,0.8142,0.814172
9,0.259,0.446034,0.81078,0.811102,0.811074,0.81078
10,0.2518,0.449827,0.815367,0.815336,0.815242,0.815279


[I 2025-03-28 13:12:38,954] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 7.03604506316601e-05, 'weight_decay': 0.003, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4764,0.444201,0.805046,0.804987,0.805064,0.805009
2,0.3168,0.433334,0.813073,0.813009,0.813074,0.813032
3,0.259,0.440987,0.816514,0.816546,0.816326,0.816397
4,0.226,0.480724,0.821101,0.821839,0.82062,0.820795
5,0.2014,0.476768,0.809633,0.81012,0.80999,0.809629
6,0.1843,0.507897,0.809633,0.810247,0.810032,0.809624
7,0.17,0.53902,0.813073,0.81322,0.813284,0.813071
8,0.1602,0.562309,0.816514,0.816675,0.816241,0.81635
9,0.1498,0.579975,0.802752,0.804382,0.803402,0.802668
10,0.1429,0.586653,0.815367,0.815319,0.81541,0.815338


[I 2025-03-28 13:22:45,291] Trial 17 finished with value: 0.8141957479047304 and parameters: {'learning_rate': 7.03604506316601e-05, 'weight_decay': 0.003, 'warmup_steps': 35}. Best is trial 6 with value: 0.8164983164983165.


Trial 18 with params: {'learning_rate': 0.0002950137270531351, 'weight_decay': 0.01, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3722,0.436959,0.806193,0.80773,0.806822,0.806119
2,0.2178,0.538754,0.806193,0.807093,0.805643,0.805804
3,0.162,0.585874,0.823394,0.823443,0.823546,0.823386
4,0.1281,0.627788,0.806193,0.806239,0.80598,0.806058
5,0.104,0.736716,0.800459,0.801201,0.800897,0.800442
6,0.0858,0.830111,0.802752,0.80289,0.802475,0.802577
7,0.0722,0.93485,0.790138,0.79067,0.790509,0.790131
8,0.0647,0.872465,0.798165,0.798372,0.797845,0.797957
9,0.0558,0.964403,0.788991,0.793908,0.790141,0.7885
10,0.0487,0.969067,0.792431,0.792454,0.792551,0.792418


[I 2025-03-28 13:29:29,749] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 4.243886237843546e-05, 'weight_decay': 0.0, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5193,0.46164,0.792431,0.792462,0.792214,0.792287
2,0.3645,0.434751,0.808486,0.808451,0.808359,0.808395
3,0.3077,0.436179,0.81422,0.814158,0.814158,0.814158
4,0.2741,0.453622,0.819954,0.820083,0.819704,0.819806
5,0.2482,0.440832,0.813073,0.813054,0.813158,0.813053
6,0.2293,0.46021,0.816514,0.816891,0.816831,0.816513
7,0.216,0.480182,0.813073,0.81322,0.813284,0.813071
8,0.2045,0.487121,0.815367,0.815336,0.815242,0.815279
9,0.1941,0.497982,0.807339,0.808409,0.807864,0.807303
10,0.1871,0.502386,0.81422,0.814153,0.8142,0.814172


[I 2025-03-28 13:36:17,299] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 1.4635648999047601e-05, 'weight_decay': 0.006, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6172,0.533641,0.75,0.750444,0.749495,0.749573
2,0.471,0.469642,0.783257,0.783279,0.783373,0.783243
3,0.4134,0.45628,0.794725,0.794676,0.794761,0.794692
4,0.3819,0.455382,0.788991,0.789363,0.788583,0.788706
5,0.3597,0.44899,0.798165,0.798565,0.79776,0.797893
6,0.3411,0.440347,0.809633,0.809597,0.809695,0.809608
7,0.3289,0.43857,0.809633,0.809682,0.809779,0.809624
8,0.3167,0.439671,0.81078,0.810715,0.810779,0.810738
9,0.3074,0.439185,0.811927,0.812037,0.812116,0.811923
10,0.3007,0.443231,0.813073,0.813192,0.812821,0.81292


[I 2025-03-28 13:43:03,281] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 3.549077053310177e-05, 'weight_decay': 0.008, 'warmup_steps': 41}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5392,0.467534,0.791284,0.791227,0.791172,0.791195
2,0.3821,0.439669,0.803899,0.803943,0.803686,0.803763
3,0.3255,0.434552,0.809633,0.809633,0.809737,0.809617
4,0.2924,0.448834,0.813073,0.813269,0.812779,0.812894
5,0.2667,0.43527,0.813073,0.813009,0.813074,0.813032
6,0.2471,0.448982,0.817661,0.817808,0.817873,0.817658
7,0.2338,0.463963,0.816514,0.816789,0.816789,0.816514
8,0.2221,0.468531,0.817661,0.817595,0.81762,0.817607
9,0.2115,0.477164,0.81422,0.814981,0.814663,0.814205
10,0.2042,0.481366,0.821101,0.821059,0.820999,0.821025


[I 2025-03-28 13:53:07,870] Trial 21 finished with value: 0.8176065796760941 and parameters: {'learning_rate': 3.549077053310177e-05, 'weight_decay': 0.008, 'warmup_steps': 41}. Best is trial 21 with value: 0.8176065796760941.


Trial 22 with params: {'learning_rate': 0.00016346269168385325, 'weight_decay': 0.009000000000000001, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4146,0.415505,0.807339,0.80994,0.808159,0.807168
2,0.2509,0.478353,0.806193,0.806801,0.805727,0.80588
3,0.1951,0.522842,0.81422,0.814839,0.814621,0.814211
4,0.1628,0.538862,0.81078,0.811872,0.81019,0.810361
5,0.1391,0.587849,0.802752,0.803023,0.803023,0.802752
6,0.121,0.63035,0.801606,0.802279,0.802023,0.801593
7,0.1057,0.731946,0.809633,0.809682,0.809779,0.809624
8,0.0953,0.744622,0.806193,0.807093,0.805643,0.805804
9,0.0857,0.812466,0.792431,0.795069,0.793266,0.792232
10,0.0796,0.798995,0.800459,0.800507,0.800602,0.800449


[I 2025-03-28 13:59:50,900] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 6.51901776232881e-05, 'weight_decay': 0.002, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4848,0.445062,0.805046,0.80501,0.805107,0.80502
2,0.3238,0.431336,0.81078,0.81076,0.810863,0.81076
3,0.2659,0.438527,0.818807,0.818841,0.81862,0.818692
4,0.2327,0.475525,0.821101,0.821698,0.820662,0.820829
5,0.2078,0.468937,0.81078,0.811007,0.811032,0.81078


[I 2025-03-28 14:03:13,302] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 3.9182923462709525e-05, 'weight_decay': 0.007, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5273,0.464649,0.792431,0.792418,0.792256,0.792311
2,0.3723,0.436934,0.806193,0.806239,0.80598,0.806058
3,0.3156,0.435243,0.81078,0.810731,0.810821,0.81075
4,0.2822,0.451126,0.818807,0.818972,0.818536,0.818646
5,0.2564,0.437978,0.813073,0.813025,0.813116,0.813044
6,0.2371,0.455002,0.81422,0.814494,0.814494,0.81422
7,0.2238,0.473219,0.815367,0.815596,0.815621,0.815367
8,0.2123,0.478184,0.81422,0.814175,0.814116,0.814141
9,0.2018,0.488377,0.81422,0.815304,0.814747,0.814185
10,0.1946,0.492344,0.819954,0.819889,0.819915,0.819901


[I 2025-03-28 14:13:19,235] Trial 24 finished with value: 0.814172283698243 and parameters: {'learning_rate': 3.9182923462709525e-05, 'weight_decay': 0.007, 'warmup_steps': 29}. Best is trial 21 with value: 0.8176065796760941.


Trial 25 with params: {'learning_rate': 0.0003026895453749053, 'weight_decay': 0.0, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3753,0.435407,0.798165,0.80185,0.79915,0.797858
2,0.2171,0.541408,0.81078,0.810971,0.810485,0.810598
3,0.1605,0.621798,0.815367,0.81539,0.815494,0.815355
4,0.1259,0.610062,0.81078,0.810746,0.810653,0.81069
5,0.1024,0.770615,0.795872,0.795981,0.796055,0.795867


[I 2025-03-28 14:16:40,897] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 1.5421203730264887e-05, 'weight_decay': 0.01, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.619,0.530811,0.754587,0.754947,0.754126,0.754213
2,0.4669,0.467876,0.780963,0.780986,0.781079,0.780949
3,0.4087,0.454543,0.794725,0.794676,0.794761,0.794692
4,0.3768,0.453688,0.791284,0.791564,0.790919,0.791037
5,0.3542,0.447563,0.800459,0.800865,0.800055,0.80019
6,0.3354,0.439212,0.811927,0.811891,0.81199,0.811902
7,0.323,0.437797,0.808486,0.808564,0.808653,0.80848
8,0.3106,0.439416,0.813073,0.813009,0.813074,0.813032
9,0.3012,0.439181,0.811927,0.812037,0.812116,0.811923
10,0.2945,0.443126,0.815367,0.815489,0.815115,0.815215


[I 2025-03-28 14:23:30,786] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 2.968297395257728e-05, 'weight_decay': 0.0, 'warmup_steps': 37}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.555,0.472553,0.786697,0.786698,0.786499,0.786561
2,0.3987,0.444844,0.800459,0.800474,0.800265,0.800332
3,0.3432,0.435863,0.808486,0.808632,0.808695,0.808484
4,0.3106,0.446266,0.816514,0.816969,0.816115,0.816266
5,0.2855,0.43492,0.817661,0.817631,0.817536,0.817574
6,0.2657,0.440511,0.815367,0.815445,0.815536,0.815361
7,0.2523,0.450539,0.809633,0.81012,0.80999,0.809629
8,0.2403,0.455377,0.818807,0.818746,0.818746,0.818746
9,0.2297,0.460458,0.816514,0.816789,0.816789,0.816514
10,0.2223,0.465389,0.817661,0.817595,0.81762,0.817607


[I 2025-03-28 14:33:41,416] Trial 27 finished with value: 0.8210406668350594 and parameters: {'learning_rate': 2.968297395257728e-05, 'weight_decay': 0.0, 'warmup_steps': 37}. Best is trial 27 with value: 0.8210406668350594.


Trial 28 with params: {'learning_rate': 2.3378491074230134e-05, 'weight_decay': 0.0, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.578,0.485068,0.78211,0.782283,0.781784,0.781885
2,0.4216,0.453978,0.793578,0.793507,0.793551,0.793525
3,0.367,0.440464,0.808486,0.808713,0.808737,0.808486
4,0.3348,0.446589,0.803899,0.804788,0.803349,0.803506
5,0.3106,0.438215,0.81422,0.814307,0.813989,0.814079
6,0.291,0.436528,0.815367,0.815348,0.815452,0.815347
7,0.2778,0.440311,0.81078,0.811209,0.811116,0.810778
8,0.2654,0.444888,0.811927,0.811863,0.811863,0.811863
9,0.255,0.447509,0.809633,0.810007,0.809948,0.809632
10,0.2478,0.451377,0.817661,0.817631,0.817536,0.817574


[I 2025-03-28 14:43:55,170] Trial 28 finished with value: 0.8164983164983165 and parameters: {'learning_rate': 2.3378491074230134e-05, 'weight_decay': 0.0, 'warmup_steps': 35}. Best is trial 27 with value: 0.8210406668350594.


Trial 29 with params: {'learning_rate': 5.987571743934924e-05, 'weight_decay': 0.006, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4868,0.446797,0.805046,0.804977,0.805022,0.804996
2,0.3314,0.430145,0.81078,0.81076,0.810863,0.81076
3,0.2738,0.437763,0.816514,0.81647,0.81641,0.816436
4,0.2405,0.469815,0.823394,0.823756,0.82304,0.823185
5,0.2151,0.461497,0.81078,0.811102,0.811074,0.81078
6,0.1978,0.48911,0.81422,0.814981,0.814663,0.814205
7,0.1839,0.513199,0.818807,0.818918,0.818999,0.818804
8,0.1735,0.534864,0.816514,0.816759,0.816199,0.816324
9,0.1632,0.549102,0.809633,0.811287,0.810285,0.809552
10,0.1563,0.555292,0.81422,0.814185,0.814284,0.814196


[I 2025-03-28 14:50:43,090] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 2.1223336686458735e-05, 'weight_decay': 0.005, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5872,0.492097,0.775229,0.775663,0.774775,0.774887
2,0.431,0.456821,0.794725,0.794658,0.794719,0.794679
3,0.3766,0.44313,0.803899,0.803977,0.804065,0.803893
4,0.3446,0.447571,0.801606,0.80282,0.80097,0.801122
5,0.3208,0.440079,0.81422,0.814461,0.813905,0.814028
6,0.3013,0.436148,0.813073,0.813025,0.813116,0.813044
7,0.2883,0.438203,0.81078,0.811209,0.811116,0.810778
8,0.2759,0.442337,0.815367,0.815301,0.815326,0.815312
9,0.2656,0.444023,0.811927,0.812301,0.812242,0.811926
10,0.2586,0.447878,0.816514,0.816501,0.816368,0.816417


[I 2025-03-28 15:00:57,707] Trial 30 finished with value: 0.8187835072157246 and parameters: {'learning_rate': 2.1223336686458735e-05, 'weight_decay': 0.005, 'warmup_steps': 33}. Best is trial 27 with value: 0.8210406668350594.


Trial 31 with params: {'learning_rate': 1.3271009826348826e-05, 'weight_decay': 0.003, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6296,0.552509,0.738532,0.73856,0.738234,0.738299
2,0.4862,0.474619,0.784404,0.784668,0.784668,0.784404
3,0.4239,0.459902,0.792431,0.792382,0.792467,0.792398
4,0.392,0.457578,0.786697,0.78688,0.786373,0.786477
5,0.3699,0.451298,0.792431,0.792668,0.792088,0.792201


[I 2025-03-28 15:04:20,610] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 1.8537060185862908e-05, 'weight_decay': 0.005, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5995,0.505006,0.768349,0.768868,0.76785,0.767953
2,0.4449,0.461033,0.790138,0.790071,0.79013,0.790091
3,0.3901,0.447719,0.798165,0.798213,0.798308,0.798156
4,0.3582,0.449569,0.797018,0.797591,0.79655,0.796691
5,0.335,0.443253,0.81422,0.814461,0.813905,0.814028
6,0.3159,0.43692,0.815367,0.815303,0.815368,0.815326
7,0.303,0.43718,0.81078,0.811007,0.811032,0.81078
8,0.2906,0.440323,0.811927,0.811869,0.811947,0.811891
9,0.2807,0.441264,0.809633,0.809906,0.809906,0.809633
10,0.2738,0.44502,0.815367,0.815374,0.8152,0.81526


[I 2025-03-28 15:14:29,580] Trial 32 finished with value: 0.8164790066294854 and parameters: {'learning_rate': 1.8537060185862908e-05, 'weight_decay': 0.005, 'warmup_steps': 27}. Best is trial 27 with value: 0.8210406668350594.


Trial 33 with params: {'learning_rate': 4.795168759678875e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 40}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5116,0.457122,0.793578,0.793637,0.79334,0.793421
2,0.3532,0.432191,0.808486,0.808438,0.808527,0.808456
3,0.2956,0.437868,0.816514,0.816452,0.816452,0.816452
4,0.2619,0.458078,0.819954,0.820017,0.819746,0.819829
5,0.236,0.446449,0.813073,0.813096,0.8132,0.813061
6,0.2177,0.470423,0.81422,0.814494,0.814494,0.81422
7,0.2042,0.491395,0.811927,0.812301,0.812242,0.811926
8,0.193,0.503736,0.815367,0.815336,0.815242,0.815279
9,0.1827,0.515613,0.809633,0.810708,0.810158,0.809597
10,0.1757,0.521592,0.816514,0.816478,0.816578,0.81649


[I 2025-03-28 15:21:13,247] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 4.762569890675935e-05, 'weight_decay': 0.004, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5088,0.456994,0.792431,0.792518,0.792172,0.79226
2,0.3534,0.432231,0.808486,0.808438,0.808527,0.808456
3,0.2962,0.437926,0.817661,0.817607,0.817578,0.817591
4,0.2626,0.457898,0.819954,0.820017,0.819746,0.819829
5,0.2367,0.446094,0.813073,0.813096,0.8132,0.813061
6,0.2184,0.469164,0.813073,0.813396,0.813368,0.813073
7,0.2049,0.490271,0.811927,0.812301,0.812242,0.811926
8,0.1937,0.50244,0.816514,0.816501,0.816368,0.816417
9,0.1834,0.513555,0.809633,0.810708,0.810158,0.809597
10,0.1764,0.519685,0.815367,0.815319,0.81541,0.815338


[I 2025-03-28 15:31:20,164] Trial 34 finished with value: 0.8164790066294854 and parameters: {'learning_rate': 4.762569890675935e-05, 'weight_decay': 0.004, 'warmup_steps': 26}. Best is trial 27 with value: 0.8210406668350594.


Trial 35 with params: {'learning_rate': 5.820403075952114e-05, 'weight_decay': 0.003, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4893,0.447892,0.801606,0.801536,0.80156,0.801547
2,0.334,0.430155,0.811927,0.811891,0.81199,0.811902
3,0.2765,0.437776,0.816514,0.81647,0.81641,0.816436
4,0.2432,0.468046,0.821101,0.821456,0.820746,0.820889
5,0.2177,0.45906,0.813073,0.813396,0.813368,0.813073
6,0.2003,0.48624,0.813073,0.81376,0.813495,0.813061
7,0.1864,0.509586,0.816514,0.816625,0.816705,0.81651
8,0.1759,0.530453,0.816514,0.816675,0.816241,0.81635
9,0.1656,0.544112,0.808486,0.810032,0.809116,0.808413
10,0.1587,0.550525,0.813073,0.813025,0.813116,0.813044


[I 2025-03-28 15:38:02,225] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.0004180301872969493, 'weight_decay': 0.006, 'warmup_steps': 5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3553,0.435383,0.809633,0.81174,0.810369,0.809512
2,0.2061,0.601377,0.795872,0.796505,0.795382,0.795523
3,0.149,0.588913,0.808486,0.808509,0.808611,0.808474
4,0.1132,0.648933,0.787844,0.788804,0.787246,0.787374
5,0.0905,0.827626,0.793578,0.793586,0.793382,0.793447
6,0.0745,0.885744,0.787844,0.787795,0.787878,0.78781
7,0.0618,0.902952,0.787844,0.788259,0.788173,0.787842
8,0.0536,0.94665,0.794725,0.794681,0.794592,0.794627
9,0.0454,1.158072,0.792431,0.798736,0.793729,0.791774
10,0.0399,1.076639,0.794725,0.79539,0.79514,0.794712


[I 2025-03-28 15:44:45,757] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 1.795062715761262e-05, 'weight_decay': 0.006, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6041,0.509395,0.767202,0.767776,0.766681,0.766782
2,0.4487,0.462161,0.790138,0.790071,0.79013,0.790091
3,0.3933,0.448929,0.794725,0.794705,0.794803,0.794703
4,0.3614,0.450197,0.794725,0.795289,0.794256,0.794394
5,0.3384,0.443971,0.808486,0.808673,0.80819,0.808302


[I 2025-03-28 15:47:47,363] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 2.428567376732491e-05, 'weight_decay': 0.0, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5759,0.482828,0.78211,0.782366,0.781742,0.781852
2,0.4181,0.452714,0.793578,0.793507,0.793551,0.793525
3,0.3633,0.439498,0.809633,0.809906,0.809906,0.809633
4,0.331,0.446303,0.806193,0.80694,0.805685,0.805843
5,0.3066,0.437479,0.816514,0.816546,0.816326,0.816397
6,0.2868,0.436833,0.81422,0.81422,0.814326,0.814205
7,0.2736,0.441461,0.809633,0.81012,0.80999,0.809629
8,0.2613,0.44616,0.813073,0.813007,0.813031,0.813018
9,0.2508,0.449175,0.81078,0.81133,0.811158,0.810774
10,0.2436,0.45309,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 15:53:42,808] Trial 38 finished with value: 0.8176411246568802 and parameters: {'learning_rate': 2.428567376732491e-05, 'weight_decay': 0.0, 'warmup_steps': 43}. Best is trial 27 with value: 0.8210406668350594.


Trial 39 with params: {'learning_rate': 1.0718747648822253e-05, 'weight_decay': 0.001, 'warmup_steps': 39}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6441,0.584723,0.725917,0.725947,0.725594,0.725654
2,0.5186,0.488497,0.78211,0.783127,0.782626,0.782069
3,0.4467,0.46782,0.790138,0.790088,0.790172,0.790104
4,0.4137,0.463598,0.78555,0.785693,0.785247,0.785345
5,0.392,0.457488,0.792431,0.792518,0.792172,0.79226
6,0.3735,0.448136,0.799312,0.799246,0.799307,0.799267
7,0.3621,0.444314,0.806193,0.80627,0.806359,0.806186
8,0.3504,0.442985,0.81078,0.810802,0.810906,0.810768
9,0.3418,0.441009,0.807339,0.807449,0.807527,0.807335
10,0.3352,0.443898,0.809633,0.809658,0.809443,0.809512


[I 2025-03-28 15:57:37,451] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 1.9560937675681444e-05, 'weight_decay': 0.0, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5972,0.500498,0.768349,0.768868,0.76785,0.767953
2,0.4398,0.459567,0.794725,0.794676,0.794761,0.794692
3,0.3848,0.445929,0.803899,0.803977,0.804065,0.803893
4,0.3528,0.448701,0.798165,0.799103,0.797592,0.79774
5,0.3293,0.442103,0.813073,0.813269,0.812779,0.812894
6,0.31,0.436626,0.816514,0.816456,0.816536,0.816479
7,0.2971,0.437474,0.808486,0.808914,0.808822,0.808484
8,0.2846,0.441031,0.813073,0.813009,0.813074,0.813032
9,0.2746,0.442349,0.811927,0.812301,0.812242,0.811926
10,0.2676,0.446031,0.815367,0.815336,0.815242,0.815279


[I 2025-03-28 16:01:33,424] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 1.2431112024586663e-05, 'weight_decay': 0.0, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6307,0.558895,0.739679,0.739647,0.739444,0.739498
2,0.4935,0.477738,0.78211,0.782469,0.782416,0.782109
3,0.4302,0.462002,0.792431,0.792382,0.792467,0.792398
4,0.3985,0.459454,0.78555,0.785693,0.785247,0.785345
5,0.3766,0.453057,0.793578,0.793775,0.793256,0.793365
6,0.3582,0.444075,0.81078,0.810802,0.810906,0.810768
7,0.3465,0.440977,0.813073,0.81322,0.813284,0.813071
8,0.3345,0.440877,0.808486,0.808438,0.808527,0.808456
9,0.3256,0.439421,0.811927,0.811891,0.81199,0.811902
10,0.319,0.443096,0.811927,0.81201,0.811695,0.811784


[I 2025-03-28 16:05:36,329] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 4.5074303476162835e-05, 'weight_decay': 0.0, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5155,0.459516,0.793578,0.793637,0.79334,0.793421
2,0.3589,0.433328,0.81078,0.810715,0.810779,0.810738
3,0.3017,0.437077,0.818807,0.818746,0.818746,0.818746
4,0.268,0.455695,0.821101,0.821197,0.820872,0.820965
5,0.2421,0.443411,0.813073,0.813151,0.813242,0.813067
6,0.2235,0.465024,0.813073,0.813396,0.813368,0.813073
7,0.2101,0.485656,0.81078,0.811007,0.811032,0.81078
8,0.1988,0.494939,0.81422,0.814158,0.814158,0.814158
9,0.1884,0.506061,0.81078,0.811772,0.811284,0.81075
10,0.1814,0.511279,0.816514,0.816456,0.816536,0.816479


[I 2025-03-28 16:09:32,270] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 3.472712926997433e-05, 'weight_decay': 0.001, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5417,0.467896,0.792431,0.792386,0.792298,0.792333
2,0.3842,0.440298,0.802752,0.80277,0.80256,0.802627
3,0.3277,0.434538,0.809633,0.809633,0.809737,0.809617
4,0.2946,0.448246,0.81422,0.814377,0.813947,0.814055
5,0.269,0.434829,0.811927,0.811859,0.811905,0.811878
6,0.2494,0.447569,0.817661,0.817808,0.817873,0.817658
7,0.236,0.461926,0.815367,0.815691,0.815663,0.815367
8,0.2242,0.466492,0.816514,0.816452,0.816452,0.816452
9,0.2136,0.4746,0.815367,0.816057,0.815789,0.815355
10,0.2064,0.478975,0.822248,0.822196,0.822167,0.82218


[I 2025-03-28 16:15:28,105] Trial 43 finished with value: 0.8176065796760941 and parameters: {'learning_rate': 3.472712926997433e-05, 'weight_decay': 0.001, 'warmup_steps': 43}. Best is trial 27 with value: 0.8210406668350594.


Trial 44 with params: {'learning_rate': 5.969782925975992e-05, 'weight_decay': 0.0, 'warmup_steps': 40}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.492,0.447127,0.807339,0.807304,0.807401,0.807314
2,0.332,0.430598,0.81078,0.81076,0.810863,0.81076
3,0.2741,0.438239,0.817661,0.817607,0.817578,0.817591
4,0.2408,0.46976,0.824541,0.824962,0.824166,0.824319
5,0.2154,0.461077,0.813073,0.813396,0.813368,0.813073
6,0.1981,0.489548,0.815367,0.816057,0.815789,0.815355
7,0.1842,0.512932,0.817661,0.817739,0.817831,0.817655
8,0.1736,0.534327,0.81422,0.814377,0.813947,0.814055
9,0.1634,0.549126,0.809633,0.811081,0.810242,0.809569
10,0.1564,0.555784,0.81078,0.810715,0.810779,0.810738


[I 2025-03-28 16:19:24,484] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 2.3546370901208507e-05, 'weight_decay': 0.003, 'warmup_steps': 41}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5785,0.484808,0.78211,0.782283,0.781784,0.781885
2,0.4211,0.453761,0.793578,0.793507,0.793551,0.793525
3,0.3663,0.440248,0.808486,0.808713,0.808737,0.808486
4,0.3341,0.446458,0.803899,0.804788,0.803349,0.803506
5,0.3099,0.438,0.81422,0.814307,0.813989,0.814079
6,0.2901,0.436549,0.816514,0.816514,0.81662,0.816498
7,0.277,0.440513,0.81078,0.811209,0.811116,0.810778
8,0.2646,0.445129,0.81078,0.810723,0.810695,0.810708
9,0.2542,0.447783,0.809633,0.810007,0.809948,0.809632
10,0.247,0.451682,0.817661,0.817631,0.817536,0.817574


[I 2025-03-28 16:25:20,258] Trial 45 finished with value: 0.8176411246568802 and parameters: {'learning_rate': 2.3546370901208507e-05, 'weight_decay': 0.003, 'warmup_steps': 41}. Best is trial 27 with value: 0.8210406668350594.


Trial 46 with params: {'learning_rate': 1.4983957759694008e-05, 'weight_decay': 0.004, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6216,0.535261,0.747706,0.74825,0.747158,0.747226
2,0.4706,0.4691,0.78211,0.782158,0.782247,0.7821
3,0.4116,0.455599,0.795872,0.795812,0.795887,0.795833
4,0.3797,0.45439,0.790138,0.790369,0.789793,0.789905
5,0.3572,0.448242,0.799312,0.799662,0.798929,0.799058


[I 2025-03-28 16:27:19,126] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 2.9399379535433413e-05, 'weight_decay': 0.005, 'warmup_steps': 39}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5564,0.472949,0.786697,0.786698,0.786499,0.786561
2,0.3997,0.445148,0.800459,0.800474,0.800265,0.800332
3,0.3441,0.435993,0.808486,0.808632,0.808695,0.808484
4,0.3116,0.446173,0.816514,0.816969,0.816115,0.816266
5,0.2865,0.434969,0.817661,0.817631,0.817536,0.817574
6,0.2666,0.440322,0.815367,0.81539,0.815494,0.815355
7,0.2533,0.450002,0.809633,0.81012,0.80999,0.809629
8,0.2413,0.45481,0.817661,0.817595,0.81762,0.817607
9,0.2306,0.459814,0.81422,0.814596,0.814536,0.814219
10,0.2233,0.464673,0.816514,0.816447,0.816494,0.816466


[I 2025-03-28 16:33:17,884] Trial 47 finished with value: 0.8199008365355143 and parameters: {'learning_rate': 2.9399379535433413e-05, 'weight_decay': 0.005, 'warmup_steps': 39}. Best is trial 27 with value: 0.8210406668350594.


Trial 48 with params: {'learning_rate': 2.7666837299817864e-05, 'weight_decay': 0.003, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5631,0.47569,0.78555,0.785627,0.785289,0.785374
2,0.4056,0.447385,0.800459,0.800406,0.800349,0.800374
3,0.3502,0.436906,0.81078,0.810926,0.81099,0.810778
4,0.3178,0.446031,0.81078,0.811406,0.810316,0.810474
5,0.2929,0.435561,0.818807,0.818796,0.818662,0.818712
6,0.273,0.439025,0.815367,0.815445,0.815536,0.815361
7,0.2597,0.446847,0.81078,0.81133,0.811158,0.810774
8,0.2475,0.451704,0.818807,0.818741,0.818788,0.818761
9,0.2369,0.456035,0.81422,0.814711,0.814579,0.814216
10,0.2296,0.460556,0.821101,0.821041,0.821041,0.821041


[I 2025-03-28 16:39:14,695] Trial 48 finished with value: 0.8176315301415431 and parameters: {'learning_rate': 2.7666837299817864e-05, 'weight_decay': 0.003, 'warmup_steps': 43}. Best is trial 27 with value: 0.8210406668350594.


Trial 49 with params: {'learning_rate': 2.9184668552378307e-05, 'weight_decay': 0.005, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5569,0.473251,0.786697,0.786698,0.786499,0.786561
2,0.4003,0.445422,0.799312,0.799304,0.799139,0.799195
3,0.3449,0.436097,0.81078,0.810926,0.81099,0.810778
4,0.3124,0.446121,0.816514,0.816969,0.816115,0.816266
5,0.2873,0.435072,0.817661,0.817631,0.817536,0.817574
6,0.2674,0.440115,0.816514,0.816563,0.816662,0.816505
7,0.2541,0.449617,0.809633,0.81012,0.80999,0.809629
8,0.242,0.454398,0.817661,0.817595,0.81762,0.817607
9,0.2314,0.459304,0.81422,0.814596,0.814536,0.814219
10,0.2241,0.464199,0.816514,0.816447,0.816494,0.816466


[I 2025-03-28 16:43:09,828] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 2.4578217928446838e-05, 'weight_decay': 0.004, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5728,0.481784,0.783257,0.783562,0.782868,0.782983
2,0.4167,0.452318,0.793578,0.793507,0.793551,0.793525
3,0.362,0.439251,0.809633,0.810007,0.809948,0.809632
4,0.3298,0.446351,0.806193,0.80694,0.805685,0.805843
5,0.3054,0.437362,0.816514,0.816546,0.816326,0.816397
6,0.2856,0.437069,0.815367,0.815348,0.815452,0.815347
7,0.2724,0.441913,0.808486,0.809168,0.808906,0.808474
8,0.2601,0.446634,0.813073,0.813009,0.813074,0.813032
9,0.2496,0.449821,0.81078,0.81133,0.811158,0.810774
10,0.2424,0.453799,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 16:49:05,840] Trial 50 finished with value: 0.8187920875420875 and parameters: {'learning_rate': 2.4578217928446838e-05, 'weight_decay': 0.004, 'warmup_steps': 34}. Best is trial 27 with value: 0.8210406668350594.


Trial 51 with params: {'learning_rate': 3.136836462081459e-05, 'weight_decay': 0.004, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5484,0.470644,0.787844,0.787869,0.787625,0.787696
2,0.3933,0.443246,0.798165,0.79823,0.797929,0.798012
3,0.3377,0.435284,0.808486,0.808632,0.808695,0.808484
4,0.305,0.446814,0.81422,0.814558,0.813863,0.814
5,0.2797,0.434833,0.813073,0.813007,0.813031,0.813018
6,0.2599,0.442623,0.816514,0.816625,0.816705,0.81651
7,0.2466,0.454156,0.81422,0.814596,0.814536,0.814219
8,0.2347,0.458957,0.817661,0.817597,0.817662,0.81762
9,0.224,0.465062,0.813073,0.813504,0.81341,0.813071
10,0.2167,0.469913,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 16:55:14,295] Trial 51 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 3.136836462081459e-05, 'weight_decay': 0.004, 'warmup_steps': 31}. Best is trial 51 with value: 0.8221801222215643.


Trial 52 with params: {'learning_rate': 2.8888977110270142e-05, 'weight_decay': 0.005, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5567,0.473629,0.786697,0.786698,0.786499,0.786561
2,0.4011,0.445797,0.799312,0.799304,0.799139,0.799195
3,0.3459,0.436319,0.81078,0.810926,0.81099,0.810778
4,0.3134,0.446188,0.815367,0.815879,0.814947,0.815102
5,0.2884,0.43529,0.817661,0.817631,0.817536,0.817574
6,0.2685,0.439916,0.816514,0.816563,0.816662,0.816505
7,0.2552,0.449115,0.809633,0.81012,0.80999,0.809629
8,0.2431,0.453964,0.817661,0.817595,0.81762,0.817607
9,0.2325,0.458733,0.815367,0.815691,0.815663,0.815367
10,0.2252,0.463485,0.817661,0.817595,0.81762,0.817607


[I 2025-03-28 17:01:13,783] Trial 52 finished with value: 0.817620015390383 and parameters: {'learning_rate': 2.8888977110270142e-05, 'weight_decay': 0.005, 'warmup_steps': 33}. Best is trial 51 with value: 0.8221801222215643.


Trial 53 with params: {'learning_rate': 3.362759858007177e-05, 'weight_decay': 0.004, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.541,0.468698,0.790138,0.790122,0.789962,0.790016
2,0.3866,0.44124,0.802752,0.80277,0.80256,0.802627
3,0.3308,0.434863,0.808486,0.808509,0.808611,0.808474
4,0.2979,0.447875,0.815367,0.815657,0.815031,0.815163
5,0.2724,0.43513,0.813073,0.813007,0.813031,0.813018
6,0.2527,0.445937,0.815367,0.815596,0.815621,0.815367
7,0.2394,0.459545,0.815367,0.815691,0.815663,0.815367
8,0.2276,0.464144,0.817661,0.817595,0.81762,0.817607
9,0.2169,0.471648,0.811927,0.812543,0.812326,0.811918
10,0.2097,0.476051,0.821101,0.821041,0.821041,0.821041


[I 2025-03-28 17:07:12,056] Trial 53 finished with value: 0.8198856721857136 and parameters: {'learning_rate': 3.362759858007177e-05, 'weight_decay': 0.004, 'warmup_steps': 27}. Best is trial 51 with value: 0.8221801222215643.


Trial 54 with params: {'learning_rate': 3.2717712928282766e-05, 'weight_decay': 0.004, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5447,0.469434,0.788991,0.788957,0.788836,0.78888
2,0.3894,0.442069,0.801606,0.801647,0.801391,0.801467
3,0.3335,0.434955,0.81078,0.810858,0.810948,0.810774
4,0.3007,0.44732,0.81422,0.814558,0.813863,0.814
5,0.2753,0.434801,0.813073,0.813007,0.813031,0.813018


[I 2025-03-28 17:09:10,908] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 2.436869330093948e-05, 'weight_decay': 0.004, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.572,0.482045,0.783257,0.783562,0.782868,0.782983
2,0.4172,0.452681,0.793578,0.793507,0.793551,0.793525
3,0.3629,0.439539,0.808486,0.808713,0.808737,0.808486
4,0.3307,0.446558,0.806193,0.80694,0.805685,0.805843
5,0.3063,0.437634,0.815367,0.815425,0.815157,0.815238
6,0.2866,0.436984,0.815367,0.815348,0.815452,0.815347
7,0.2734,0.44162,0.807339,0.807952,0.807738,0.80733
8,0.2611,0.446316,0.813073,0.813009,0.813074,0.813032
9,0.2506,0.449399,0.81078,0.81133,0.811158,0.810774
10,0.2434,0.453405,0.817661,0.817607,0.817578,0.817591


[I 2025-03-28 17:15:08,204] Trial 55 finished with value: 0.817648799542307 and parameters: {'learning_rate': 2.436869330093948e-05, 'weight_decay': 0.004, 'warmup_steps': 26}. Best is trial 51 with value: 0.8221801222215643.


Trial 56 with params: {'learning_rate': 3.981577297029585e-05, 'weight_decay': 0.005, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5263,0.464097,0.791284,0.79129,0.791088,0.791151
2,0.3708,0.436467,0.806193,0.806239,0.80598,0.806058
3,0.314,0.435319,0.81078,0.810731,0.810821,0.81075
4,0.2806,0.451596,0.818807,0.818972,0.818536,0.818646
5,0.2547,0.438374,0.813073,0.813025,0.813116,0.813044
6,0.2356,0.455973,0.815367,0.815514,0.815578,0.815365
7,0.2222,0.47464,0.81422,0.814494,0.814494,0.81422
8,0.2107,0.479798,0.813073,0.813041,0.812947,0.812985
9,0.2002,0.490143,0.811927,0.813187,0.812495,0.811878
10,0.1931,0.49418,0.818807,0.818741,0.818788,0.818761


[I 2025-03-28 17:21:06,598] Trial 56 finished with value: 0.8130180659572537 and parameters: {'learning_rate': 3.981577297029585e-05, 'weight_decay': 0.005, 'warmup_steps': 31}. Best is trial 51 with value: 0.8221801222215643.


Trial 57 with params: {'learning_rate': 1.7967385120353152e-05, 'weight_decay': 0.003, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6029,0.508787,0.767202,0.767776,0.766681,0.766782
2,0.4484,0.462064,0.790138,0.790071,0.79013,0.790091
3,0.3932,0.448863,0.794725,0.794705,0.794803,0.794703
4,0.3613,0.450179,0.795872,0.796378,0.795424,0.795561
5,0.3383,0.443977,0.808486,0.808673,0.80819,0.808302
6,0.3192,0.437155,0.81422,0.814162,0.814242,0.814185
7,0.3064,0.437155,0.81078,0.811007,0.811032,0.81078
8,0.294,0.44008,0.811927,0.811869,0.811947,0.811891
9,0.2842,0.44072,0.808486,0.808807,0.80878,0.808486
10,0.2773,0.444543,0.815367,0.815374,0.8152,0.81526


[I 2025-03-28 17:25:02,858] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 3.376913526648993e-05, 'weight_decay': 0.003, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5417,0.468589,0.790138,0.790122,0.789962,0.790016
2,0.3864,0.441119,0.801606,0.801647,0.801391,0.801467
3,0.3304,0.434767,0.808486,0.808509,0.808611,0.808474
4,0.2975,0.447824,0.815367,0.815657,0.815031,0.815163
5,0.2719,0.434897,0.813073,0.813007,0.813031,0.813018
6,0.2523,0.446059,0.815367,0.815596,0.815621,0.815367
7,0.2389,0.459787,0.816514,0.816789,0.816789,0.816514
8,0.2271,0.464439,0.817661,0.817595,0.81762,0.817607
9,0.2165,0.471984,0.811927,0.812543,0.812326,0.811918
10,0.2092,0.476445,0.821101,0.821041,0.821041,0.821041


[I 2025-03-28 17:30:58,370] Trial 58 finished with value: 0.8199008365355143 and parameters: {'learning_rate': 3.376913526648993e-05, 'weight_decay': 0.003, 'warmup_steps': 32}. Best is trial 51 with value: 0.8221801222215643.


Trial 59 with params: {'learning_rate': 2.579909624398062e-05, 'weight_decay': 0.003, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5678,0.479067,0.78211,0.782212,0.781826,0.781916
2,0.412,0.450574,0.797018,0.796976,0.796887,0.796922
3,0.3572,0.438286,0.81078,0.811102,0.811074,0.81078
4,0.3249,0.446171,0.809633,0.810324,0.809148,0.809308
5,0.3003,0.436677,0.815367,0.815374,0.8152,0.81526
6,0.2804,0.437796,0.816514,0.816514,0.81662,0.816498
7,0.2672,0.443754,0.809633,0.810247,0.810032,0.809624
8,0.255,0.448581,0.816514,0.816452,0.816452,0.816452
9,0.2444,0.452297,0.811927,0.812416,0.812284,0.811923
10,0.2372,0.456422,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 17:36:57,220] Trial 59 finished with value: 0.8187920875420875 and parameters: {'learning_rate': 2.579909624398062e-05, 'weight_decay': 0.003, 'warmup_steps': 33}. Best is trial 51 with value: 0.8221801222215643.


Trial 60 with params: {'learning_rate': 6.710937090018343e-05, 'weight_decay': 0.004, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4795,0.444444,0.803899,0.80385,0.803938,0.803868
2,0.321,0.431766,0.811927,0.811891,0.81199,0.811902
3,0.2633,0.439349,0.816514,0.816603,0.816284,0.816375
4,0.2301,0.477631,0.822248,0.822918,0.821788,0.821961
5,0.2053,0.472177,0.81078,0.811209,0.811116,0.810778
6,0.1882,0.502289,0.81078,0.81133,0.811158,0.810774
7,0.174,0.530678,0.81422,0.814406,0.814452,0.814219
8,0.164,0.554301,0.815367,0.815489,0.815115,0.815215
9,0.1537,0.570594,0.805046,0.80648,0.805654,0.80498
10,0.1468,0.576663,0.815367,0.815319,0.81541,0.815338


[I 2025-03-28 17:42:54,024] Trial 60 finished with value: 0.8141957479047304 and parameters: {'learning_rate': 6.710937090018343e-05, 'weight_decay': 0.004, 'warmup_steps': 31}. Best is trial 51 with value: 0.8221801222215643.


Trial 61 with params: {'learning_rate': 3.332559849712927e-05, 'weight_decay': 0.003, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.543,0.468911,0.788991,0.788957,0.788836,0.78888
2,0.3876,0.441489,0.801606,0.801647,0.801391,0.801467
3,0.3317,0.434822,0.81078,0.810802,0.810906,0.810768
4,0.2988,0.447599,0.81422,0.814558,0.813863,0.814
5,0.2733,0.434854,0.813073,0.813007,0.813031,0.813018


[I 2025-03-28 17:44:52,686] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 2.871282927455198e-05, 'weight_decay': 0.002, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5564,0.473863,0.78555,0.785573,0.785331,0.785401
2,0.4016,0.446024,0.799312,0.799304,0.799139,0.799195
3,0.3464,0.43642,0.81078,0.810926,0.81099,0.810778
4,0.314,0.446224,0.815367,0.815879,0.814947,0.815102
5,0.289,0.435392,0.817661,0.817631,0.817536,0.817574
6,0.2692,0.439822,0.816514,0.816563,0.816662,0.816505
7,0.2559,0.448803,0.809633,0.81012,0.80999,0.809629
8,0.2438,0.453665,0.817661,0.817595,0.81762,0.817607
9,0.2332,0.458371,0.815367,0.815799,0.815705,0.815365
10,0.2259,0.46303,0.816514,0.816452,0.816452,0.816452


[I 2025-03-28 17:50:49,506] Trial 62 finished with value: 0.817620015390383 and parameters: {'learning_rate': 2.871282927455198e-05, 'weight_decay': 0.002, 'warmup_steps': 29}. Best is trial 51 with value: 0.8221801222215643.


Trial 63 with params: {'learning_rate': 3.107537670155256e-05, 'weight_decay': 0.003, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5497,0.470949,0.787844,0.787869,0.787625,0.787696
2,0.3942,0.443536,0.799312,0.799351,0.799097,0.799172
3,0.3386,0.435385,0.809633,0.809818,0.809864,0.809632
4,0.306,0.446661,0.81422,0.814558,0.813863,0.814
5,0.2807,0.434796,0.81422,0.814158,0.814158,0.814158
6,0.2609,0.442253,0.816514,0.816625,0.816705,0.81651
7,0.2475,0.453542,0.81422,0.814596,0.814536,0.814219
8,0.2356,0.458304,0.817661,0.817597,0.817662,0.81762
9,0.225,0.464206,0.813073,0.813396,0.813368,0.813073
10,0.2177,0.469138,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 17:56:45,539] Trial 63 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 3.107537670155256e-05, 'weight_decay': 0.003, 'warmup_steps': 33}. Best is trial 51 with value: 0.8221801222215643.


Trial 64 with params: {'learning_rate': 2.090646578509708e-05, 'weight_decay': 0.006, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5858,0.492517,0.774083,0.774567,0.773607,0.773718
2,0.4319,0.457258,0.793578,0.793519,0.793593,0.793539
3,0.378,0.44359,0.802752,0.802862,0.802938,0.802748
4,0.3461,0.447914,0.801606,0.80282,0.80097,0.801122
5,0.3224,0.440599,0.81422,0.814461,0.813905,0.814028
6,0.303,0.436237,0.81422,0.814162,0.814242,0.814185
7,0.29,0.438029,0.809633,0.809906,0.809906,0.809633
8,0.2776,0.442098,0.815367,0.815301,0.815326,0.815312
9,0.2675,0.443647,0.81078,0.811102,0.811074,0.81078
10,0.2604,0.447448,0.815367,0.815336,0.815242,0.815279


[I 2025-03-28 18:00:44,113] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 3.386295106767767e-05, 'weight_decay': 0.004, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5412,0.468485,0.790138,0.790122,0.789962,0.790016
2,0.3861,0.44103,0.802752,0.80277,0.80256,0.802627
3,0.3301,0.43475,0.808486,0.808509,0.808611,0.808474
4,0.2972,0.447862,0.816514,0.816759,0.816199,0.816324


Trial 66 with params: {'learning_rate': 2.5587190033751624e-05, 'weight_decay': 0.002, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5684,0.479461,0.780963,0.781097,0.780658,0.780753
2,0.4127,0.450851,0.797018,0.796976,0.796887,0.796922
3,0.358,0.438461,0.81078,0.811102,0.811074,0.81078
4,0.3257,0.446264,0.809633,0.810324,0.809148,0.809308
5,0.3012,0.436794,0.817661,0.817669,0.817494,0.817555
6,0.2813,0.437674,0.816514,0.816514,0.81662,0.816498
7,0.2681,0.443414,0.81078,0.811464,0.8112,0.810768
8,0.2558,0.448193,0.816514,0.816452,0.816452,0.816452
9,0.2453,0.451845,0.811927,0.812416,0.812284,0.811923
10,0.2381,0.455943,0.817661,0.817607,0.817578,0.817591


[I 2025-03-28 18:08:43,577] Trial 66 finished with value: 0.817648799542307 and parameters: {'learning_rate': 2.5587190033751624e-05, 'weight_decay': 0.002, 'warmup_steps': 32}. Best is trial 51 with value: 0.8221801222215643.


Trial 67 with params: {'learning_rate': 1.409348001376769e-05, 'weight_decay': 0.001, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6233,0.541714,0.739679,0.739862,0.739275,0.739349
2,0.4772,0.47152,0.784404,0.784452,0.784542,0.784393
3,0.4176,0.457742,0.793578,0.793519,0.793593,0.793539
4,0.3859,0.456086,0.787844,0.788162,0.787457,0.787576
5,0.3636,0.449761,0.800459,0.800865,0.800055,0.80019


[I 2025-03-28 18:10:41,538] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 4.190275252502407e-05, 'weight_decay': 0.006, 'warmup_steps': 40}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5238,0.462264,0.793578,0.793586,0.793382,0.793447
2,0.3662,0.435045,0.809633,0.809615,0.809485,0.809533
3,0.3089,0.435625,0.81422,0.814158,0.814158,0.814158
4,0.2754,0.452895,0.817661,0.817864,0.817368,0.817486
5,0.2495,0.439845,0.81422,0.81422,0.814326,0.814205
6,0.2305,0.459278,0.81422,0.814494,0.814494,0.81422
7,0.2172,0.479056,0.813073,0.81322,0.813284,0.813071
8,0.2057,0.485231,0.815367,0.815336,0.815242,0.815279
9,0.1953,0.496226,0.808486,0.809646,0.809032,0.808444
10,0.1882,0.500687,0.815367,0.815303,0.815368,0.815326


[I 2025-03-28 18:14:37,999] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.00046740581144500315, 'weight_decay': 0.007, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3623,0.440969,0.817661,0.817641,0.817746,0.817641
2,0.2041,0.570821,0.806193,0.806562,0.805812,0.805947
3,0.1458,0.644113,0.795872,0.796472,0.796266,0.795862
4,0.1094,0.697168,0.779817,0.780809,0.779195,0.779305
5,0.0865,0.874891,0.787844,0.78851,0.787331,0.787461
6,0.0715,0.879289,0.794725,0.796484,0.793961,0.794075
7,0.0577,0.974065,0.794725,0.795567,0.794172,0.794313
8,0.0504,0.912168,0.806193,0.806156,0.806064,0.806101
9,0.0428,1.072344,0.788991,0.7929,0.790014,0.788631
10,0.0373,1.132572,0.783257,0.783207,0.783289,0.783222


[I 2025-03-28 18:18:35,808] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.0003432306292766826, 'weight_decay': 0.004, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3665,0.454334,0.808486,0.810244,0.809158,0.808395
2,0.2122,0.56716,0.807339,0.807417,0.807106,0.807193
3,0.1559,0.617009,0.809633,0.809743,0.809822,0.809629
4,0.1208,0.662873,0.800459,0.800762,0.800097,0.800222
5,0.097,0.773072,0.793578,0.793548,0.793424,0.793469
6,0.0799,0.872373,0.800459,0.800507,0.800602,0.800449
7,0.0663,0.959243,0.795872,0.795812,0.795887,0.795833
8,0.0579,0.953465,0.803899,0.803943,0.803686,0.803763
9,0.0486,1.093834,0.787844,0.79257,0.788972,0.787374
10,0.0435,1.074588,0.794725,0.795261,0.795098,0.794718


[I 2025-03-28 18:22:33,719] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 3.70477842852424e-05, 'weight_decay': 0.002, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5334,0.466521,0.792431,0.792386,0.792298,0.792333
2,0.3777,0.438512,0.803899,0.803896,0.803728,0.803785
3,0.3212,0.434854,0.811927,0.811891,0.81199,0.811902
4,0.288,0.449769,0.815367,0.815566,0.815073,0.81519
5,0.2622,0.436541,0.81422,0.814153,0.8142,0.814172
6,0.2428,0.451848,0.816514,0.8167,0.816747,0.816513
7,0.2295,0.468203,0.816514,0.816789,0.816789,0.816514
8,0.2178,0.47262,0.816514,0.816452,0.816452,0.816452
9,0.2072,0.482228,0.813073,0.81407,0.813579,0.813044
10,0.2001,0.486196,0.819954,0.819901,0.819872,0.819886


[I 2025-03-28 18:28:34,481] Trial 71 finished with value: 0.8153259275336583 and parameters: {'learning_rate': 3.70477842852424e-05, 'weight_decay': 0.002, 'warmup_steps': 33}. Best is trial 51 with value: 0.8221801222215643.


Trial 72 with params: {'learning_rate': 4.785297222553491e-05, 'weight_decay': 0.004, 'warmup_steps': 39}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5115,0.457178,0.793578,0.793637,0.79334,0.793421
2,0.3534,0.432194,0.808486,0.808438,0.808527,0.808456
3,0.2958,0.437873,0.816514,0.816452,0.816452,0.816452
4,0.2621,0.457996,0.819954,0.820017,0.819746,0.819829
5,0.2362,0.446349,0.813073,0.813096,0.8132,0.813061
6,0.2179,0.470142,0.81422,0.814494,0.814494,0.81422
7,0.2045,0.491116,0.811927,0.812301,0.812242,0.811926
8,0.1932,0.503337,0.815367,0.815336,0.815242,0.815279
9,0.1829,0.51518,0.809633,0.810708,0.810158,0.809597
10,0.1759,0.521199,0.816514,0.816478,0.816578,0.81649


[I 2025-03-28 18:34:37,264] Trial 72 finished with value: 0.8164790066294854 and parameters: {'learning_rate': 4.785297222553491e-05, 'weight_decay': 0.004, 'warmup_steps': 39}. Best is trial 51 with value: 0.8221801222215643.


Trial 73 with params: {'learning_rate': 2.654871957864928e-05, 'weight_decay': 0.003, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5647,0.477491,0.784404,0.78451,0.784121,0.784212
2,0.4092,0.449304,0.797018,0.796976,0.796887,0.796922
3,0.3543,0.437786,0.811927,0.812112,0.812158,0.811926
4,0.322,0.446165,0.809633,0.810324,0.809148,0.809308
5,0.2973,0.43629,0.817661,0.817721,0.817452,0.817534
6,0.2774,0.438303,0.817661,0.817683,0.817789,0.817649
7,0.2642,0.444988,0.809633,0.810247,0.810032,0.809624
8,0.252,0.449904,0.817661,0.817597,0.817662,0.81762
9,0.2414,0.453835,0.813073,0.813626,0.813452,0.813067
10,0.2341,0.458088,0.819954,0.819889,0.819915,0.819901


[I 2025-03-28 18:40:34,152] Trial 73 finished with value: 0.8187920875420875 and parameters: {'learning_rate': 2.654871957864928e-05, 'weight_decay': 0.003, 'warmup_steps': 32}. Best is trial 51 with value: 0.8221801222215643.


Trial 74 with params: {'learning_rate': 4.1363313232318837e-05, 'weight_decay': 0.002, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.521,0.462433,0.794725,0.794758,0.794508,0.794582
2,0.3669,0.435432,0.809633,0.809615,0.809485,0.809533
3,0.3102,0.435801,0.81422,0.814158,0.814158,0.814158
4,0.2767,0.452832,0.818807,0.818972,0.818536,0.818646
5,0.2508,0.439882,0.813073,0.813054,0.813158,0.813053
6,0.2318,0.458313,0.815367,0.815596,0.815621,0.815367
7,0.2185,0.477805,0.813073,0.81322,0.813284,0.813071
8,0.207,0.4839,0.81422,0.814206,0.814073,0.814122
9,0.1966,0.494657,0.808486,0.809646,0.809032,0.808444
10,0.1895,0.498777,0.81422,0.814153,0.8142,0.814172


[I 2025-03-28 18:44:32,358] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 2.1035077926232735e-05, 'weight_decay': 0.004, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5879,0.492804,0.772936,0.773361,0.77248,0.77259
2,0.4319,0.457139,0.793578,0.793519,0.793593,0.793539
3,0.3775,0.443387,0.802752,0.802862,0.802938,0.802748
4,0.3455,0.447666,0.800459,0.801577,0.799844,0.799995
5,0.3218,0.440299,0.813073,0.813358,0.812737,0.812866


[I 2025-03-28 18:46:31,377] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 5.765910166226165e-05, 'weight_decay': 0.003, 'warmup_steps': 30}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4926,0.448336,0.803899,0.803834,0.803896,0.803855
2,0.3351,0.430408,0.811927,0.811891,0.81199,0.811902
3,0.2774,0.438178,0.817661,0.817607,0.817578,0.817591
4,0.244,0.467539,0.821101,0.821356,0.820788,0.820916
5,0.2185,0.458223,0.813073,0.813396,0.813368,0.813073
6,0.2011,0.485894,0.81422,0.814839,0.814621,0.814211
7,0.1873,0.508494,0.815367,0.815514,0.815578,0.815365
8,0.1766,0.529123,0.815367,0.815489,0.815115,0.815215
9,0.1664,0.542994,0.81078,0.812132,0.811369,0.810724
10,0.1595,0.54967,0.81422,0.814185,0.814284,0.814196


[I 2025-03-28 18:50:29,733] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 2.8245230488944487e-05, 'weight_decay': 0.003, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5591,0.474684,0.786697,0.786747,0.786457,0.786536
2,0.4033,0.446562,0.798165,0.798138,0.798013,0.798059
3,0.3481,0.436653,0.808486,0.808632,0.808695,0.808484
4,0.3157,0.446182,0.813073,0.813708,0.812611,0.812772
5,0.2907,0.435445,0.817661,0.817631,0.817536,0.817574
6,0.2709,0.439415,0.816514,0.816563,0.816662,0.816505
7,0.2576,0.447902,0.809633,0.81012,0.80999,0.809629
8,0.2455,0.452843,0.817661,0.817595,0.81762,0.817607
9,0.2348,0.457314,0.81422,0.814711,0.814579,0.814216
10,0.2275,0.461963,0.817661,0.817595,0.81762,0.817607


[I 2025-03-28 18:56:26,507] Trial 77 finished with value: 0.8153375871244556 and parameters: {'learning_rate': 2.8245230488944487e-05, 'weight_decay': 0.003, 'warmup_steps': 34}. Best is trial 51 with value: 0.8221801222215643.


Trial 78 with params: {'learning_rate': 0.00015823988166036246, 'weight_decay': 0.008, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4116,0.416075,0.808486,0.811228,0.809327,0.808302
2,0.2527,0.475163,0.802752,0.803284,0.802307,0.802452
3,0.197,0.517512,0.813073,0.813504,0.81341,0.813071
4,0.165,0.533794,0.81422,0.814669,0.813821,0.81397
5,0.1414,0.587866,0.799312,0.799852,0.799686,0.799305


[I 2025-03-28 18:58:25,500] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 4.7117596993793786e-05, 'weight_decay': 0.005, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.509,0.457438,0.792431,0.792518,0.792172,0.79226
2,0.3543,0.432411,0.808486,0.808438,0.808527,0.808456
3,0.2973,0.437813,0.817661,0.817607,0.817578,0.817591
4,0.2636,0.457513,0.819954,0.820017,0.819746,0.819829
5,0.2377,0.445571,0.813073,0.813096,0.8132,0.813061
6,0.2194,0.468162,0.813073,0.813396,0.813368,0.813073
7,0.206,0.489231,0.811927,0.812301,0.812242,0.811926
8,0.1947,0.500913,0.816514,0.816501,0.816368,0.816417
9,0.1844,0.51184,0.809633,0.810708,0.810158,0.809597
10,0.1774,0.517801,0.816514,0.816478,0.816578,0.81649


[I 2025-03-28 19:04:22,601] Trial 79 finished with value: 0.8164790066294854 and parameters: {'learning_rate': 4.7117596993793786e-05, 'weight_decay': 0.005, 'warmup_steps': 23}. Best is trial 51 with value: 0.8221801222215643.


Trial 80 with params: {'learning_rate': 2.663337414236467e-05, 'weight_decay': 0.0, 'warmup_steps': 30}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.564,0.477305,0.784404,0.78451,0.784121,0.784212
2,0.4088,0.449155,0.797018,0.796976,0.796887,0.796922
3,0.354,0.437743,0.811927,0.812112,0.812158,0.811926
4,0.3217,0.446264,0.809633,0.810324,0.809148,0.809308
5,0.2969,0.436296,0.817661,0.817721,0.817452,0.817534
6,0.2771,0.438375,0.817661,0.817683,0.817789,0.817649
7,0.2638,0.445138,0.809633,0.810247,0.810032,0.809624
8,0.2516,0.450059,0.817661,0.817597,0.817662,0.81762
9,0.2411,0.453966,0.813073,0.813626,0.813452,0.813067
10,0.2338,0.458288,0.819954,0.819889,0.819915,0.819901


[I 2025-03-28 19:10:18,476] Trial 80 finished with value: 0.8187920875420875 and parameters: {'learning_rate': 2.663337414236467e-05, 'weight_decay': 0.0, 'warmup_steps': 30}. Best is trial 51 with value: 0.8221801222215643.


Trial 81 with params: {'learning_rate': 1.382966814410927e-05, 'weight_decay': 0.01, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6241,0.543986,0.739679,0.739793,0.739318,0.739391
2,0.4794,0.472371,0.783257,0.783333,0.783415,0.78325
3,0.4194,0.458358,0.792431,0.792364,0.792424,0.792385
4,0.3877,0.456583,0.786697,0.786965,0.786331,0.786444
5,0.3656,0.450269,0.798165,0.798565,0.79776,0.797893
6,0.347,0.441444,0.813073,0.813096,0.8132,0.813061
7,0.335,0.439136,0.811927,0.811927,0.812032,0.811911
8,0.3228,0.439853,0.81078,0.810715,0.810779,0.810738
9,0.3137,0.438984,0.813073,0.813096,0.8132,0.813061
10,0.307,0.442954,0.816514,0.816603,0.816284,0.816375


[I 2025-03-28 19:16:18,326] Trial 81 finished with value: 0.8164896275602275 and parameters: {'learning_rate': 1.382966814410927e-05, 'weight_decay': 0.01, 'warmup_steps': 15}. Best is trial 51 with value: 0.8221801222215643.


Trial 82 with params: {'learning_rate': 0.0002891902282670203, 'weight_decay': 0.0, 'warmup_steps': 5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3701,0.432953,0.803899,0.805429,0.804528,0.803825
2,0.2188,0.540377,0.805046,0.806192,0.804433,0.804593
3,0.1631,0.58414,0.813073,0.813007,0.813031,0.813018
4,0.1292,0.582015,0.809633,0.809782,0.809358,0.809464
5,0.105,0.761585,0.798165,0.798348,0.798392,0.798164


[I 2025-03-28 19:18:17,385] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 1.1683809439115933e-05, 'weight_decay': 0.003, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6383,0.571767,0.731651,0.731667,0.731351,0.731413
2,0.5047,0.482058,0.780963,0.781612,0.781374,0.780949
3,0.4372,0.464365,0.792431,0.792382,0.792467,0.792398
4,0.4049,0.46094,0.78555,0.785693,0.785247,0.785345
5,0.3831,0.454782,0.792431,0.792518,0.792172,0.79226
6,0.3646,0.445663,0.805046,0.80501,0.805107,0.80502
7,0.353,0.442175,0.809633,0.809743,0.809822,0.809629
8,0.3411,0.44156,0.808486,0.808438,0.808527,0.808456
9,0.3323,0.439794,0.808486,0.808564,0.808653,0.80848
10,0.3257,0.443173,0.809633,0.809658,0.809443,0.809512


[I 2025-03-28 19:22:15,414] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 2.407908508263029e-05, 'weight_decay': 0.003, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5742,0.482962,0.783257,0.783562,0.782868,0.782983
2,0.4185,0.453025,0.793578,0.793507,0.793551,0.793525
3,0.3641,0.439748,0.808486,0.808713,0.808737,0.808486
4,0.3319,0.446513,0.806193,0.80694,0.805685,0.805843
5,0.3076,0.437754,0.815367,0.815425,0.815157,0.815238
6,0.2878,0.436803,0.815367,0.815348,0.815452,0.815347
7,0.2746,0.441214,0.809633,0.81012,0.80999,0.809629
8,0.2623,0.445844,0.81422,0.814153,0.8142,0.814172
9,0.2519,0.448855,0.811927,0.812416,0.812284,0.811923
10,0.2447,0.452729,0.817661,0.817607,0.817578,0.817591


[I 2025-03-28 19:26:16,155] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 3.830978114707282e-05, 'weight_decay': 0.003, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5308,0.46564,0.792431,0.792418,0.792256,0.792311
2,0.3746,0.43755,0.805046,0.805066,0.804854,0.804922
3,0.3179,0.43503,0.81078,0.810731,0.810821,0.81075
4,0.2845,0.450596,0.818807,0.818972,0.818536,0.818646
5,0.2587,0.437287,0.81422,0.814162,0.814242,0.814185
6,0.2394,0.45388,0.816514,0.8167,0.816747,0.816513
7,0.2261,0.471408,0.815367,0.815596,0.815621,0.815367
8,0.2145,0.47602,0.813073,0.813017,0.812989,0.813002
9,0.2039,0.486037,0.81422,0.815304,0.814747,0.814185
10,0.1968,0.489988,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 19:32:13,257] Trial 85 finished with value: 0.814184994212354 and parameters: {'learning_rate': 3.830978114707282e-05, 'weight_decay': 0.003, 'warmup_steps': 35}. Best is trial 51 with value: 0.8221801222215643.


Trial 86 with params: {'learning_rate': 2.1100268354513166e-05, 'weight_decay': 0.004, 'warmup_steps': 37}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5886,0.492891,0.772936,0.773361,0.77248,0.77259
2,0.4318,0.457034,0.793578,0.793519,0.793593,0.793539
3,0.3772,0.443332,0.802752,0.802862,0.802938,0.802748
4,0.3451,0.447571,0.800459,0.801577,0.799844,0.799995
5,0.3214,0.440224,0.813073,0.813358,0.812737,0.812866


[I 2025-03-28 19:34:10,300] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 2.067709333338839e-05, 'weight_decay': 0.003, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5879,0.493754,0.772936,0.773361,0.77248,0.77259
2,0.4333,0.457592,0.793578,0.793519,0.793593,0.793539
3,0.3791,0.443941,0.802752,0.802862,0.802938,0.802748
4,0.3472,0.447988,0.799312,0.800338,0.798718,0.798867
5,0.3236,0.440758,0.813073,0.813358,0.812737,0.812866
6,0.3042,0.436286,0.815367,0.815319,0.81541,0.815338
7,0.2912,0.437865,0.809633,0.809906,0.809906,0.809633
8,0.2788,0.441884,0.81422,0.814153,0.8142,0.814172
9,0.2686,0.443306,0.81078,0.811102,0.811074,0.81078
10,0.2615,0.447166,0.815367,0.815336,0.815242,0.815279


[I 2025-03-28 19:38:05,193] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 3.738633777325785e-05, 'weight_decay': 0.003, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5314,0.466259,0.793578,0.793548,0.793424,0.793469
2,0.3767,0.438252,0.803899,0.803896,0.803728,0.803785
3,0.3203,0.434995,0.81078,0.810731,0.810821,0.81075
4,0.287,0.450015,0.816514,0.816675,0.816241,0.81635
5,0.2612,0.436892,0.81422,0.814153,0.8142,0.814172
6,0.2419,0.452412,0.816514,0.8167,0.816747,0.816513
7,0.2285,0.469227,0.815367,0.815596,0.815621,0.815367
8,0.2169,0.473605,0.813073,0.813017,0.812989,0.813002
9,0.2064,0.483233,0.811927,0.813006,0.812453,0.811891
10,0.1992,0.487167,0.819954,0.819901,0.819872,0.819886


[I 2025-03-28 19:43:58,485] Trial 88 finished with value: 0.8164664530353019 and parameters: {'learning_rate': 3.738633777325785e-05, 'weight_decay': 0.003, 'warmup_steps': 28}. Best is trial 51 with value: 0.8221801222215643.


Trial 89 with params: {'learning_rate': 2.0626243186970018e-05, 'weight_decay': 0.006, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5891,0.494258,0.771789,0.772265,0.771312,0.771421
2,0.4337,0.457681,0.793578,0.793519,0.793593,0.793539
3,0.3794,0.444012,0.802752,0.802862,0.802938,0.802748
4,0.3474,0.447971,0.799312,0.800512,0.798676,0.798823
5,0.3238,0.440743,0.813073,0.813358,0.812737,0.812866


[I 2025-03-28 19:45:54,982] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 3.6994363225924204e-05, 'weight_decay': 0.005, 'warmup_steps': 37}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5344,0.466592,0.791284,0.791253,0.79113,0.791175
2,0.378,0.438515,0.803899,0.803896,0.803728,0.803785
3,0.3214,0.434774,0.811927,0.811891,0.81199,0.811902
4,0.2881,0.44979,0.815367,0.815566,0.815073,0.81519
5,0.2623,0.436336,0.815367,0.815303,0.815368,0.815326
6,0.2429,0.451768,0.816514,0.8167,0.816747,0.816513
7,0.2296,0.468076,0.816514,0.816789,0.816789,0.816514
8,0.2179,0.472469,0.816514,0.816452,0.816452,0.816452
9,0.2074,0.482031,0.813073,0.81407,0.813579,0.813044
10,0.2002,0.486111,0.819954,0.819901,0.819872,0.819886


[I 2025-03-28 19:51:51,081] Trial 90 finished with value: 0.8153259275336583 and parameters: {'learning_rate': 3.6994363225924204e-05, 'weight_decay': 0.005, 'warmup_steps': 37}. Best is trial 51 with value: 0.8221801222215643.


Trial 91 with params: {'learning_rate': 1.5775528862129258e-05, 'weight_decay': 0.0, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6148,0.52589,0.758028,0.758816,0.75742,0.757491
2,0.4633,0.466767,0.784404,0.784368,0.784457,0.784375
3,0.4062,0.453582,0.795872,0.795836,0.795929,0.795845
4,0.3744,0.453174,0.788991,0.789363,0.788583,0.788706
5,0.3519,0.447031,0.802752,0.803166,0.802349,0.802486
6,0.333,0.438821,0.81078,0.810731,0.810821,0.81075
7,0.3206,0.437651,0.808486,0.808564,0.808653,0.80848
8,0.3082,0.439416,0.811927,0.811859,0.811905,0.811878
9,0.2988,0.439371,0.81078,0.810926,0.81099,0.810778
10,0.292,0.443306,0.816514,0.816603,0.816284,0.816375


[I 2025-03-28 19:57:47,499] Trial 91 finished with value: 0.8130318396769336 and parameters: {'learning_rate': 1.5775528862129258e-05, 'weight_decay': 0.0, 'warmup_steps': 28}. Best is trial 51 with value: 0.8221801222215643.


Trial 92 with params: {'learning_rate': 2.7062569698455856e-05, 'weight_decay': 0.0, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5631,0.476596,0.784404,0.78451,0.784121,0.784212
2,0.4074,0.448453,0.797018,0.797009,0.796845,0.796901
3,0.3524,0.437409,0.811927,0.812037,0.812116,0.811923
4,0.32,0.446181,0.808486,0.809244,0.80798,0.808141
5,0.2952,0.436029,0.817661,0.817721,0.817452,0.817534
6,0.2754,0.438646,0.817661,0.817683,0.817789,0.817649
7,0.2621,0.445885,0.81078,0.81133,0.811158,0.810774
8,0.2499,0.450792,0.818807,0.818741,0.818788,0.818761
9,0.2394,0.454841,0.813073,0.813626,0.813452,0.813067
10,0.2321,0.459208,0.821101,0.821041,0.821041,0.821041


[I 2025-03-28 20:03:42,756] Trial 92 finished with value: 0.8176411246568802 and parameters: {'learning_rate': 2.7062569698455856e-05, 'weight_decay': 0.0, 'warmup_steps': 33}. Best is trial 51 with value: 0.8221801222215643.


Trial 93 with params: {'learning_rate': 2.1207807932658264e-05, 'weight_decay': 0.0, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5871,0.49211,0.775229,0.775663,0.774775,0.774887
2,0.4311,0.456833,0.794725,0.794658,0.794719,0.794679
3,0.3767,0.443147,0.803899,0.803977,0.804065,0.803893
4,0.3446,0.447551,0.801606,0.80282,0.80097,0.801122
5,0.3209,0.440116,0.81422,0.814461,0.813905,0.814028
6,0.3014,0.436171,0.81422,0.814162,0.814242,0.814185
7,0.2884,0.438172,0.809633,0.810007,0.809948,0.809632
8,0.2759,0.442331,0.815367,0.815301,0.815326,0.815312
9,0.2657,0.443997,0.811927,0.812301,0.812242,0.811926
10,0.2586,0.447875,0.816514,0.816501,0.816368,0.816417


[I 2025-03-28 20:09:40,998] Trial 93 finished with value: 0.8199349469882402 and parameters: {'learning_rate': 2.1207807932658264e-05, 'weight_decay': 0.0, 'warmup_steps': 32}. Best is trial 51 with value: 0.8221801222215643.


Trial 94 with params: {'learning_rate': 1.9690163329695552e-05, 'weight_decay': 0.0, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5951,0.49916,0.768349,0.768868,0.76785,0.767953
2,0.4388,0.45923,0.795872,0.795836,0.795929,0.795845
3,0.3841,0.445647,0.803899,0.803977,0.804065,0.803893
4,0.3521,0.448586,0.797018,0.798031,0.796424,0.796569
5,0.3287,0.441891,0.81422,0.814377,0.813947,0.814055
6,0.3094,0.436532,0.815367,0.815319,0.81541,0.815338
7,0.2964,0.43749,0.808486,0.808914,0.808822,0.808484
8,0.2839,0.441123,0.813073,0.813009,0.813074,0.813032
9,0.2739,0.442405,0.81078,0.811209,0.811116,0.810778
10,0.2669,0.446125,0.81422,0.814206,0.814073,0.814122


[I 2025-03-28 20:13:36,801] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 2.1208717836239785e-05, 'weight_decay': 0.003, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5877,0.492297,0.774083,0.774459,0.773649,0.773759
2,0.4312,0.456868,0.794725,0.794658,0.794719,0.794679
3,0.3767,0.443168,0.803899,0.803977,0.804065,0.803893
4,0.3446,0.447544,0.801606,0.80282,0.80097,0.801122
5,0.3209,0.440108,0.81422,0.814461,0.813905,0.814028
6,0.3014,0.436185,0.813073,0.813025,0.813116,0.813044
7,0.2883,0.438197,0.81078,0.811209,0.811116,0.810778
8,0.2759,0.442336,0.815367,0.815301,0.815326,0.815312
9,0.2657,0.444023,0.81078,0.811102,0.811074,0.81078
10,0.2586,0.447838,0.815367,0.815336,0.815242,0.815279


[I 2025-03-28 20:17:32,534] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 3.417395370091366e-05, 'weight_decay': 0.002, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5413,0.468289,0.792431,0.792386,0.792298,0.792333
2,0.3854,0.440732,0.802752,0.80277,0.80256,0.802627
3,0.3292,0.434684,0.808486,0.808509,0.808611,0.808474
4,0.2963,0.448025,0.815367,0.815566,0.815073,0.81519
5,0.2707,0.434939,0.813073,0.813007,0.813031,0.813018


[I 2025-03-28 20:19:29,064] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 2.554508127002213e-05, 'weight_decay': 0.004, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5677,0.47944,0.780963,0.781097,0.780658,0.780753
2,0.4127,0.450978,0.795872,0.795817,0.795761,0.795785
3,0.3582,0.438557,0.81078,0.811102,0.811074,0.81078
4,0.3259,0.446329,0.809633,0.810324,0.809148,0.809308
5,0.3013,0.436866,0.817661,0.817669,0.817494,0.817555
6,0.2815,0.437699,0.815367,0.815348,0.815452,0.815347
7,0.2683,0.443353,0.81078,0.811464,0.8112,0.810768
8,0.2561,0.448222,0.816514,0.816452,0.816452,0.816452
9,0.2455,0.45177,0.811927,0.812416,0.812284,0.811923
10,0.2383,0.455874,0.817661,0.817607,0.817578,0.817591


[I 2025-03-28 20:25:21,408] Trial 97 finished with value: 0.817648799542307 and parameters: {'learning_rate': 2.554508127002213e-05, 'weight_decay': 0.004, 'warmup_steps': 28}. Best is trial 51 with value: 0.8221801222215643.


Trial 98 with params: {'learning_rate': 0.00036979530813873037, 'weight_decay': 0.009000000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3561,0.429779,0.809633,0.810541,0.810116,0.809608
2,0.2096,0.574433,0.803899,0.804788,0.803349,0.803506
3,0.1534,0.600963,0.806193,0.806301,0.805938,0.806033
4,0.1191,0.645975,0.795872,0.795843,0.795719,0.795764
5,0.095,0.798493,0.794725,0.794681,0.794592,0.794627
6,0.0779,0.915139,0.798165,0.79823,0.797929,0.798012
7,0.0651,0.904904,0.795872,0.795817,0.795761,0.795785
8,0.057,0.936535,0.795872,0.795803,0.795803,0.795803
9,0.0485,1.027001,0.800459,0.801351,0.800939,0.800432
10,0.0431,1.021904,0.788991,0.78892,0.78892,0.78892


[I 2025-03-28 20:29:17,562] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 5.9614195699865725e-05, 'weight_decay': 0.002, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4899,0.447069,0.807339,0.807304,0.807401,0.807314
2,0.332,0.430503,0.81078,0.81076,0.810863,0.81076
3,0.2742,0.438109,0.816514,0.81647,0.81641,0.816436
4,0.2409,0.469831,0.824541,0.824962,0.824166,0.824319
5,0.2155,0.461188,0.813073,0.813396,0.813368,0.813073
6,0.1982,0.489267,0.815367,0.816057,0.815789,0.815355
7,0.1843,0.512759,0.818807,0.818918,0.818999,0.818804
8,0.1738,0.534339,0.81422,0.814377,0.813947,0.814055
9,0.1636,0.548799,0.808486,0.810032,0.809116,0.808413
10,0.1566,0.555231,0.81078,0.810715,0.810779,0.810738


[I 2025-03-28 20:33:11,852] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 1.6789879334421116e-05, 'weight_decay': 0.005, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6109,0.518269,0.758028,0.758561,0.757504,0.757591
2,0.4564,0.464492,0.786697,0.786661,0.786752,0.786669
3,0.4,0.451349,0.795872,0.795836,0.795929,0.795845
4,0.3681,0.451606,0.795872,0.796378,0.795424,0.795561
5,0.3453,0.445475,0.803899,0.804163,0.803559,0.803682


[I 2025-03-28 20:35:10,139] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 3.8465371212942565e-05, 'weight_decay': 0.0, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5304,0.465492,0.792431,0.792418,0.792256,0.792311
2,0.3742,0.437415,0.805046,0.805066,0.804854,0.804922
3,0.3175,0.435095,0.81078,0.810731,0.810821,0.81075
4,0.2841,0.450706,0.818807,0.818972,0.818536,0.818646
5,0.2583,0.437386,0.81422,0.814162,0.814242,0.814185
6,0.239,0.454104,0.815367,0.815596,0.815621,0.815367
7,0.2257,0.471792,0.815367,0.815596,0.815621,0.815367
8,0.2141,0.476417,0.81422,0.814158,0.814158,0.814158
9,0.2035,0.486491,0.81422,0.815304,0.814747,0.814185
10,0.1964,0.490416,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 20:41:06,803] Trial 101 finished with value: 0.814184994212354 and parameters: {'learning_rate': 3.8465371212942565e-05, 'weight_decay': 0.0, 'warmup_steps': 35}. Best is trial 51 with value: 0.8221801222215643.


Trial 102 with params: {'learning_rate': 2.9414531729427016e-05, 'weight_decay': 0.0, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.553,0.472815,0.786697,0.786698,0.786499,0.786561
2,0.3991,0.445273,0.800459,0.800474,0.800265,0.800332
3,0.344,0.436113,0.808486,0.808632,0.808695,0.808484
4,0.3116,0.446433,0.816514,0.816969,0.816115,0.816266
5,0.2865,0.435293,0.817661,0.817631,0.817536,0.817574
6,0.2667,0.440527,0.815367,0.81539,0.815494,0.815355
7,0.2533,0.450143,0.809633,0.81012,0.80999,0.809629
8,0.2413,0.455028,0.818807,0.818741,0.818788,0.818761
9,0.2307,0.460056,0.81422,0.814494,0.814494,0.81422
10,0.2234,0.464775,0.815367,0.815301,0.815326,0.815312


[I 2025-03-28 20:45:01,699] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 2.5418705508557885e-05, 'weight_decay': 0.0, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5682,0.479704,0.780963,0.781097,0.780658,0.780753
2,0.4132,0.451122,0.794725,0.794661,0.794635,0.794647
3,0.3587,0.438666,0.81078,0.811102,0.811074,0.81078
4,0.3264,0.446326,0.809633,0.810324,0.809148,0.809308
5,0.3019,0.436971,0.817661,0.817669,0.817494,0.817555
6,0.2821,0.437582,0.815367,0.815348,0.815452,0.815347
7,0.2688,0.443166,0.81078,0.811464,0.8112,0.810768
8,0.2566,0.44799,0.816514,0.816452,0.816452,0.816452
9,0.2461,0.451527,0.811927,0.812416,0.812284,0.811923
10,0.2388,0.455611,0.817661,0.817607,0.817578,0.817591


[I 2025-03-28 20:50:53,790] Trial 103 finished with value: 0.817648799542307 and parameters: {'learning_rate': 2.5418705508557885e-05, 'weight_decay': 0.0, 'warmup_steps': 28}. Best is trial 51 with value: 0.8221801222215643.


Trial 104 with params: {'learning_rate': 2.2451250620334333e-05, 'weight_decay': 0.001, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5808,0.487619,0.78211,0.782283,0.781784,0.781885
2,0.4253,0.455184,0.793578,0.793507,0.793551,0.793525
3,0.371,0.441533,0.807339,0.807524,0.807569,0.807338
4,0.3389,0.447018,0.802752,0.803715,0.802181,0.802336
5,0.3149,0.438973,0.815367,0.815566,0.815073,0.81519
6,0.2953,0.436292,0.81422,0.814185,0.814284,0.814196
7,0.2822,0.43929,0.81078,0.811209,0.811116,0.810778
8,0.2698,0.443735,0.815367,0.815301,0.815326,0.815312
9,0.2595,0.445885,0.81078,0.811102,0.811074,0.81078
10,0.2523,0.449736,0.815367,0.815336,0.815242,0.815279


[I 2025-03-28 20:54:48,832] Trial 104 pruned. 


Trial 105 with params: {'learning_rate': 8.347446557533028e-05, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4537,0.440061,0.806193,0.806215,0.806317,0.80618
2,0.3019,0.441227,0.816514,0.816546,0.816326,0.816397
3,0.2448,0.446615,0.818807,0.818741,0.818788,0.818761
4,0.212,0.48852,0.823394,0.824852,0.822746,0.822944
5,0.1877,0.488432,0.813073,0.813626,0.813452,0.813067
6,0.171,0.519623,0.81078,0.811464,0.8112,0.810768
7,0.1561,0.562707,0.813073,0.813054,0.813158,0.813053
8,0.1466,0.587983,0.815367,0.816155,0.814863,0.815034
9,0.1361,0.611294,0.808486,0.809646,0.809032,0.808444
10,0.1286,0.620272,0.81422,0.814185,0.814284,0.814196


[I 2025-03-28 20:58:43,398] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 1.508409012592342e-05, 'weight_decay': 0.002, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6197,0.533125,0.75344,0.753846,0.752957,0.753043
2,0.4692,0.468693,0.78211,0.782158,0.782247,0.7821
3,0.4108,0.455293,0.795872,0.795812,0.795887,0.795833
4,0.379,0.454265,0.791284,0.791564,0.790919,0.791037
5,0.3566,0.448068,0.799312,0.799662,0.798929,0.799058


[I 2025-03-28 21:00:40,497] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 0.00010121968952843504, 'weight_decay': 0.006, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4461,0.432898,0.805046,0.806684,0.805696,0.804963
2,0.2869,0.452148,0.809633,0.810472,0.809106,0.809271
3,0.2295,0.458867,0.824541,0.824564,0.824672,0.82453
4,0.1966,0.504587,0.824541,0.826332,0.82383,0.82403
5,0.1732,0.516701,0.81078,0.811209,0.811116,0.810778
6,0.1559,0.555492,0.805046,0.806111,0.80557,0.805009
7,0.1408,0.613808,0.809633,0.809743,0.809822,0.809629
8,0.1311,0.629267,0.818807,0.81969,0.818283,0.818463
9,0.1201,0.669681,0.800459,0.802522,0.801191,0.800332
10,0.1129,0.674922,0.808486,0.808467,0.808569,0.808466


[I 2025-03-28 21:04:35,375] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 1.0134376014913452e-05, 'weight_decay': 0.0, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6467,0.591629,0.715596,0.715898,0.71508,0.715111
2,0.5273,0.493249,0.78211,0.783474,0.78271,0.782037
3,0.453,0.470218,0.786697,0.786661,0.786752,0.786669
4,0.4194,0.465414,0.784404,0.78451,0.784121,0.784212
5,0.3978,0.459419,0.788991,0.789105,0.788709,0.788803
6,0.3793,0.449806,0.798165,0.798106,0.798181,0.798127
7,0.368,0.44585,0.805046,0.805094,0.805191,0.805037
8,0.3564,0.444096,0.806193,0.806215,0.806317,0.80618
9,0.3479,0.441973,0.808486,0.808564,0.808653,0.80848
10,0.3413,0.444519,0.805046,0.805024,0.804896,0.804943


[I 2025-03-28 21:08:41,880] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 2.4585844678745737e-05, 'weight_decay': 0.001, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5728,0.481803,0.783257,0.783562,0.782868,0.782983
2,0.4166,0.452319,0.793578,0.793507,0.793551,0.793525
3,0.362,0.439284,0.809633,0.810007,0.809948,0.809632
4,0.3298,0.446339,0.806193,0.80694,0.805685,0.805843
5,0.3054,0.437367,0.816514,0.816546,0.816326,0.816397
6,0.2856,0.437052,0.815367,0.815348,0.815452,0.815347
7,0.2724,0.441903,0.808486,0.809168,0.808906,0.808474
8,0.2601,0.446635,0.813073,0.813009,0.813074,0.813032
9,0.2496,0.449817,0.81078,0.81133,0.811158,0.810774
10,0.2424,0.453799,0.817661,0.817607,0.817578,0.817591


[I 2025-03-28 21:14:36,362] Trial 109 finished with value: 0.8187920875420875 and parameters: {'learning_rate': 2.4585844678745737e-05, 'weight_decay': 0.001, 'warmup_steps': 34}. Best is trial 51 with value: 0.8221801222215643.


Trial 110 with params: {'learning_rate': 5.596988913048012e-05, 'weight_decay': 0.003, 'warmup_steps': 11}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4905,0.449712,0.798165,0.798097,0.798097,0.798097
2,0.3375,0.430156,0.811927,0.811891,0.81199,0.811902
3,0.2803,0.437697,0.816514,0.81647,0.81641,0.816436
4,0.2469,0.465516,0.819954,0.820162,0.819662,0.819781
5,0.2212,0.455572,0.81078,0.811102,0.811074,0.81078


[I 2025-03-28 21:16:33,270] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 2.5991571071450347e-05, 'weight_decay': 0.0, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.567,0.478673,0.783257,0.783395,0.782952,0.783049
2,0.4112,0.450264,0.797018,0.796976,0.796887,0.796922
3,0.3565,0.438159,0.811927,0.8122,0.8122,0.811927
4,0.3241,0.446202,0.809633,0.810324,0.809148,0.809308
5,0.2995,0.436584,0.815367,0.815374,0.8152,0.81526
6,0.2797,0.437929,0.817661,0.817683,0.817789,0.817649
7,0.2664,0.444074,0.809633,0.810247,0.810032,0.809624
8,0.2542,0.44893,0.817661,0.817595,0.81762,0.817607
9,0.2436,0.452684,0.811927,0.812416,0.812284,0.811923
10,0.2364,0.45687,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 21:22:25,659] Trial 111 finished with value: 0.8199349469882402 and parameters: {'learning_rate': 2.5991571071450347e-05, 'weight_decay': 0.0, 'warmup_steps': 33}. Best is trial 51 with value: 0.8221801222215643.


Trial 112 with params: {'learning_rate': 2.784460988800823e-05, 'weight_decay': 0.0, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5603,0.475297,0.786697,0.786747,0.786457,0.786536
2,0.4046,0.4472,0.799312,0.799271,0.799181,0.799217
3,0.3495,0.43689,0.809633,0.809818,0.809864,0.809632
4,0.3171,0.44617,0.81078,0.811406,0.810316,0.810474
5,0.2922,0.435647,0.817661,0.817631,0.817536,0.817574
6,0.2724,0.439192,0.816514,0.816563,0.816662,0.816505
7,0.2591,0.447198,0.809633,0.81012,0.80999,0.809629
8,0.247,0.452121,0.817661,0.817595,0.81762,0.817607
9,0.2363,0.456482,0.813073,0.813626,0.813452,0.813067
10,0.2291,0.460976,0.821101,0.821041,0.821041,0.821041


[I 2025-03-28 21:28:20,289] Trial 112 finished with value: 0.8164896275602275 and parameters: {'learning_rate': 2.784460988800823e-05, 'weight_decay': 0.0, 'warmup_steps': 33}. Best is trial 51 with value: 0.8221801222215643.


Trial 113 with params: {'learning_rate': 0.00018984118670607364, 'weight_decay': 0.002, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4024,0.413423,0.802752,0.804598,0.803444,0.802648
2,0.2415,0.486022,0.806193,0.806462,0.805854,0.805978
3,0.1862,0.539477,0.809633,0.810388,0.810074,0.809617
4,0.1534,0.540024,0.815367,0.816011,0.814905,0.815069
5,0.1301,0.626151,0.803899,0.804443,0.804275,0.803893
6,0.1116,0.66112,0.800459,0.801064,0.800855,0.800449
7,0.0961,0.790847,0.800459,0.801064,0.800855,0.800449
8,0.0859,0.769886,0.801606,0.801647,0.801391,0.801467
9,0.0767,0.849715,0.797018,0.799428,0.797813,0.796851
10,0.0702,0.833328,0.802752,0.803122,0.803065,0.802751


[I 2025-03-28 21:32:13,782] Trial 113 pruned. 


Trial 114 with params: {'learning_rate': 4.783249558891983e-05, 'weight_decay': 0.005, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5096,0.456938,0.792431,0.792518,0.792172,0.79226
2,0.3531,0.432165,0.808486,0.808438,0.808527,0.808456
3,0.2958,0.437943,0.817661,0.817607,0.817578,0.817591
4,0.2621,0.457993,0.819954,0.820017,0.819746,0.819829
5,0.2362,0.446272,0.813073,0.813096,0.8132,0.813061
6,0.218,0.469829,0.813073,0.813396,0.813368,0.813073
7,0.2045,0.490826,0.811927,0.812301,0.812242,0.811926
8,0.1933,0.503114,0.816514,0.816501,0.816368,0.816417
9,0.183,0.51452,0.809633,0.810708,0.810158,0.809597
10,0.176,0.520582,0.815367,0.815319,0.81541,0.815338


[I 2025-03-28 21:36:07,805] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 3.110981689709106e-05, 'weight_decay': 0.0, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5494,0.470924,0.787844,0.787869,0.787625,0.787696
2,0.3941,0.443486,0.798165,0.79823,0.797929,0.798012
3,0.3385,0.435365,0.809633,0.809818,0.809864,0.809632
4,0.3059,0.446667,0.81422,0.814558,0.813863,0.814
5,0.2806,0.434825,0.81422,0.814158,0.814158,0.814158
6,0.2608,0.44231,0.816514,0.816625,0.816705,0.81651
7,0.2474,0.453602,0.81422,0.814596,0.814536,0.814219
8,0.2355,0.458381,0.817661,0.817597,0.817662,0.81762
9,0.2248,0.464298,0.813073,0.813396,0.813368,0.813073
10,0.2176,0.469251,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 21:42:03,911] Trial 115 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 3.110981689709106e-05, 'weight_decay': 0.0, 'warmup_steps': 32}. Best is trial 51 with value: 0.8221801222215643.


Trial 116 with params: {'learning_rate': 3.568543861233653e-05, 'weight_decay': 0.002, 'warmup_steps': 36}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5375,0.467358,0.791284,0.791227,0.791172,0.791195
2,0.3814,0.439503,0.805046,0.805066,0.804854,0.804922
3,0.325,0.434675,0.809633,0.809633,0.809737,0.809617
4,0.2918,0.44898,0.813073,0.813269,0.812779,0.812894
5,0.2661,0.435561,0.813073,0.813009,0.813074,0.813032


[I 2025-03-28 21:44:03,602] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 3.540275970611972e-05, 'weight_decay': 0.004, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.536,0.467445,0.791284,0.791227,0.791172,0.791195
2,0.3818,0.43982,0.805046,0.805066,0.804854,0.804922
3,0.3257,0.434792,0.808486,0.808509,0.808611,0.808474
4,0.2927,0.448866,0.813073,0.813269,0.812779,0.812894
5,0.267,0.435738,0.811927,0.811859,0.811905,0.811878
6,0.2474,0.448993,0.817661,0.817808,0.817873,0.817658
7,0.2341,0.463959,0.816514,0.816789,0.816789,0.816514
8,0.2224,0.468423,0.816514,0.816452,0.816452,0.816452
9,0.2118,0.477033,0.81422,0.814981,0.814663,0.814205
10,0.2046,0.481232,0.819954,0.819926,0.81983,0.819869


[I 2025-03-28 21:50:00,407] Trial 117 finished with value: 0.8176065796760941 and parameters: {'learning_rate': 3.540275970611972e-05, 'weight_decay': 0.004, 'warmup_steps': 26}. Best is trial 51 with value: 0.8221801222215643.


Trial 118 with params: {'learning_rate': 3.6664026396879975e-05, 'weight_decay': 0.0, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5346,0.466775,0.793578,0.793522,0.793466,0.79349
2,0.3788,0.438809,0.805046,0.805066,0.804854,0.804922
3,0.3223,0.4348,0.81078,0.810731,0.810821,0.81075
4,0.2891,0.449569,0.815367,0.815566,0.815073,0.81519
5,0.2633,0.43622,0.81422,0.814153,0.8142,0.814172
6,0.2438,0.451206,0.816514,0.8167,0.816747,0.816513
7,0.2305,0.467198,0.815367,0.815596,0.815621,0.815367
8,0.2188,0.47165,0.816514,0.816452,0.816452,0.816452
9,0.2083,0.480996,0.813073,0.81407,0.813579,0.813044
10,0.2011,0.484998,0.819954,0.819901,0.819872,0.819886


[I 2025-03-28 21:55:59,123] Trial 118 finished with value: 0.8164664530353019 and parameters: {'learning_rate': 3.6664026396879975e-05, 'weight_decay': 0.0, 'warmup_steps': 34}. Best is trial 51 with value: 0.8221801222215643.


Trial 119 with params: {'learning_rate': 8.728609309231897e-05, 'weight_decay': 0.01, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4541,0.437882,0.808486,0.808807,0.80878,0.808486
2,0.2984,0.440995,0.81422,0.814307,0.813989,0.814079
3,0.241,0.449521,0.819954,0.819889,0.819915,0.819901
4,0.208,0.492893,0.826835,0.828644,0.826124,0.82633
5,0.1843,0.494556,0.808486,0.808807,0.80878,0.808486


[I 2025-03-28 21:57:57,920] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 3.6086783836865125e-05, 'weight_decay': 0.004, 'warmup_steps': 37}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5367,0.467097,0.791284,0.791227,0.791172,0.791195
2,0.3803,0.439158,0.803899,0.803943,0.803686,0.803763
3,0.3239,0.434668,0.81078,0.81076,0.810863,0.81076
4,0.2907,0.449132,0.81422,0.814461,0.813905,0.814028
5,0.265,0.435782,0.813073,0.813009,0.813074,0.813032
6,0.2454,0.450149,0.817661,0.817808,0.817873,0.817658
7,0.2321,0.46559,0.816514,0.816789,0.816789,0.816514
8,0.2204,0.470093,0.816514,0.816452,0.816452,0.816452
9,0.2098,0.4792,0.813073,0.813908,0.813537,0.813053
10,0.2026,0.483287,0.818807,0.818765,0.818704,0.81873


[I 2025-03-28 22:03:55,324] Trial 120 finished with value: 0.8176065796760941 and parameters: {'learning_rate': 3.6086783836865125e-05, 'weight_decay': 0.004, 'warmup_steps': 37}. Best is trial 51 with value: 0.8221801222215643.


Trial 121 with params: {'learning_rate': 6.745912212460918e-05, 'weight_decay': 0.0, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4778,0.444211,0.801606,0.801557,0.801644,0.801574
2,0.3205,0.431584,0.81422,0.814185,0.814284,0.814196
3,0.2628,0.439411,0.816514,0.816603,0.816284,0.816375
4,0.2297,0.477796,0.822248,0.822918,0.821788,0.821961
5,0.2049,0.472604,0.81078,0.811209,0.811116,0.810778


[I 2025-03-28 22:05:55,817] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 3.6369044045644285e-05, 'weight_decay': 0.0, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5339,0.466882,0.793578,0.793522,0.793466,0.79349
2,0.3793,0.439036,0.805046,0.805066,0.804854,0.804922
3,0.323,0.434829,0.81078,0.810731,0.810821,0.81075
4,0.2899,0.449443,0.81422,0.814461,0.813905,0.814028
5,0.2641,0.436229,0.813073,0.813009,0.813074,0.813032
6,0.2447,0.450657,0.817661,0.817808,0.817873,0.817658
7,0.2313,0.466508,0.815367,0.815596,0.815621,0.815367
8,0.2197,0.470887,0.816514,0.816452,0.816452,0.816452
9,0.2091,0.479994,0.811927,0.812839,0.812411,0.811902
10,0.2019,0.484045,0.819954,0.819901,0.819872,0.819886


[I 2025-03-28 22:11:50,433] Trial 122 finished with value: 0.8176065796760941 and parameters: {'learning_rate': 3.6369044045644285e-05, 'weight_decay': 0.0, 'warmup_steps': 28}. Best is trial 51 with value: 0.8221801222215643.


Trial 123 with params: {'learning_rate': 3.507714164620168e-05, 'weight_decay': 0.0, 'warmup_steps': 37}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5394,0.467765,0.791284,0.791227,0.791172,0.791195
2,0.383,0.439981,0.802752,0.80277,0.80256,0.802627
3,0.3267,0.434591,0.809633,0.809633,0.809737,0.809617
4,0.2936,0.44853,0.813073,0.813269,0.812779,0.812894
5,0.2679,0.435216,0.811927,0.811859,0.811905,0.811878


[I 2025-03-28 22:13:48,561] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 2.103136890071569e-05, 'weight_decay': 0.001, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5881,0.492888,0.772936,0.773361,0.77248,0.77259
2,0.432,0.45714,0.793578,0.793519,0.793593,0.793539
3,0.3775,0.443415,0.802752,0.802862,0.802938,0.802748
4,0.3455,0.447673,0.800459,0.801577,0.799844,0.799995
5,0.3218,0.440283,0.813073,0.813358,0.812737,0.812866
6,0.3023,0.436236,0.81422,0.814162,0.814242,0.814185
7,0.2893,0.438108,0.809633,0.810007,0.809948,0.809632
8,0.2768,0.442203,0.815367,0.815301,0.815326,0.815312
9,0.2667,0.44377,0.81078,0.811102,0.811074,0.81078
10,0.2596,0.447597,0.815367,0.815336,0.815242,0.815279


[I 2025-03-28 22:17:42,231] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 3.1133499157296776e-05, 'weight_decay': 0.006, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5519,0.470887,0.787844,0.787869,0.787625,0.787696
2,0.3944,0.443398,0.799312,0.799351,0.799097,0.799172
3,0.3385,0.435269,0.809633,0.809818,0.809864,0.809632
4,0.3058,0.44648,0.81422,0.814558,0.813863,0.814
5,0.2805,0.434486,0.81422,0.814158,0.814158,0.814158
6,0.2607,0.44204,0.816514,0.816625,0.816705,0.81651
7,0.2473,0.453321,0.81422,0.814596,0.814536,0.814219
8,0.2354,0.458171,0.817661,0.817597,0.817662,0.81762
9,0.2247,0.463904,0.813073,0.813396,0.813368,0.813073
10,0.2174,0.469145,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 22:23:34,561] Trial 125 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 3.1133499157296776e-05, 'weight_decay': 0.006, 'warmup_steps': 43}. Best is trial 51 with value: 0.8221801222215643.


Trial 126 with params: {'learning_rate': 2.478204720478304e-05, 'weight_decay': 0.006, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5739,0.48156,0.783257,0.783562,0.782868,0.782983
2,0.4162,0.45202,0.793578,0.793507,0.793551,0.793525
3,0.3613,0.439044,0.81078,0.811102,0.811074,0.81078
4,0.3289,0.446238,0.806193,0.80694,0.805685,0.805843
5,0.3045,0.437131,0.816514,0.816546,0.816326,0.816397
6,0.2847,0.437147,0.81422,0.81422,0.814326,0.814205
7,0.2714,0.442141,0.808486,0.809168,0.808906,0.808474
8,0.2591,0.446949,0.813073,0.813007,0.813031,0.813018
9,0.2486,0.450106,0.811927,0.812416,0.812284,0.811923
10,0.2414,0.454144,0.818807,0.818746,0.818746,0.818746


[I 2025-03-28 22:29:27,560] Trial 126 finished with value: 0.8187920875420875 and parameters: {'learning_rate': 2.478204720478304e-05, 'weight_decay': 0.006, 'warmup_steps': 43}. Best is trial 51 with value: 0.8221801222215643.


Trial 127 with params: {'learning_rate': 4.9384195689829555e-05, 'weight_decay': 0.007, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5094,0.455836,0.793578,0.793586,0.793382,0.793447
2,0.3504,0.431778,0.807339,0.807304,0.807401,0.807314
3,0.2927,0.438199,0.818807,0.818741,0.818788,0.818761
4,0.259,0.459368,0.818807,0.8189,0.818578,0.81867
5,0.2331,0.447995,0.813073,0.813096,0.8132,0.813061
6,0.215,0.472983,0.813073,0.813504,0.81341,0.813071
7,0.2015,0.494087,0.81422,0.814406,0.814452,0.814219
8,0.1903,0.507903,0.813073,0.813078,0.812905,0.812965
9,0.18,0.520159,0.808486,0.809646,0.809032,0.808444
10,0.173,0.526531,0.816514,0.816478,0.816578,0.81649


[I 2025-03-28 22:33:22,701] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 1.675581520520653e-05, 'weight_decay': 0.008, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.611,0.518525,0.758028,0.758561,0.757504,0.757591
2,0.4567,0.464551,0.78555,0.785531,0.785626,0.785528
3,0.4002,0.451412,0.795872,0.795836,0.795929,0.795845
4,0.3683,0.451676,0.795872,0.796378,0.795424,0.795561
5,0.3455,0.445533,0.803899,0.804163,0.803559,0.803682


[I 2025-03-28 22:35:21,377] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 2.324199421340565e-05, 'weight_decay': 0.005, 'warmup_steps': 36}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5788,0.48547,0.780963,0.781174,0.780616,0.780721
2,0.4222,0.454197,0.793578,0.793507,0.793551,0.793525
3,0.3676,0.440611,0.808486,0.808713,0.808737,0.808486
4,0.3354,0.446603,0.803899,0.804788,0.803349,0.803506
5,0.3113,0.438323,0.815367,0.815489,0.815115,0.815215
6,0.2916,0.436427,0.815367,0.815348,0.815452,0.815347
7,0.2784,0.440158,0.81078,0.811209,0.811116,0.810778
8,0.266,0.444715,0.813073,0.813007,0.813031,0.813018
9,0.2557,0.44723,0.81078,0.811102,0.811074,0.81078
10,0.2485,0.4511,0.817661,0.817631,0.817536,0.817574


[I 2025-03-28 22:41:17,619] Trial 129 finished with value: 0.8176411246568802 and parameters: {'learning_rate': 2.324199421340565e-05, 'weight_decay': 0.005, 'warmup_steps': 36}. Best is trial 51 with value: 0.8221801222215643.


Trial 130 with params: {'learning_rate': 3.2574657703616324e-05, 'weight_decay': 0.006, 'warmup_steps': 41}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5472,0.469488,0.788991,0.788957,0.788836,0.78888
2,0.3901,0.442151,0.801606,0.8016,0.801433,0.80149
3,0.334,0.434867,0.81078,0.810858,0.810948,0.810774
4,0.3012,0.447096,0.815367,0.815762,0.814989,0.815133
5,0.2757,0.434517,0.81422,0.814153,0.8142,0.814172
6,0.256,0.444036,0.815367,0.815445,0.815536,0.815361
7,0.2426,0.456621,0.815367,0.815691,0.815663,0.815367
8,0.2307,0.461342,0.817661,0.817595,0.81762,0.817607
9,0.2201,0.467998,0.813073,0.813626,0.813452,0.813067
10,0.2128,0.473104,0.818807,0.818741,0.818788,0.818761


[I 2025-03-28 22:47:14,269] Trial 130 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 3.2574657703616324e-05, 'weight_decay': 0.006, 'warmup_steps': 41}. Best is trial 51 with value: 0.8221801222215643.


Trial 131 with params: {'learning_rate': 3.015518420993117e-05, 'weight_decay': 0.005, 'warmup_steps': 39}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.554,0.471978,0.786697,0.786698,0.786499,0.786561
2,0.3973,0.444352,0.800459,0.800474,0.800265,0.800332
3,0.3416,0.435671,0.808486,0.808632,0.808695,0.808484
4,0.309,0.446306,0.815367,0.815762,0.814989,0.815133
5,0.2838,0.43478,0.816514,0.81647,0.81641,0.816436
6,0.264,0.44101,0.815367,0.815445,0.815536,0.815361
7,0.2506,0.451416,0.811927,0.812416,0.812284,0.811923
8,0.2387,0.456255,0.818807,0.818741,0.818788,0.818761
9,0.228,0.46159,0.815367,0.815596,0.815621,0.815367
10,0.2207,0.466621,0.817661,0.817595,0.81762,0.817607


[I 2025-03-28 22:53:11,695] Trial 131 finished with value: 0.8199008365355143 and parameters: {'learning_rate': 3.015518420993117e-05, 'weight_decay': 0.005, 'warmup_steps': 39}. Best is trial 51 with value: 0.8221801222215643.


Trial 132 with params: {'learning_rate': 3.205322751680535e-05, 'weight_decay': 0.008, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5471,0.46995,0.788991,0.788957,0.788836,0.78888
2,0.3914,0.442609,0.801606,0.8016,0.801433,0.80149
3,0.3356,0.435035,0.809633,0.809743,0.809822,0.809629
4,0.3028,0.447013,0.81422,0.814558,0.813863,0.814
5,0.2774,0.434711,0.813073,0.813007,0.813031,0.813018
6,0.2577,0.443486,0.816514,0.816625,0.816705,0.81651
7,0.2443,0.455689,0.815367,0.815691,0.815663,0.815367
8,0.2324,0.460342,0.817661,0.817595,0.81762,0.817607
9,0.2218,0.466852,0.813073,0.813504,0.81341,0.813071
10,0.2145,0.471754,0.819954,0.819889,0.819915,0.819901


[I 2025-03-28 22:59:08,179] Trial 132 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 3.205322751680535e-05, 'weight_decay': 0.008, 'warmup_steps': 34}. Best is trial 51 with value: 0.8221801222215643.


Trial 133 with params: {'learning_rate': 2.8472404006122218e-05, 'weight_decay': 0.005, 'warmup_steps': 40}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5597,0.47439,0.786697,0.786747,0.786457,0.786536
2,0.4028,0.446279,0.798165,0.798138,0.798013,0.798059
3,0.3473,0.436461,0.81078,0.810926,0.81099,0.810778
4,0.3149,0.446089,0.81422,0.814793,0.813779,0.813937
5,0.2899,0.435286,0.817661,0.817631,0.817536,0.817574
6,0.27,0.439543,0.816514,0.816563,0.816662,0.816505
7,0.2567,0.44825,0.81078,0.81133,0.811158,0.810774
8,0.2446,0.45315,0.817661,0.817595,0.81762,0.817607
9,0.234,0.457704,0.815367,0.815799,0.815705,0.815365
10,0.2267,0.46246,0.817661,0.817595,0.81762,0.817607


[I 2025-03-28 23:05:02,710] Trial 133 finished with value: 0.817620015390383 and parameters: {'learning_rate': 2.8472404006122218e-05, 'weight_decay': 0.005, 'warmup_steps': 40}. Best is trial 51 with value: 0.8221801222215643.


Trial 134 with params: {'learning_rate': 2.3868971651215053e-05, 'weight_decay': 0.008, 'warmup_steps': 30}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5749,0.483439,0.780963,0.781174,0.780616,0.780721
2,0.4194,0.453342,0.793578,0.793507,0.793551,0.793525
3,0.3649,0.439996,0.808486,0.808713,0.808737,0.808486
4,0.3327,0.446572,0.806193,0.80694,0.805685,0.805843
5,0.3085,0.437864,0.81422,0.814307,0.813989,0.814079
6,0.2888,0.436699,0.817661,0.817641,0.817746,0.817641
7,0.2756,0.440943,0.809633,0.81012,0.80999,0.809629
8,0.2633,0.445615,0.811927,0.811859,0.811905,0.811878
9,0.2528,0.448429,0.811927,0.812416,0.812284,0.811923
10,0.2456,0.452311,0.816514,0.81647,0.81641,0.816436


[I 2025-03-28 23:11:00,047] Trial 134 finished with value: 0.8164983164983165 and parameters: {'learning_rate': 2.3868971651215053e-05, 'weight_decay': 0.008, 'warmup_steps': 30}. Best is trial 51 with value: 0.8221801222215643.


Trial 135 with params: {'learning_rate': 3.586785103475273e-05, 'weight_decay': 0.008, 'warmup_steps': 36}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5371,0.467289,0.791284,0.791227,0.791172,0.791195
2,0.3809,0.439347,0.803899,0.803943,0.803686,0.803763
3,0.3245,0.434667,0.809633,0.809633,0.809737,0.809617
4,0.2913,0.449042,0.81422,0.814461,0.813905,0.814028
5,0.2656,0.435645,0.813073,0.813009,0.813074,0.813032


[I 2025-03-28 23:12:58,824] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 4.1541758803203886e-05, 'weight_decay': 0.006, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5251,0.462584,0.793578,0.793586,0.793382,0.793447
2,0.3671,0.435251,0.808486,0.808487,0.808316,0.808375
3,0.3098,0.435461,0.813073,0.813007,0.813031,0.813018
4,0.2762,0.452654,0.817661,0.817864,0.817368,0.817486
5,0.2503,0.439476,0.81422,0.81422,0.814326,0.814205
6,0.2313,0.458634,0.815367,0.815596,0.815621,0.815367
7,0.218,0.47825,0.811927,0.812112,0.812158,0.811926
8,0.2065,0.48421,0.81422,0.814206,0.814073,0.814122
9,0.1961,0.495094,0.808486,0.809646,0.809032,0.808444
10,0.189,0.499519,0.815367,0.815303,0.815368,0.815326


[I 2025-03-28 23:16:56,525] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 3.225367657608565e-05, 'weight_decay': 0.007, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5486,0.469761,0.788991,0.788957,0.788836,0.78888
2,0.3911,0.442409,0.801606,0.8016,0.801433,0.80149
3,0.335,0.434901,0.81078,0.810926,0.81099,0.810778
4,0.3022,0.446904,0.815367,0.815762,0.814989,0.815133


[W 2025-03-28 23:18:37,897] Trial 137 failed with parameters: {'learning_rate': 3.225367657608565e-05, 'weight_decay': 0.007, 'warmup_steps': 43} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 250, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2241, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2548, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3698, in training_step
    loss = self.compute_loss(model,

KeyboardInterrupt: 

In [15]:
# Print the result of the hyperparameter search run above.
# NOTE(review): in the saved run this cell fails with a NameError because
# `best_trial` was never bound -- the search logged above ended with a
# KeyboardInterrupt. Presumably `best_trial` is assigned from that search's
# return value; confirm and re-run the search to completion before this cell.
print(best_trial)

NameError: name 'best_trial' is not defined

In [34]:
# Called before the distillation search is configured in the cells below.
# NOTE(review): presumably re-seeds the random number generators so the second
# search is reproducible -- confirm against the implementation in `base`.
base.reset_seed()

In [35]:
# Training arguments for the distillation hyperparameter search. Results and
# logs are written to per-dataset directories keyed by DATASET.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill_hp-search",
    logging_dir=f"~/logs/{DATASET}/bert-distill_hp-search",
    # NOTE(review): presumably disabled so extra dataset columns (e.g. the
    # stored teacher logits) reach the trainer -- confirm in base.DistilTrainer.
    remove_unused_columns=False,
    epochs=num_epochs,
    batch_size=batch_size,
)

In [36]:
def hp_space(trial):
    """Sample one hyperparameter configuration for a distillation trial.

    Args:
        trial: Optuna-style trial object; must provide ``suggest_float``,
            ``suggest_int`` and a ``number`` attribute.

    Returns:
        dict with the keys ``learning_rate``, ``weight_decay``,
        ``warmup_steps``, ``lambda_param`` and ``temperature`` (in that order).

    Side effects:
        Prints the trial number together with the sampled parameters.

    Notes:
        The upper bound for ``warmup_steps`` is the notebook-level variable
        ``warm_up``, which is defined outside this cell.
        ``lambda_param`` and ``temperature`` are presumably consumed by
        ``base.DistilTrainer`` for the distillation loss -- TODO confirm.
    """
    # Suggestions are requested in the same order as the keys of the result.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, warm_up)
    lambda_param = trial.suggest_float("lambda_param", 0, 1, step=.1)
    temperature = trial.suggest_float("temperature", 2, 7, step=.5)

    params = dict(
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        lambda_param=lambda_param,
        temperature=temperature,
    )
    print(f"Trial {trial.number} with params: {params}")
    return params

In [37]:
# Hyperband-based early stopping of unpromising trials. The resource bounds
# min_r / max_r are notebook-level variables set in earlier cells.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)

# TPE sampler with a fixed seed so the sequence of suggested configurations is
# repeatable; multivariate=True is passed through to Optuna unchanged.
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)



In [38]:
# Trainer used for the distillation hyperparameter search.
# `train` / `eval` are the tokenized datasets built in an earlier cell (note
# that `eval` shadows the Python builtin of the same name).
trainer = base.DistilTrainer(
    # A factory instead of a fixed model: the captured logs show a freshly
    # initialised BertForSequenceClassification at the start of each trial.
    # The lambda is kept so get_Bert is looked up when the factory is called.
    model_init=lambda: get_Bert(),
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Run the Optuna-backed search over hp_space and keep the best run.
best_trial2 = trainer.hyperparameter_search(
    backend="optuna",
    hp_space=hp_space,
    # Objective: maximise the evaluation F1 reported by compute_metrics.
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    n_trials=150,
    # Study configuration; the log below reports an in-memory study created
    # under this name.
    study_name="Distilation",
    sampler=sampler,
    pruner=pruner,
)

[I 2025-03-29 00:46:31,583] A new study created in memory with name: Distilation


Trial 0 with params: {'learning_rate': 4.3284502212938785e-05, 'weight_decay': 0.01, 'warmup_steps': 32, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6426,1.794711,0.795872,0.795997,0.795592,0.79569
2,1.6836,1.584951,0.795872,0.796962,0.795256,0.795397
3,1.3071,1.489271,0.809633,0.809575,0.809653,0.809597
4,1.0986,1.50698,0.811927,0.812163,0.811611,0.811732
5,0.9574,1.476046,0.817661,0.817612,0.817704,0.817632
6,0.867,1.50515,0.815367,0.815445,0.815536,0.815361
7,0.7983,1.563696,0.81422,0.814494,0.814494,0.81422
8,0.7488,1.550323,0.81422,0.814206,0.814073,0.814122
9,0.7052,1.591043,0.819954,0.82028,0.820251,0.819954
10,0.6731,1.564754,0.821101,0.821035,0.821083,0.821055


[I 2025-03-29 00:50:40,600] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 1.8408992080552506e-05, 'weight_decay': 0.0, 'warmup_steps': 38, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0805,2.287992,0.75344,0.753954,0.752915,0.752996
2,2.3097,1.842736,0.790138,0.79036,0.790383,0.790137
3,1.9007,1.710683,0.78555,0.785693,0.785247,0.785345
4,1.6658,1.638088,0.797018,0.79747,0.796592,0.796727
5,1.5041,1.580291,0.800459,0.801111,0.799971,0.800118


[I 2025-03-29 00:52:43,528] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 1.0838581269344744e-05, 'weight_decay': 0.01, 'warmup_steps': 36, 'lambda_param': 0.2, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2223,2.680268,0.699541,0.704751,0.697925,0.69645
2,2.7628,2.163054,0.774083,0.775514,0.774701,0.773997
3,2.3159,1.913745,0.779817,0.779753,0.7797,0.779723
4,2.0535,1.810596,0.78211,0.782212,0.781826,0.781916
5,1.8875,1.757589,0.78555,0.786079,0.785078,0.785204
6,1.7577,1.687925,0.792431,0.792386,0.792298,0.792333
7,1.6709,1.646094,0.798165,0.798112,0.798055,0.798079
8,1.5847,1.619272,0.803899,0.803943,0.803686,0.803763
9,1.526,1.588411,0.809633,0.809585,0.809527,0.809552
10,1.4758,1.579802,0.803899,0.804163,0.803559,0.803682


[I 2025-03-29 00:56:52,009] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 23, 'lambda_param': 0.4, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0271,2.196694,0.764908,0.765599,0.764345,0.764437
2,2.2166,1.798141,0.795872,0.795872,0.795971,0.795854
3,1.823,1.669979,0.792431,0.792587,0.79213,0.792232
4,1.591,1.6095,0.800459,0.801254,0.799928,0.800079
5,1.4294,1.552029,0.806193,0.806801,0.805727,0.80588


[I 2025-03-29 00:58:54,657] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00010952662748632558, 'weight_decay': 0.001, 'warmup_steps': 12, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1009,1.555274,0.81078,0.811611,0.811242,0.81076
2,1.1285,1.49646,0.81422,0.816271,0.813442,0.813607
3,0.8301,1.51052,0.824541,0.82462,0.824714,0.824536
4,0.6822,1.574678,0.81078,0.813433,0.809895,0.810024
5,0.5822,1.63583,0.818807,0.819574,0.819251,0.818792
6,0.5187,1.672433,0.813073,0.813908,0.813537,0.813053
7,0.4561,1.759406,0.809633,0.809575,0.809653,0.809597
8,0.4221,1.776611,0.816514,0.816759,0.816199,0.816324
9,0.3844,1.817057,0.81422,0.815682,0.814831,0.814158
10,0.3605,1.841853,0.809633,0.809597,0.809695,0.809608


[I 2025-03-29 01:03:00,838] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.0002157696745589684, 'weight_decay': 0.002, 'warmup_steps': 22, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7789,1.454532,0.805046,0.807131,0.80578,0.804922
2,0.8684,1.631527,0.805046,0.805267,0.804728,0.804845
3,0.6232,1.742539,0.816514,0.816891,0.816831,0.816513
4,0.4882,1.685785,0.811927,0.813115,0.811316,0.811489
5,0.3995,1.845138,0.809633,0.81012,0.80999,0.809629
6,0.3372,1.892421,0.811927,0.812112,0.812158,0.811926
7,0.2886,2.149883,0.795872,0.796756,0.79635,0.795845
8,0.2533,2.016313,0.803899,0.80385,0.803938,0.803868
9,0.2275,2.023047,0.803899,0.805232,0.804486,0.803841
10,0.2085,2.033162,0.802752,0.803023,0.803023,0.802752


[I 2025-03-29 01:07:06,593] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.00010769622478263136, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0925,1.556024,0.811927,0.812543,0.812326,0.811918
2,1.1351,1.505002,0.816514,0.818835,0.815694,0.815859
3,0.8366,1.505569,0.825688,0.825688,0.825798,0.825673
4,0.6883,1.577069,0.81078,0.813711,0.809853,0.809968
5,0.5883,1.628784,0.821101,0.821727,0.821504,0.821092
6,0.5247,1.67482,0.81078,0.81133,0.811158,0.810774
7,0.4627,1.757227,0.81078,0.810731,0.810821,0.81075
8,0.4279,1.77135,0.819954,0.820083,0.819704,0.819806
9,0.3898,1.811973,0.813073,0.814432,0.813663,0.813018
10,0.3664,1.835226,0.807339,0.807304,0.807401,0.807314


[I 2025-03-29 01:11:13,879] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 0.000236288641842364, 'weight_decay': 0.003, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7041,1.477922,0.806193,0.808655,0.80699,0.806033
2,0.8395,1.642049,0.801606,0.801864,0.801265,0.801386
3,0.5967,1.788315,0.821101,0.821727,0.821504,0.821092
4,0.4613,1.739623,0.805046,0.805066,0.804854,0.804922
5,0.3789,1.873778,0.811927,0.813187,0.812495,0.811878
6,0.3157,1.982722,0.81078,0.810802,0.810906,0.810768
7,0.2722,2.123283,0.801606,0.801831,0.801854,0.801605
8,0.2388,1.989539,0.806193,0.806215,0.806317,0.80618
9,0.2129,2.052866,0.805046,0.805794,0.805485,0.805029
10,0.1979,2.046692,0.799312,0.799335,0.799434,0.799299


[I 2025-03-29 01:15:21,100] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.6119044727609182e-05, 'weight_decay': 0.005, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.111,2.374848,0.75,0.750174,0.749621,0.749704
2,2.4087,1.899731,0.784404,0.784668,0.784668,0.784404
3,1.9926,1.756433,0.784404,0.78451,0.784121,0.784212
4,1.7568,1.676793,0.795872,0.796264,0.795466,0.795596
5,1.5968,1.619456,0.794725,0.795567,0.794172,0.794313


[I 2025-03-29 01:17:25,445] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.00013353819088790598, 'weight_decay': 0.003, 'warmup_steps': 22, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.014,1.519257,0.803899,0.806611,0.804738,0.803711
2,1.0364,1.4958,0.807339,0.809099,0.806601,0.806754
3,0.7605,1.571663,0.824541,0.825932,0.825135,0.824489
4,0.6164,1.588069,0.807339,0.8085,0.806727,0.806891
5,0.5229,1.71567,0.808486,0.810244,0.809158,0.808395
6,0.4564,1.751974,0.808486,0.809314,0.808948,0.808466
7,0.3977,1.896207,0.805046,0.805417,0.805359,0.805045
8,0.3615,1.881999,0.811927,0.812259,0.811569,0.811704
9,0.3282,1.908619,0.809633,0.811081,0.810242,0.809569
10,0.308,1.891332,0.807339,0.807281,0.807359,0.807303


[I 2025-03-29 01:21:33,454] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 6.725268184578669e-05, 'weight_decay': 0.01, 'warmup_steps': 22, 'lambda_param': 0.7000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.382,1.655543,0.794725,0.794658,0.794719,0.794679
2,1.4026,1.473135,0.801606,0.8016,0.801433,0.80149
3,1.0475,1.466806,0.817661,0.817607,0.817578,0.817591
4,0.8746,1.544127,0.825688,0.827161,0.82504,0.825243
5,0.7542,1.52028,0.823394,0.823443,0.823546,0.823386
6,0.6842,1.549699,0.818807,0.819302,0.819167,0.818804
7,0.6196,1.611278,0.819954,0.819901,0.819872,0.819886
8,0.5782,1.6312,0.821101,0.82127,0.82083,0.820942
9,0.5377,1.648224,0.821101,0.821871,0.821546,0.821086
10,0.5119,1.642475,0.819954,0.819891,0.819957,0.819914


[I 2025-03-29 01:27:56,322] Trial 10 finished with value: 0.8233350172602509 and parameters: {'learning_rate': 6.725268184578669e-05, 'weight_decay': 0.01, 'warmup_steps': 22, 'lambda_param': 0.7000000000000001, 'temperature': 3.0}. Best is trial 10 with value: 0.8233350172602509.


Trial 11 with params: {'learning_rate': 6.678376660461166e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4011,1.661617,0.797018,0.796952,0.797013,0.796973
2,1.4102,1.475374,0.801606,0.801647,0.801391,0.801467
3,1.052,1.468499,0.819954,0.819901,0.819872,0.819886
4,0.8778,1.541198,0.826835,0.828213,0.826208,0.826413
5,0.7573,1.524844,0.823394,0.823506,0.823588,0.823391
6,0.6869,1.55184,0.818807,0.819302,0.819167,0.818804
7,0.622,1.609382,0.818807,0.818746,0.818746,0.818746
8,0.5803,1.630996,0.819954,0.820083,0.819704,0.819806
9,0.5399,1.649663,0.818807,0.819731,0.819294,0.818784
10,0.5138,1.643704,0.819954,0.819891,0.819957,0.819914


[I 2025-03-29 01:34:05,320] Trial 11 finished with value: 0.8210547917094193 and parameters: {'learning_rate': 6.678376660461166e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 33, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}. Best is trial 10 with value: 0.8233350172602509.


Trial 12 with params: {'learning_rate': 6.373988700422221e-05, 'weight_decay': 0.008, 'warmup_steps': 16, 'lambda_param': 0.9, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4054,1.672864,0.797018,0.796976,0.796887,0.796922
2,1.4342,1.482393,0.805046,0.805066,0.804854,0.804922
3,1.0753,1.469367,0.81422,0.814175,0.814116,0.814141
4,0.8985,1.537092,0.826835,0.828213,0.826208,0.826413
5,0.7755,1.50839,0.822248,0.82227,0.822377,0.822236
6,0.7042,1.539149,0.822248,0.822575,0.822546,0.822247
7,0.6393,1.597134,0.819954,0.819926,0.81983,0.819869
8,0.5972,1.616172,0.819954,0.820162,0.819662,0.819781
9,0.5561,1.629522,0.821101,0.821597,0.821462,0.821097
10,0.5298,1.624816,0.819954,0.819891,0.819957,0.819914


[I 2025-03-29 01:40:15,952] Trial 12 finished with value: 0.8244579440359041 and parameters: {'learning_rate': 6.373988700422221e-05, 'weight_decay': 0.008, 'warmup_steps': 16, 'lambda_param': 0.9, 'temperature': 3.0}. Best is trial 12 with value: 0.8244579440359041.


Trial 13 with params: {'learning_rate': 0.00039689817307863315, 'weight_decay': 0.009000000000000001, 'warmup_steps': 21, 'lambda_param': 1.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5656,1.410758,0.809633,0.809713,0.809401,0.809489
2,0.7228,1.797154,0.811927,0.81188,0.811821,0.811846
3,0.4839,1.900979,0.81078,0.810783,0.810611,0.81067
4,0.3632,1.803357,0.807339,0.807291,0.807232,0.807257
5,0.2924,2.019283,0.805046,0.805156,0.805233,0.805042
6,0.237,2.039925,0.808486,0.808509,0.808611,0.808474
7,0.2013,2.297753,0.795872,0.796238,0.796182,0.79587
8,0.178,2.12862,0.808486,0.809035,0.808864,0.80848
9,0.1539,2.282848,0.805046,0.805656,0.805443,0.805037
10,0.1368,2.200842,0.805046,0.804977,0.805022,0.804996


[I 2025-03-29 01:44:22,038] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 4.273142277230917e-05, 'weight_decay': 0.01, 'warmup_steps': 15, 'lambda_param': 0.9, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6311,1.794211,0.797018,0.797183,0.796718,0.796824
2,1.6861,1.585323,0.795872,0.796962,0.795256,0.795397
3,1.3129,1.489485,0.809633,0.809575,0.809653,0.809597
4,1.1048,1.507626,0.811927,0.812163,0.811611,0.811732
5,0.9634,1.473799,0.817661,0.817612,0.817704,0.817632
6,0.8728,1.501563,0.815367,0.815445,0.815536,0.815361
7,0.8042,1.557654,0.813073,0.813301,0.813326,0.813073
8,0.7544,1.545942,0.815367,0.815374,0.8152,0.81526
9,0.7105,1.583273,0.818807,0.819083,0.819083,0.818807
10,0.6784,1.558667,0.819954,0.819901,0.819872,0.819886


[I 2025-03-29 01:48:29,192] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 7.868373273873772e-05, 'weight_decay': 0.008, 'warmup_steps': 17, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2878,1.620094,0.808486,0.808509,0.808611,0.808474
2,1.312,1.478902,0.809633,0.809585,0.809527,0.809552
3,0.9737,1.493011,0.817661,0.817683,0.817789,0.817649
4,0.8078,1.581482,0.824541,0.826815,0.823745,0.823939
5,0.696,1.602885,0.818807,0.819083,0.819083,0.818807
6,0.6274,1.594582,0.822248,0.822946,0.822672,0.822236
7,0.5649,1.653934,0.813073,0.813025,0.813116,0.813044
8,0.525,1.663041,0.816514,0.816603,0.816284,0.816375
9,0.4851,1.685076,0.821101,0.8222,0.82163,0.821067
10,0.4612,1.712574,0.817661,0.817612,0.817704,0.817632


[I 2025-03-29 01:54:36,717] Trial 15 finished with value: 0.8210858585858586 and parameters: {'learning_rate': 7.868373273873772e-05, 'weight_decay': 0.008, 'warmup_steps': 17, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}. Best is trial 12 with value: 0.8244579440359041.


Trial 16 with params: {'learning_rate': 3.8257922999035495e-05, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6821,1.826397,0.790138,0.790289,0.789835,0.789936
2,1.7556,1.618063,0.793578,0.794831,0.792919,0.793051
3,1.3853,1.503162,0.808486,0.808438,0.808527,0.808456
4,1.1704,1.513338,0.813073,0.813462,0.812695,0.812837
5,1.0241,1.47606,0.815367,0.815319,0.81541,0.815338
6,0.9284,1.49034,0.818807,0.818807,0.818915,0.818792
7,0.8589,1.537074,0.81422,0.814406,0.814452,0.814219
8,0.8049,1.526349,0.816514,0.816603,0.816284,0.816375
9,0.7593,1.557133,0.816514,0.817006,0.816873,0.81651
10,0.7253,1.541744,0.819954,0.819926,0.81983,0.819869


[I 2025-03-29 01:58:38,336] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 3.306391905724834e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 8, 'lambda_param': 1.0, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7691,1.883306,0.78555,0.785772,0.785205,0.785313
2,1.8572,1.656872,0.793578,0.79434,0.793045,0.793185
3,1.4858,1.529088,0.807339,0.807304,0.807401,0.807314
4,1.2634,1.528627,0.81078,0.811703,0.810232,0.810401
5,1.1106,1.476891,0.813073,0.813041,0.812947,0.812985
6,1.0077,1.472615,0.816514,0.816514,0.81662,0.816498
7,0.9358,1.506234,0.81078,0.810926,0.81099,0.810778
8,0.876,1.502936,0.813073,0.813078,0.812905,0.812965
9,0.8288,1.522659,0.815367,0.815596,0.815621,0.815367
10,0.792,1.51953,0.819954,0.820017,0.819746,0.819829


[I 2025-03-29 02:04:50,160] Trial 17 finished with value: 0.8164355445622714 and parameters: {'learning_rate': 3.306391905724834e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 8, 'lambda_param': 1.0, 'temperature': 5.0}. Best is trial 12 with value: 0.8244579440359041.


Trial 18 with params: {'learning_rate': 1.2783135103593331e-05, 'weight_decay': 0.007, 'warmup_steps': 25, 'lambda_param': 0.9, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1869,2.576349,0.715596,0.717403,0.714616,0.714333
2,2.6192,2.041369,0.783257,0.784358,0.783794,0.783209
3,2.1757,1.839095,0.783257,0.783277,0.783037,0.783106
4,1.9267,1.750789,0.786697,0.786747,0.786457,0.786536
5,1.766,1.701892,0.790138,0.790956,0.789583,0.789717
6,1.6384,1.631299,0.801606,0.801706,0.801349,0.801442
7,1.5505,1.595429,0.806193,0.806239,0.80598,0.806058
8,1.4639,1.573703,0.805046,0.805066,0.804854,0.804922
9,1.4058,1.546115,0.803899,0.803839,0.803812,0.803825
10,1.3575,1.545266,0.806193,0.806675,0.80577,0.805915


[I 2025-03-29 02:08:56,375] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 3.912340804652161e-05, 'weight_decay': 0.008, 'warmup_steps': 32, 'lambda_param': 1.0, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6983,1.826058,0.791284,0.791477,0.790962,0.791069
2,1.7504,1.61523,0.792431,0.793768,0.791751,0.791877
3,1.3744,1.499324,0.808486,0.808438,0.808527,0.808456
4,1.159,1.509355,0.81078,0.811162,0.8104,0.81054
5,1.013,1.475894,0.815367,0.81539,0.815494,0.815355
6,0.9174,1.493327,0.819954,0.820032,0.820125,0.819948
7,0.8477,1.546207,0.816514,0.816789,0.816789,0.816514
8,0.7942,1.530899,0.817661,0.817721,0.817452,0.817534
9,0.7492,1.567896,0.815367,0.815799,0.815705,0.815365
10,0.7154,1.548695,0.817661,0.817631,0.817536,0.817574


[I 2025-03-29 02:13:03,358] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 0.00012648802951407994, 'weight_decay': 0.007, 'warmup_steps': 15, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0307,1.53032,0.803899,0.805639,0.80457,0.803806
2,1.0589,1.499202,0.809633,0.811197,0.808937,0.809102
3,0.7782,1.554448,0.818807,0.819731,0.819294,0.818784
4,0.6342,1.590132,0.808486,0.810603,0.807685,0.807829
5,0.5388,1.695674,0.807339,0.808588,0.807906,0.80729
6,0.473,1.735913,0.805046,0.805946,0.805527,0.80502
7,0.4135,1.859123,0.807339,0.807524,0.807569,0.807338
8,0.3769,1.848705,0.813073,0.813269,0.812779,0.812894
9,0.3429,1.887439,0.806193,0.807942,0.806864,0.806101
10,0.3217,1.885771,0.806193,0.806128,0.806191,0.80615


[I 2025-03-29 02:17:07,831] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 9.982843963014127e-05, 'weight_decay': 0.01, 'warmup_steps': 17, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1597,1.575162,0.813073,0.81376,0.813495,0.813061
2,1.1792,1.503784,0.808486,0.808976,0.808064,0.808212
3,0.8681,1.519828,0.826835,0.826913,0.827008,0.826829
4,0.7157,1.588813,0.818807,0.820901,0.818031,0.81821
5,0.6124,1.646063,0.817661,0.818666,0.818167,0.817632
6,0.5483,1.673641,0.81078,0.811464,0.8112,0.810768
7,0.4842,1.744611,0.813073,0.813096,0.8132,0.813061
8,0.4505,1.727914,0.815367,0.815566,0.815073,0.81519
9,0.4108,1.78359,0.81422,0.815682,0.814831,0.814158
10,0.3893,1.801487,0.813073,0.81322,0.813284,0.813071


[I 2025-03-29 02:21:12,413] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 8.008902647410555e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 20, 'lambda_param': 0.8, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2818,1.616754,0.808486,0.808509,0.808611,0.808474
2,1.302,1.479848,0.808486,0.808451,0.808359,0.808395
3,0.9653,1.495386,0.817661,0.817683,0.817789,0.817649
4,0.8004,1.582399,0.825688,0.827847,0.824914,0.825113
5,0.6894,1.610151,0.818807,0.819083,0.819083,0.818807
6,0.6212,1.600829,0.822248,0.822946,0.822672,0.822236
7,0.5588,1.661115,0.811927,0.811869,0.811947,0.811891
8,0.5191,1.668438,0.815367,0.815489,0.815115,0.815215
9,0.4795,1.693013,0.822248,0.82344,0.822798,0.822208
10,0.4556,1.720135,0.818807,0.81875,0.818831,0.818773


[I 2025-03-29 02:27:23,001] Trial 22 finished with value: 0.8164983164983165 and parameters: {'learning_rate': 8.008902647410555e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 20, 'lambda_param': 0.8, 'temperature': 4.0}. Best is trial 12 with value: 0.8244579440359041.


Trial 23 with params: {'learning_rate': 0.000119266773192184, 'weight_decay': 0.007, 'warmup_steps': 9, 'lambda_param': 0.5, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0512,1.541211,0.803899,0.805232,0.804486,0.803841
2,1.0854,1.498174,0.81422,0.815423,0.81361,0.813788
3,0.7989,1.533609,0.818807,0.819302,0.819167,0.818804
4,0.6533,1.588236,0.809633,0.811884,0.808811,0.808954
5,0.5563,1.660801,0.81422,0.814711,0.814579,0.814216


[I 2025-03-29 02:29:26,938] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.00019145028201717734, 'weight_decay': 0.007, 'warmup_steps': 33, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8538,1.443416,0.801606,0.802932,0.802191,0.801547
2,0.9058,1.577543,0.811927,0.812627,0.811442,0.811606
3,0.652,1.711648,0.822248,0.822946,0.822672,0.822236
4,0.5165,1.609046,0.818807,0.819157,0.818452,0.818593
5,0.4282,1.793581,0.815367,0.815921,0.815747,0.815361
6,0.3644,1.925175,0.807339,0.807449,0.807527,0.807335
7,0.3121,2.130208,0.798165,0.798644,0.798518,0.798161
8,0.2796,2.001292,0.805046,0.80501,0.805107,0.80502
9,0.251,2.023747,0.807339,0.810498,0.808243,0.807111
10,0.2332,2.093798,0.794725,0.795261,0.795098,0.794718


[I 2025-03-29 02:33:32,120] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 9.489632568623659e-05, 'weight_decay': 0.007, 'warmup_steps': 19, 'lambda_param': 0.2, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1875,1.590824,0.81422,0.814711,0.814579,0.814216
2,1.2025,1.490427,0.811927,0.81208,0.811653,0.811759
3,0.8878,1.511893,0.825688,0.825737,0.82584,0.82568
4,0.7334,1.596763,0.817661,0.819867,0.816862,0.817035
5,0.6289,1.646701,0.819954,0.820649,0.820378,0.819943
6,0.564,1.663303,0.815367,0.816057,0.815789,0.815355
7,0.5008,1.734756,0.81078,0.810858,0.810948,0.810774
8,0.4655,1.719303,0.816514,0.816603,0.816284,0.816375
9,0.4257,1.765879,0.813073,0.814849,0.813747,0.812985
10,0.4041,1.785792,0.813073,0.81322,0.813284,0.813071


[I 2025-03-29 02:37:40,220] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 3.176385176660576e-05, 'weight_decay': 0.01, 'warmup_steps': 13, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7962,1.903103,0.784404,0.784582,0.784078,0.784181
2,1.8872,1.666679,0.791284,0.791899,0.790793,0.790928
3,1.5148,1.536957,0.809633,0.809597,0.809695,0.809608
4,1.2904,1.533306,0.809633,0.810472,0.809106,0.809271
5,1.136,1.476029,0.813073,0.813041,0.812947,0.812985
6,1.0309,1.468947,0.815367,0.815319,0.81541,0.815338
7,0.9579,1.497909,0.808486,0.808632,0.808695,0.808484
8,0.8965,1.496811,0.81422,0.814206,0.814073,0.814122
9,0.8489,1.514422,0.816514,0.816789,0.816789,0.816514
10,0.8115,1.514418,0.818807,0.818841,0.81862,0.818692


[I 2025-03-29 02:43:51,551] Trial 26 finished with value: 0.8164355445622714 and parameters: {'learning_rate': 3.176385176660576e-05, 'weight_decay': 0.01, 'warmup_steps': 13, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}. Best is trial 12 with value: 0.8244579440359041.


Trial 27 with params: {'learning_rate': 0.00014421241424714628, 'weight_decay': 0.007, 'warmup_steps': 5, 'lambda_param': 1.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9472,1.478452,0.799312,0.800451,0.799855,0.799267
2,1.0024,1.480841,0.81422,0.815423,0.81361,0.813788
3,0.736,1.604708,0.825688,0.826464,0.826135,0.825673
4,0.5952,1.599332,0.81422,0.814793,0.813779,0.813937
5,0.5021,1.736624,0.81422,0.815136,0.814705,0.814196


[I 2025-03-29 02:45:54,311] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 6.745391470707147e-05, 'weight_decay': 0.005, 'warmup_steps': 22, 'lambda_param': 0.9, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3803,1.65467,0.797018,0.796969,0.797055,0.796986
2,1.4008,1.472839,0.802752,0.80277,0.80256,0.802627
3,1.046,1.466779,0.817661,0.817607,0.817578,0.817591
4,0.8733,1.544458,0.825688,0.827161,0.82504,0.825243
5,0.753,1.520913,0.823394,0.823443,0.823546,0.823386
6,0.6831,1.550079,0.818807,0.819302,0.819167,0.818804
7,0.6185,1.612075,0.818807,0.818746,0.818746,0.818746
8,0.5772,1.631968,0.821101,0.82127,0.82083,0.820942
9,0.5367,1.649047,0.821101,0.821871,0.821546,0.821086
10,0.5109,1.645016,0.817661,0.817595,0.81762,0.817607


[I 2025-03-29 02:52:07,757] Trial 28 finished with value: 0.8244745722574152 and parameters: {'learning_rate': 6.745391470707147e-05, 'weight_decay': 0.005, 'warmup_steps': 22, 'lambda_param': 0.9, 'temperature': 3.0}. Best is trial 28 with value: 0.8244745722574152.


Trial 29 with params: {'learning_rate': 0.00012014952338564047, 'weight_decay': 0.01, 'warmup_steps': 18, 'lambda_param': 0.5, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0607,1.543397,0.805046,0.80648,0.805654,0.80498
2,1.0831,1.496804,0.813073,0.81456,0.8124,0.812574
3,0.7958,1.537935,0.819954,0.821141,0.820504,0.819914
4,0.6506,1.583416,0.809633,0.811884,0.808811,0.808954
5,0.5538,1.674285,0.808486,0.809474,0.80899,0.808456
6,0.4887,1.719595,0.809633,0.810541,0.810116,0.809608
7,0.4278,1.829854,0.807339,0.807449,0.807527,0.807335
8,0.3924,1.822708,0.813073,0.813358,0.812737,0.812866
9,0.357,1.859523,0.807339,0.809204,0.808032,0.807238
10,0.3346,1.873248,0.807339,0.807281,0.807359,0.807303


[I 2025-03-29 02:56:17,085] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 8.24648153819458e-05, 'weight_decay': 0.004, 'warmup_steps': 31, 'lambda_param': 1.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2816,1.61457,0.805046,0.805156,0.805233,0.805042
2,1.2897,1.479908,0.81078,0.810832,0.810569,0.810648
3,0.9535,1.492614,0.817661,0.817683,0.817789,0.817649
4,0.7895,1.576003,0.824541,0.826566,0.823788,0.823985
5,0.6796,1.609669,0.821101,0.82148,0.82142,0.8211
6,0.6116,1.603887,0.823394,0.824168,0.82384,0.82338
7,0.5488,1.668163,0.813073,0.813007,0.813031,0.813018
8,0.5095,1.67151,0.817661,0.817721,0.817452,0.817534
9,0.4706,1.702965,0.819954,0.821537,0.820588,0.819886
10,0.4463,1.729361,0.815367,0.815319,0.81541,0.815338


[I 2025-03-29 03:02:30,788] Trial 30 finished with value: 0.8187920875420875 and parameters: {'learning_rate': 8.24648153819458e-05, 'weight_decay': 0.004, 'warmup_steps': 31, 'lambda_param': 1.0, 'temperature': 3.5}. Best is trial 28 with value: 0.8244745722574152.


Trial 31 with params: {'learning_rate': 7.40121657218587e-05, 'weight_decay': 0.008, 'warmup_steps': 23, 'lambda_param': 1.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3289,1.630184,0.807339,0.807304,0.807401,0.807314
2,1.3457,1.462211,0.81078,0.810895,0.810527,0.810624
3,0.9995,1.462479,0.816514,0.816478,0.816578,0.81649
4,0.8332,1.553651,0.822248,0.823801,0.821577,0.821773
5,0.7171,1.544853,0.818807,0.818994,0.819041,0.818806
6,0.6492,1.573982,0.821101,0.821597,0.821462,0.821097
7,0.5853,1.633579,0.815367,0.815374,0.8152,0.81526
8,0.5458,1.651827,0.816514,0.816546,0.816326,0.816397
9,0.5055,1.681497,0.816514,0.817434,0.816999,0.81649
10,0.4806,1.672595,0.816514,0.816447,0.816494,0.816466


[I 2025-03-29 03:07:11,338] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 4.956842169562857e-05, 'weight_decay': 0.006, 'warmup_steps': 16, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5484,1.755046,0.798165,0.79823,0.797929,0.798012
2,1.5922,1.539707,0.801606,0.802196,0.801139,0.801285
3,1.2195,1.483426,0.811927,0.811863,0.811863,0.811863
4,1.0229,1.512097,0.819954,0.820362,0.819578,0.819726
5,0.8879,1.476141,0.816514,0.816514,0.81662,0.816498
6,0.8056,1.515721,0.817661,0.817739,0.817831,0.817655
7,0.7383,1.573887,0.815367,0.815348,0.815452,0.815347
8,0.6933,1.575145,0.823394,0.823433,0.823209,0.823282
9,0.6502,1.605544,0.817661,0.817985,0.817957,0.81766
10,0.6203,1.580389,0.829128,0.829079,0.82905,0.829063


[I 2025-03-29 03:13:25,885] Trial 32 finished with value: 0.8244745722574152 and parameters: {'learning_rate': 4.956842169562857e-05, 'weight_decay': 0.006, 'warmup_steps': 16, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}. Best is trial 28 with value: 0.8244745722574152.


Trial 33 with params: {'learning_rate': 2.9884730997511285e-05, 'weight_decay': 0.004, 'warmup_steps': 11, 'lambda_param': 0.5, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8284,1.932654,0.784404,0.784762,0.783994,0.784113
2,1.9298,1.685021,0.792431,0.792587,0.79213,0.792232
3,1.5584,1.551132,0.805046,0.804987,0.805064,0.805009
4,1.3317,1.542397,0.809633,0.810472,0.809106,0.809271
5,1.1752,1.48117,0.809633,0.809658,0.809443,0.809512


[I 2025-03-29 03:15:29,242] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 3.058885041448953e-05, 'weight_decay': 0.006, 'warmup_steps': 22, 'lambda_param': 0.8, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8249,1.925214,0.780963,0.781174,0.780616,0.780721
2,1.9171,1.678869,0.793578,0.793775,0.793256,0.793365
3,1.5435,1.545381,0.806193,0.806144,0.806233,0.806162
4,1.3169,1.537699,0.809633,0.810472,0.809106,0.809271
5,1.1608,1.478499,0.81078,0.810783,0.810611,0.81067
6,1.0535,1.467777,0.81422,0.814162,0.814242,0.814185
7,0.9793,1.491757,0.806193,0.806419,0.806443,0.806192
8,0.9162,1.492218,0.813073,0.813041,0.812947,0.812985
9,0.8681,1.508328,0.81422,0.814331,0.81441,0.814216
10,0.8301,1.510549,0.815367,0.815374,0.8152,0.81526


[I 2025-03-29 03:21:42,683] Trial 34 finished with value: 0.8141409888692999 and parameters: {'learning_rate': 3.058885041448953e-05, 'weight_decay': 0.006, 'warmup_steps': 22, 'lambda_param': 0.8, 'temperature': 7.0}. Best is trial 28 with value: 0.8244745722574152.


Trial 35 with params: {'learning_rate': 3.681082791006686e-05, 'weight_decay': 0.005, 'warmup_steps': 24, 'lambda_param': 0.7000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7236,1.845541,0.790138,0.790289,0.789835,0.789936
2,1.7889,1.631833,0.788991,0.790397,0.788288,0.788402
3,1.4146,1.508941,0.805046,0.805046,0.805149,0.805029
4,1.1962,1.514242,0.81078,0.811406,0.810316,0.810474
5,1.0474,1.474268,0.817661,0.817595,0.81762,0.817607
6,0.9492,1.483832,0.816514,0.816625,0.816705,0.81651
7,0.8789,1.531221,0.813073,0.813301,0.813326,0.813073
8,0.823,1.519036,0.81422,0.81425,0.814031,0.814102
9,0.7772,1.549747,0.81422,0.814839,0.814621,0.814211
10,0.7423,1.536296,0.817661,0.817669,0.817494,0.817555


[I 2025-03-29 03:25:50,620] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 8.871468526497837e-05, 'weight_decay': 0.006, 'warmup_steps': 20, 'lambda_param': 1.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2246,1.602324,0.805046,0.805094,0.805191,0.805037
2,1.2392,1.482374,0.81422,0.814307,0.813989,0.814079
3,0.9166,1.510656,0.821101,0.82115,0.821251,0.821092
4,0.759,1.592869,0.823394,0.825291,0.822661,0.822858
5,0.6518,1.638876,0.818807,0.819431,0.819209,0.818799
6,0.5859,1.640948,0.817661,0.818353,0.818083,0.817649
7,0.5232,1.709842,0.81078,0.810858,0.810948,0.810774
8,0.4863,1.702778,0.816514,0.816603,0.816284,0.816375
9,0.4465,1.737768,0.816514,0.818416,0.81721,0.816417
10,0.4242,1.759692,0.815367,0.815445,0.815536,0.815361


[I 2025-03-29 03:32:01,704] Trial 36 finished with value: 0.8164983164983165 and parameters: {'learning_rate': 8.871468526497837e-05, 'weight_decay': 0.006, 'warmup_steps': 20, 'lambda_param': 1.0, 'temperature': 3.0}. Best is trial 28 with value: 0.8244745722574152.


Trial 37 with params: {'learning_rate': 4.729948829550423e-05, 'weight_decay': 0.002, 'warmup_steps': 9, 'lambda_param': 0.8, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5675,1.7663,0.795872,0.795997,0.795592,0.79569
2,1.6193,1.553097,0.801606,0.802482,0.801055,0.801208
3,1.2476,1.484196,0.813073,0.813017,0.812989,0.813002
4,1.0478,1.511284,0.818807,0.81927,0.81841,0.818563
5,0.9107,1.473663,0.817661,0.817612,0.817704,0.817632
6,0.826,1.510134,0.816514,0.816563,0.816662,0.816505
7,0.7584,1.56785,0.815367,0.815348,0.815452,0.815347
8,0.7121,1.565419,0.818807,0.818972,0.818536,0.818646
9,0.6687,1.597768,0.816514,0.816891,0.816831,0.816513
10,0.6384,1.571882,0.825688,0.825682,0.825545,0.825596


[I 2025-03-29 03:38:14,266] Trial 37 finished with value: 0.8267526114341277 and parameters: {'learning_rate': 4.729948829550423e-05, 'weight_decay': 0.002, 'warmup_steps': 9, 'lambda_param': 0.8, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 38 with params: {'learning_rate': 3.6395130105526165e-05, 'weight_decay': 0.002, 'warmup_steps': 11, 'lambda_param': 1.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7173,1.845659,0.791284,0.791477,0.790962,0.791069
2,1.792,1.63331,0.788991,0.790397,0.788288,0.788402
3,1.4203,1.511129,0.805046,0.805046,0.805149,0.805029
4,1.2021,1.516341,0.809633,0.810324,0.809148,0.809308
5,1.0532,1.474301,0.818807,0.818746,0.818746,0.818746
6,0.9549,1.482508,0.816514,0.816625,0.816705,0.81651
7,0.8847,1.527069,0.81422,0.814406,0.814452,0.814219
8,0.8286,1.517086,0.813073,0.813128,0.812863,0.812943
9,0.7825,1.544525,0.815367,0.815921,0.815747,0.815361
10,0.7475,1.533002,0.818807,0.818796,0.818662,0.818712


[I 2025-03-29 03:44:19,647] Trial 38 finished with value: 0.8164519659846763 and parameters: {'learning_rate': 3.6395130105526165e-05, 'weight_decay': 0.002, 'warmup_steps': 11, 'lambda_param': 1.0, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 39 with params: {'learning_rate': 6.784398075892817e-05, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3528,1.650285,0.801606,0.801586,0.801686,0.801584
2,1.3933,1.473599,0.805046,0.805066,0.804854,0.804922
3,1.0428,1.464201,0.817661,0.817595,0.81762,0.817607
4,0.8708,1.552901,0.825688,0.827161,0.82504,0.825243
5,0.7508,1.521291,0.821101,0.82115,0.821251,0.821092
6,0.6808,1.546347,0.819954,0.82028,0.820251,0.819954
7,0.6169,1.618838,0.816514,0.81647,0.81641,0.816436
8,0.5759,1.633003,0.817661,0.817956,0.817326,0.817459
9,0.5351,1.648961,0.819954,0.820513,0.820336,0.819948
10,0.5099,1.642194,0.817661,0.817595,0.81762,0.817607


[I 2025-03-29 03:50:31,307] Trial 39 finished with value: 0.8221950933949345 and parameters: {'learning_rate': 6.784398075892817e-05, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 40 with params: {'learning_rate': 8.788434110215489e-05, 'weight_decay': 0.001, 'warmup_steps': 40, 'lambda_param': 0.6000000000000001, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2614,1.604087,0.808486,0.808713,0.808737,0.808486
2,1.256,1.48832,0.811927,0.812163,0.811611,0.811732
3,0.9245,1.504689,0.818807,0.818807,0.818915,0.818792
4,0.7642,1.583161,0.823394,0.825291,0.822661,0.822858
5,0.6565,1.62655,0.817661,0.818217,0.818041,0.817655
6,0.5901,1.628462,0.821101,0.822029,0.821588,0.821077
7,0.5269,1.699162,0.808486,0.808438,0.808527,0.808456
8,0.489,1.694115,0.819954,0.819965,0.819788,0.81985
9,0.4503,1.729847,0.817661,0.819236,0.818294,0.817591
10,0.4266,1.75851,0.818807,0.818772,0.818873,0.818784


[I 2025-03-29 03:56:37,163] Trial 40 finished with value: 0.817648799542307 and parameters: {'learning_rate': 8.788434110215489e-05, 'weight_decay': 0.001, 'warmup_steps': 40, 'lambda_param': 0.6000000000000001, 'temperature': 5.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 41 with params: {'learning_rate': 1.1001377595702983e-05, 'weight_decay': 0.002, 'warmup_steps': 10, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2122,2.664117,0.697248,0.70178,0.695714,0.694412
2,2.7427,2.146275,0.774083,0.775514,0.774701,0.773997
3,2.2989,1.90344,0.77867,0.778616,0.778532,0.778565
4,2.0395,1.803485,0.783257,0.783395,0.782952,0.783049
5,1.875,1.751443,0.78555,0.786079,0.785078,0.785204
6,1.7462,1.68165,0.793578,0.793548,0.793424,0.793469
7,1.6597,1.640727,0.799312,0.799271,0.799181,0.799217
8,1.5739,1.614591,0.803899,0.804004,0.803644,0.803738
9,1.5154,1.584253,0.809633,0.809585,0.809527,0.809552
10,1.4654,1.576448,0.805046,0.805267,0.804728,0.804845


[I 2025-03-29 04:00:42,143] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.00010946176175251309, 'weight_decay': 0.003, 'warmup_steps': 15, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1057,1.557352,0.809633,0.810541,0.810116,0.809608
2,1.1294,1.495547,0.816514,0.818352,0.815778,0.815956
3,0.8299,1.511364,0.823394,0.823506,0.823588,0.823391
4,0.6821,1.572977,0.813073,0.815754,0.81219,0.812327
5,0.5821,1.637788,0.818807,0.819574,0.819251,0.818792
6,0.5184,1.669984,0.81422,0.814981,0.814663,0.814205
7,0.4559,1.75769,0.809633,0.809575,0.809653,0.809597
8,0.4222,1.776243,0.817661,0.817864,0.817368,0.817486
9,0.3841,1.813128,0.815367,0.816732,0.815957,0.815312
10,0.3603,1.840136,0.809633,0.809597,0.809695,0.809608


[I 2025-03-29 04:04:46,686] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.00019674186242712736, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7853,1.463033,0.807339,0.809436,0.808074,0.807217
2,0.8923,1.592979,0.806193,0.807093,0.805643,0.805804
3,0.6445,1.674823,0.821101,0.821044,0.821125,0.821067
4,0.5099,1.654472,0.811927,0.812491,0.811484,0.81164
5,0.4231,1.76468,0.81078,0.811209,0.811116,0.810778
6,0.3588,1.94407,0.803899,0.805639,0.80457,0.803806
7,0.3084,2.128054,0.798165,0.798768,0.79856,0.798156
8,0.2756,1.963653,0.802752,0.802717,0.802812,0.802726
9,0.2463,1.95334,0.803899,0.805049,0.804443,0.803855
10,0.2281,1.982694,0.802752,0.802683,0.802728,0.802701


[I 2025-03-29 04:08:50,940] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 3.75792536289279e-05, 'weight_decay': 0.003, 'warmup_steps': 18, 'lambda_param': 1.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7061,1.836159,0.791284,0.791477,0.790962,0.791069
2,1.7728,1.625482,0.790138,0.791458,0.789457,0.789577
3,1.3996,1.505708,0.805046,0.805046,0.805149,0.805029
4,1.1828,1.512655,0.811927,0.812368,0.811526,0.811673
5,1.0352,1.474698,0.817661,0.817597,0.817662,0.81762


[I 2025-03-29 04:10:52,410] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 0.00043371026242218253, 'weight_decay': 0.009000000000000001, 'warmup_steps': 32, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5674,1.474552,0.813073,0.813041,0.812947,0.812985
2,0.7152,1.816958,0.813073,0.813041,0.812947,0.812985
3,0.4784,1.873214,0.811927,0.812543,0.812326,0.811918
4,0.3542,1.852375,0.799312,0.800512,0.798676,0.798823
5,0.2807,1.977269,0.816514,0.816789,0.816789,0.816514
6,0.2338,2.044938,0.816514,0.816858,0.816157,0.816296
7,0.199,2.013567,0.816514,0.816447,0.816494,0.816466
8,0.1718,2.042676,0.816514,0.817603,0.817041,0.816479
9,0.1458,2.163199,0.803899,0.803834,0.803896,0.803855
10,0.1344,2.241283,0.794725,0.794869,0.794929,0.794722


[I 2025-03-29 04:14:56,594] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 9.705653773363448e-05, 'weight_decay': 0.0, 'warmup_steps': 15, 'lambda_param': 0.8, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.172,1.584306,0.81422,0.814711,0.814579,0.814216
2,1.1915,1.49843,0.807339,0.80766,0.80698,0.807111
3,0.879,1.513908,0.826835,0.826858,0.826966,0.826824
4,0.7255,1.594527,0.817661,0.819624,0.816905,0.817083
5,0.6215,1.644288,0.818807,0.819431,0.819209,0.818799
6,0.5569,1.66619,0.813073,0.813908,0.813537,0.813053
7,0.4934,1.737785,0.813073,0.813151,0.813242,0.813067
8,0.4586,1.722656,0.815367,0.815566,0.815073,0.81519
9,0.4188,1.773734,0.815367,0.816935,0.815999,0.815297
10,0.3973,1.793807,0.81078,0.810858,0.810948,0.810774


[I 2025-03-29 04:19:00,165] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 3.32509088330528e-05, 'weight_decay': 0.007, 'warmup_steps': 8, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7659,1.880866,0.78555,0.785772,0.785205,0.785313
2,1.8533,1.655462,0.793578,0.79434,0.793045,0.793185
3,1.4819,1.528062,0.807339,0.807304,0.807401,0.807314
4,1.2596,1.528009,0.81078,0.811703,0.810232,0.810401
5,1.1071,1.47669,0.813073,0.813041,0.812947,0.812985
6,1.0045,1.473152,0.815367,0.81539,0.815494,0.815355
7,0.9327,1.507212,0.809633,0.809818,0.809864,0.809632
8,0.8731,1.503598,0.813073,0.813078,0.812905,0.812965
9,0.826,1.523681,0.816514,0.816789,0.816789,0.816514
10,0.7894,1.520154,0.819954,0.820017,0.819746,0.819829


[I 2025-03-29 04:25:10,951] Trial 47 finished with value: 0.8164355445622714 and parameters: {'learning_rate': 3.32509088330528e-05, 'weight_decay': 0.007, 'warmup_steps': 8, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 48 with params: {'learning_rate': 3.371313519569357e-05, 'weight_decay': 0.01, 'warmup_steps': 23, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7719,1.88053,0.786697,0.786965,0.786331,0.786444
2,1.8489,1.653151,0.790138,0.791111,0.789541,0.789673
3,1.4746,1.524805,0.805046,0.805046,0.805149,0.805029
4,1.252,1.524058,0.809633,0.810472,0.809106,0.809271
5,1.0995,1.475045,0.81422,0.814175,0.814116,0.814141
6,0.997,1.473297,0.818807,0.818856,0.818957,0.818799
7,0.9253,1.510709,0.81078,0.811007,0.811032,0.81078
8,0.866,1.504578,0.81422,0.814206,0.814073,0.814122
9,0.8191,1.527663,0.818807,0.819186,0.819125,0.818806
10,0.7827,1.522356,0.821101,0.821197,0.820872,0.820965


[I 2025-03-29 04:31:23,075] Trial 48 finished with value: 0.8152967721140121 and parameters: {'learning_rate': 3.371313519569357e-05, 'weight_decay': 0.01, 'warmup_steps': 23, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 49 with params: {'learning_rate': 6.854309974927711e-05, 'weight_decay': 0.001, 'warmup_steps': 5, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3499,1.647439,0.800459,0.800423,0.800518,0.800432
2,1.3873,1.471918,0.803899,0.803943,0.803686,0.803763
3,1.0374,1.464575,0.817661,0.817595,0.81762,0.817607
4,0.8662,1.553644,0.825688,0.827161,0.82504,0.825243
5,0.7467,1.52283,0.822248,0.822326,0.822419,0.822242
6,0.677,1.549337,0.819954,0.82028,0.820251,0.819954
7,0.613,1.619827,0.816514,0.81647,0.81641,0.816436
8,0.5723,1.634573,0.816514,0.816759,0.816199,0.816324
9,0.5315,1.652645,0.822248,0.822946,0.822672,0.822236
10,0.5064,1.644757,0.817661,0.817595,0.81762,0.817607


[I 2025-03-29 04:37:33,339] Trial 49 finished with value: 0.8233350172602509 and parameters: {'learning_rate': 6.854309974927711e-05, 'weight_decay': 0.001, 'warmup_steps': 5, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 50 with params: {'learning_rate': 3.9622765700351934e-05, 'weight_decay': 0.007, 'warmup_steps': 24, 'lambda_param': 0.5, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6826,1.819836,0.790138,0.790289,0.789835,0.789936
2,1.7392,1.61042,0.793578,0.794654,0.792961,0.793098
3,1.3646,1.497949,0.807339,0.807281,0.807359,0.807303
4,1.1506,1.50934,0.811927,0.812259,0.811569,0.811704
5,1.0054,1.475828,0.815367,0.81539,0.815494,0.815355


[I 2025-03-29 04:39:36,364] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 4.805371039568404e-05, 'weight_decay': 0.0, 'warmup_steps': 14, 'lambda_param': 0.4, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5638,1.762676,0.797018,0.797112,0.796761,0.796851
2,1.611,1.548985,0.801606,0.802482,0.801055,0.801208
3,1.2384,1.483992,0.813073,0.813017,0.812989,0.813002
4,1.0395,1.510977,0.816514,0.816858,0.816157,0.816296
5,0.9031,1.474455,0.816514,0.816478,0.816578,0.81649
6,0.8191,1.512364,0.816514,0.816563,0.816662,0.816505
7,0.7515,1.570451,0.816514,0.816514,0.81662,0.816498
8,0.7056,1.568679,0.817661,0.817786,0.81741,0.817511
9,0.6625,1.601428,0.816514,0.816891,0.816831,0.816513
10,0.6322,1.575152,0.826835,0.826811,0.826713,0.826753


[I 2025-03-29 04:45:46,683] Trial 51 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 4.805371039568404e-05, 'weight_decay': 0.0, 'warmup_steps': 14, 'lambda_param': 0.4, 'temperature': 2.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 52 with params: {'learning_rate': 1.7644803369444844e-05, 'weight_decay': 0.0, 'warmup_steps': 20, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0867,2.311123,0.752294,0.752747,0.751789,0.751871
2,2.3383,1.858709,0.790138,0.79036,0.790383,0.790137
3,1.9288,1.724935,0.784404,0.78451,0.784121,0.784212
4,1.694,1.649889,0.794725,0.795289,0.794256,0.794394
5,1.5331,1.592234,0.800459,0.801254,0.799928,0.800079


[I 2025-03-29 04:47:49,116] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.00011842484457522068, 'weight_decay': 0.002, 'warmup_steps': 27, 'lambda_param': 0.2, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0794,1.548264,0.805046,0.806684,0.805696,0.804963
2,1.0909,1.501012,0.81078,0.812682,0.810021,0.81018
3,0.8007,1.529983,0.819954,0.8208,0.82042,0.819935
4,0.6546,1.580673,0.81078,0.812919,0.809979,0.81013
5,0.5574,1.675333,0.811927,0.813006,0.812453,0.811891
6,0.4933,1.70886,0.813073,0.81407,0.813579,0.813044
7,0.4316,1.824292,0.808486,0.808713,0.808737,0.808486
8,0.3969,1.814222,0.81422,0.814558,0.813863,0.814
9,0.3609,1.85031,0.809633,0.811081,0.810242,0.809569
10,0.3386,1.870355,0.808486,0.808467,0.808569,0.808466


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-29 04:52:01,197] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 3.0513436108398737e-05, 'weight_decay': 0.0, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8077,1.918131,0.783257,0.783473,0.78291,0.783017
2,1.9112,1.676992,0.793578,0.793775,0.793256,0.793365
3,1.5415,1.546477,0.806193,0.806144,0.806233,0.806162
4,1.3163,1.540884,0.809633,0.810472,0.809106,0.809271
5,1.161,1.480697,0.809633,0.809658,0.809443,0.809512


[I 2025-03-29 04:54:03,607] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 4.85417487043681e-05, 'weight_decay': 0.002, 'warmup_steps': 10, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5539,1.759399,0.798165,0.79823,0.797929,0.798012
2,1.6034,1.545109,0.803899,0.804637,0.803391,0.803545
3,1.2316,1.483492,0.811927,0.811863,0.811863,0.811863
4,1.0338,1.512232,0.817661,0.818062,0.817283,0.81743
5,0.8979,1.474786,0.816514,0.816478,0.816578,0.81649
6,0.8146,1.512845,0.816514,0.816563,0.816662,0.816505
7,0.7472,1.570731,0.816514,0.816514,0.81662,0.816498
8,0.7017,1.570838,0.817661,0.817786,0.81741,0.817511
9,0.6584,1.601293,0.816514,0.816891,0.816831,0.816513
10,0.6284,1.575765,0.826835,0.826811,0.826713,0.826753


[I 2025-03-29 05:00:13,986] Trial 55 finished with value: 0.8267526114341277 and parameters: {'learning_rate': 4.85417487043681e-05, 'weight_decay': 0.002, 'warmup_steps': 10, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 56 with params: {'learning_rate': 5.232117977967534e-05, 'weight_decay': 0.001, 'warmup_steps': 22, 'lambda_param': 0.4, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5246,1.742091,0.795872,0.795997,0.795592,0.79569
2,1.5603,1.525511,0.801606,0.801962,0.801223,0.801354
3,1.1878,1.483068,0.81078,0.810746,0.810653,0.81069
4,0.995,1.514725,0.822248,0.822783,0.82183,0.821993
5,0.8625,1.481912,0.817661,0.817739,0.817831,0.817655
6,0.7829,1.522455,0.819954,0.819977,0.820083,0.819943
7,0.7161,1.579535,0.817661,0.817597,0.817662,0.81762
8,0.672,1.586238,0.821101,0.821092,0.820956,0.821007
9,0.6293,1.61222,0.817661,0.817985,0.817957,0.81766
10,0.6,1.590965,0.825688,0.825631,0.825714,0.825655


[I 2025-03-29 05:06:21,259] Trial 56 finished with value: 0.8233350172602509 and parameters: {'learning_rate': 5.232117977967534e-05, 'weight_decay': 0.001, 'warmup_steps': 22, 'lambda_param': 0.4, 'temperature': 3.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 57 with params: {'learning_rate': 2.581226824745506e-05, 'weight_decay': 0.004, 'warmup_steps': 10, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9064,2.019541,0.776376,0.776662,0.775985,0.776093
2,2.0347,1.728666,0.792431,0.792366,0.79234,0.792352
3,1.6617,1.591483,0.801606,0.801566,0.801476,0.801511
4,1.4317,1.566219,0.805046,0.806192,0.804433,0.804593
5,1.2721,1.502903,0.813073,0.813358,0.812737,0.812866
6,1.1575,1.476812,0.81422,0.814206,0.814073,0.814122
7,1.0789,1.481011,0.806193,0.80627,0.806359,0.806186
8,1.0097,1.485973,0.809633,0.809585,0.809527,0.809552
9,0.96,1.487237,0.813073,0.81322,0.813284,0.813071
10,0.9192,1.4964,0.816514,0.816546,0.816326,0.816397


[I 2025-03-29 05:12:33,217] Trial 57 finished with value: 0.8107238090978335 and parameters: {'learning_rate': 2.581226824745506e-05, 'weight_decay': 0.004, 'warmup_steps': 10, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 58 with params: {'learning_rate': 8.838493696481511e-05, 'weight_decay': 0.001, 'warmup_steps': 14, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2194,1.602901,0.808486,0.808564,0.808653,0.80848
2,1.2398,1.484848,0.816514,0.816603,0.816284,0.816375
3,0.918,1.509622,0.819954,0.820032,0.820125,0.819948
4,0.7604,1.596513,0.824541,0.826815,0.823745,0.823939
5,0.6531,1.637751,0.817661,0.818217,0.818041,0.817655
6,0.5871,1.640759,0.817661,0.818353,0.818083,0.817649
7,0.5245,1.707264,0.811927,0.812037,0.812116,0.811923
8,0.4874,1.701404,0.816514,0.816603,0.816284,0.816375
9,0.4475,1.733759,0.815367,0.816935,0.815999,0.815297
10,0.4252,1.759158,0.815367,0.815445,0.815536,0.815361


[I 2025-03-29 05:18:46,951] Trial 58 finished with value: 0.8142113874173404 and parameters: {'learning_rate': 8.838493696481511e-05, 'weight_decay': 0.001, 'warmup_steps': 14, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 59 with params: {'learning_rate': 4.1580499478084515e-05, 'weight_decay': 0.002, 'warmup_steps': 9, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6405,1.801016,0.795872,0.796074,0.79555,0.795661
2,1.7019,1.592467,0.794725,0.795724,0.794129,0.79427
3,1.3301,1.492009,0.808486,0.808438,0.808527,0.808456
4,1.1202,1.509098,0.81078,0.81106,0.810442,0.81057
5,0.9778,1.474583,0.817661,0.817641,0.817746,0.817641
6,0.8859,1.499106,0.816514,0.816563,0.816662,0.816505
7,0.8172,1.552991,0.813073,0.813301,0.813326,0.813073
8,0.7665,1.54081,0.816514,0.816546,0.816326,0.816397
9,0.7221,1.577288,0.818807,0.819083,0.819083,0.818807
10,0.6896,1.554563,0.818807,0.818765,0.818704,0.81873


[I 2025-03-29 05:24:56,739] Trial 59 finished with value: 0.818730100255243 and parameters: {'learning_rate': 4.1580499478084515e-05, 'weight_decay': 0.002, 'warmup_steps': 9, 'lambda_param': 0.4, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 60 with params: {'learning_rate': 6.905127566839599e-05, 'weight_decay': 0.001, 'warmup_steps': 16, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3595,1.647137,0.800459,0.800459,0.80056,0.800442
2,1.3855,1.470166,0.806193,0.806239,0.80598,0.806058
3,1.0338,1.465321,0.817661,0.817597,0.817662,0.81762
4,0.863,1.551011,0.824541,0.826111,0.823872,0.824073
5,0.7438,1.52487,0.822248,0.82227,0.822377,0.822236
6,0.6744,1.554514,0.818807,0.819302,0.819167,0.818804
7,0.6101,1.619296,0.819954,0.819901,0.819872,0.819886
8,0.5695,1.636655,0.817661,0.817956,0.817326,0.817459
9,0.5287,1.655372,0.822248,0.823097,0.822714,0.822229
10,0.5036,1.647444,0.817661,0.817595,0.81762,0.817607


[I 2025-03-29 05:31:03,382] Trial 60 finished with value: 0.8233350172602509 and parameters: {'learning_rate': 6.905127566839599e-05, 'weight_decay': 0.001, 'warmup_steps': 16, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 61 with params: {'learning_rate': 3.293272179510502e-05, 'weight_decay': 0.006, 'warmup_steps': 16, 'lambda_param': 0.5, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7786,1.888045,0.78555,0.785772,0.785205,0.785313
2,1.8629,1.658194,0.793578,0.79434,0.793045,0.793185
3,1.4899,1.529512,0.807339,0.807304,0.807401,0.807314
4,1.2667,1.528075,0.811927,0.812776,0.8114,0.811569
5,1.1135,1.474186,0.813073,0.813041,0.812947,0.812985


[I 2025-03-29 05:33:06,577] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 7.283898624298514e-05, 'weight_decay': 0.008, 'warmup_steps': 29, 'lambda_param': 0.5, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3462,1.635009,0.803899,0.80385,0.803938,0.803868
2,1.3563,1.463603,0.808486,0.808673,0.80819,0.808302
3,1.0076,1.463327,0.816514,0.816478,0.816578,0.81649
4,0.8399,1.551403,0.823394,0.824852,0.822746,0.822944
5,0.7232,1.541484,0.819954,0.820101,0.820167,0.819952
6,0.6551,1.569376,0.818807,0.819431,0.819209,0.818799
7,0.5909,1.63085,0.817661,0.817669,0.817494,0.817555
8,0.5509,1.651564,0.815367,0.815425,0.815157,0.815238
9,0.5107,1.676816,0.817661,0.818502,0.818125,0.817641
10,0.4855,1.669231,0.817661,0.817597,0.817662,0.81762


[I 2025-03-29 05:39:10,403] Trial 62 finished with value: 0.8210670314637483 and parameters: {'learning_rate': 7.283898624298514e-05, 'weight_decay': 0.008, 'warmup_steps': 29, 'lambda_param': 0.5, 'temperature': 4.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 63 with params: {'learning_rate': 5.799674988791627e-05, 'weight_decay': 0.0, 'warmup_steps': 15, 'lambda_param': 0.4, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4585,1.707602,0.795872,0.795997,0.795592,0.79569
2,1.4934,1.501392,0.805046,0.80512,0.804812,0.804898
3,1.1272,1.474702,0.811927,0.81191,0.811779,0.811828
4,0.9431,1.525525,0.827982,0.828748,0.827503,0.827688
5,0.8154,1.490251,0.817661,0.817739,0.817831,0.817655
6,0.7406,1.529663,0.817661,0.81789,0.817915,0.81766
7,0.6753,1.582151,0.821101,0.821041,0.821041,0.821041
8,0.6323,1.598231,0.823394,0.823433,0.823209,0.823282
9,0.5901,1.612853,0.816514,0.816891,0.816831,0.816513
10,0.5623,1.603143,0.826835,0.826773,0.82684,0.826796


[I 2025-03-29 05:45:18,795] Trial 63 finished with value: 0.8244579440359041 and parameters: {'learning_rate': 5.799674988791627e-05, 'weight_decay': 0.0, 'warmup_steps': 15, 'lambda_param': 0.4, 'temperature': 3.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 64 with params: {'learning_rate': 5.986275918990953e-05, 'weight_decay': 0.008, 'warmup_steps': 38, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4703,1.704977,0.798165,0.798295,0.797887,0.797986
2,1.4815,1.497755,0.802752,0.80277,0.80256,0.802627
3,1.1121,1.475677,0.811927,0.81191,0.811779,0.811828
4,0.9289,1.526775,0.831422,0.83245,0.830881,0.831084
5,0.8027,1.505807,0.819954,0.820101,0.820167,0.819952
6,0.7287,1.542955,0.816514,0.816891,0.816831,0.816513
7,0.6634,1.591856,0.818807,0.818746,0.818746,0.818746
8,0.6202,1.608597,0.822248,0.822314,0.82204,0.822124
9,0.5785,1.627863,0.818807,0.819186,0.819125,0.818806
10,0.551,1.620109,0.822248,0.822185,0.822251,0.822208


[I 2025-03-29 05:51:27,615] Trial 64 finished with value: 0.8233192116411863 and parameters: {'learning_rate': 5.986275918990953e-05, 'weight_decay': 0.008, 'warmup_steps': 38, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 65 with params: {'learning_rate': 7.873581912408055e-05, 'weight_decay': 0.0, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2727,1.618046,0.811927,0.811869,0.811947,0.811891
2,1.307,1.482477,0.81078,0.810723,0.810695,0.810708
3,0.9723,1.497007,0.816514,0.816563,0.816662,0.816505
4,0.8073,1.594864,0.823394,0.825787,0.822577,0.822764
5,0.6952,1.608028,0.818807,0.818994,0.819041,0.818806
6,0.6269,1.601951,0.816514,0.817135,0.816915,0.816505
7,0.5646,1.660205,0.811927,0.811891,0.81199,0.811902
8,0.525,1.669175,0.811927,0.812163,0.811611,0.811732
9,0.485,1.68608,0.817661,0.818842,0.81821,0.81762
10,0.4611,1.716239,0.817661,0.817597,0.817662,0.81762


[I 2025-03-29 05:57:33,729] Trial 65 finished with value: 0.8233610438808797 and parameters: {'learning_rate': 7.873581912408055e-05, 'weight_decay': 0.0, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 66 with params: {'learning_rate': 0.00044749265830226623, 'weight_decay': 0.003, 'warmup_steps': 42, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5807,1.457958,0.811927,0.81188,0.811821,0.811846
2,0.7125,1.94068,0.806193,0.806375,0.805896,0.806007
3,0.4728,1.952868,0.802752,0.802936,0.802981,0.802751
4,0.3491,2.011808,0.790138,0.790071,0.79013,0.790091
5,0.2827,2.090972,0.794725,0.798225,0.795687,0.79443
6,0.2301,2.043356,0.815367,0.815445,0.815536,0.815361
7,0.1925,2.133976,0.807339,0.807271,0.807317,0.80729
8,0.1658,1.991647,0.811927,0.811869,0.811947,0.811891
9,0.1475,2.050966,0.806193,0.806156,0.806064,0.806101
10,0.1291,2.333233,0.790138,0.792508,0.79093,0.789965


[I 2025-03-29 06:01:40,281] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 2.754583220865359e-05, 'weight_decay': 0.001, 'warmup_steps': 14, 'lambda_param': 0.5, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8749,1.980004,0.787844,0.78807,0.787499,0.787609
2,1.9889,1.711377,0.791284,0.791253,0.79113,0.791175
3,1.6172,1.572584,0.803899,0.803839,0.803812,0.803825
4,1.3881,1.555055,0.806193,0.807258,0.805601,0.805763
5,1.2294,1.491795,0.813073,0.813269,0.812779,0.812894


[I 2025-03-29 06:03:47,855] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 5.288130004639587e-05, 'weight_decay': 0.007, 'warmup_steps': 8, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5035,1.735837,0.793578,0.793775,0.793256,0.793365
2,1.5495,1.521128,0.801606,0.801962,0.801223,0.801354
3,1.18,1.481138,0.81078,0.810746,0.810653,0.81069
4,0.989,1.519278,0.822248,0.822783,0.82183,0.821993
5,0.8569,1.480088,0.818807,0.818807,0.818915,0.818792
6,0.7782,1.520098,0.819954,0.819977,0.820083,0.819943
7,0.7118,1.576715,0.818807,0.818741,0.818788,0.818761
8,0.668,1.586503,0.821101,0.821092,0.820956,0.821007
9,0.625,1.608133,0.817661,0.817985,0.817957,0.81766
10,0.5961,1.587421,0.825688,0.825623,0.825671,0.825643


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-29 06:09:56,464] Trial 68 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 5.288130004639587e-05, 'weight_decay': 0.007, 'warmup_steps': 8, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 69 with params: {'learning_rate': 5.776197337240459e-05, 'weight_decay': 0.007, 'warmup_steps': 14, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4597,1.708988,0.795872,0.795997,0.795592,0.79569
2,1.4957,1.502012,0.805046,0.80512,0.804812,0.804898
3,1.1294,1.475093,0.811927,0.81191,0.811779,0.811828
4,0.9451,1.525367,0.826835,0.827523,0.826377,0.826555
5,0.8171,1.489601,0.817661,0.817739,0.817831,0.817655
6,0.7422,1.528669,0.816514,0.8167,0.816747,0.816513
7,0.6768,1.58192,0.819954,0.819901,0.819872,0.819886
8,0.6339,1.597627,0.823394,0.823433,0.823209,0.823282
9,0.5915,1.612518,0.816514,0.816891,0.816831,0.816513
10,0.5638,1.602166,0.826835,0.826773,0.82684,0.826796


[I 2025-03-29 06:16:07,416] Trial 69 finished with value: 0.8244579440359041 and parameters: {'learning_rate': 5.776197337240459e-05, 'weight_decay': 0.007, 'warmup_steps': 14, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 70 with params: {'learning_rate': 1.9019739852416414e-05, 'weight_decay': 0.001, 'warmup_steps': 5, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0488,2.244318,0.760321,0.760988,0.759756,0.759841
2,2.2707,1.824009,0.792431,0.792454,0.792551,0.792418
3,1.873,1.695159,0.786697,0.786807,0.786415,0.786507
4,1.6406,1.628679,0.798165,0.79868,0.797718,0.797858
5,1.4798,1.570823,0.805046,0.805718,0.804559,0.804713
6,1.3579,1.517002,0.808486,0.808487,0.808316,0.808375
7,1.2716,1.503743,0.811927,0.811869,0.811947,0.811891
8,1.1928,1.498816,0.809633,0.809585,0.809527,0.809552
9,1.1391,1.484733,0.807339,0.807275,0.807275,0.807275
10,1.0955,1.489991,0.81078,0.81106,0.810442,0.81057


[I 2025-03-29 06:20:10,470] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.00011150517132664043, 'weight_decay': 0.0, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0824,1.551894,0.811927,0.812684,0.812368,0.811911
2,1.1188,1.499836,0.817661,0.819395,0.816947,0.817129
3,0.8239,1.515887,0.825688,0.825737,0.82584,0.82568
4,0.6766,1.582136,0.81078,0.813711,0.809853,0.809968
5,0.5773,1.633244,0.819954,0.820184,0.820209,0.819954
6,0.514,1.689271,0.808486,0.809168,0.808906,0.808474
7,0.452,1.774781,0.81078,0.810731,0.810821,0.81075
8,0.4169,1.786566,0.816514,0.816675,0.816241,0.81635
9,0.3801,1.825347,0.813073,0.814432,0.813663,0.813018
10,0.3564,1.849505,0.806193,0.806144,0.806233,0.806162


[I 2025-03-29 06:24:14,987] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 5.1133032728766223e-05, 'weight_decay': 0.007, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5204,1.74518,0.794725,0.794885,0.794424,0.794528
2,1.5698,1.529601,0.800459,0.800865,0.800055,0.80019
3,1.1997,1.482279,0.811927,0.81188,0.811821,0.811846
4,1.0062,1.517542,0.822248,0.822783,0.82183,0.821993
5,0.8727,1.477635,0.817661,0.817641,0.817746,0.817641
6,0.7923,1.516745,0.818807,0.818807,0.818915,0.818792
7,0.7256,1.574198,0.818807,0.818741,0.818788,0.818761
8,0.6811,1.581033,0.822248,0.822261,0.822083,0.822145
9,0.638,1.605787,0.816514,0.816789,0.816789,0.816514
10,0.6086,1.582627,0.826835,0.826772,0.826798,0.826784


[I 2025-03-29 06:30:24,978] Trial 72 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 5.1133032728766223e-05, 'weight_decay': 0.007, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 73 with params: {'learning_rate': 4.3374804421380576e-05, 'weight_decay': 0.008, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6053,1.788512,0.797018,0.797266,0.796676,0.796794
2,1.6708,1.578169,0.795872,0.79714,0.795213,0.795351
3,1.3009,1.488418,0.811927,0.811859,0.811905,0.811878
4,1.0952,1.511152,0.813073,0.813358,0.812737,0.812866
5,0.9552,1.474094,0.817661,0.817597,0.817662,0.81762


[I 2025-03-29 06:32:27,639] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 9.354579012866187e-05, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1754,1.585012,0.815367,0.815596,0.815621,0.815367
2,1.212,1.496431,0.809633,0.809782,0.809358,0.809464
3,0.8958,1.510957,0.822248,0.82227,0.822377,0.822236
4,0.7403,1.59557,0.817661,0.819867,0.816862,0.817035
5,0.6346,1.644177,0.817661,0.818353,0.818083,0.817649
6,0.5692,1.653128,0.816514,0.817434,0.816999,0.81649
7,0.5064,1.71991,0.811927,0.811927,0.812032,0.811911
8,0.4699,1.719381,0.819954,0.820017,0.819746,0.819829
9,0.4306,1.771032,0.81422,0.815891,0.814873,0.814141
10,0.4085,1.786339,0.81078,0.810802,0.810906,0.810768


[I 2025-03-29 06:36:33,445] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 5.9829333956504966e-05, 'weight_decay': 0.005, 'warmup_steps': 13, 'lambda_param': 0.2, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4385,1.696884,0.795872,0.795997,0.795592,0.79569
2,1.4735,1.495383,0.803899,0.804004,0.803644,0.803738
3,1.1097,1.472662,0.811927,0.81191,0.811779,0.811828
4,0.9281,1.53025,0.827982,0.829269,0.827376,0.827582
5,0.8019,1.496193,0.818807,0.818856,0.818957,0.818799
6,0.7284,1.532853,0.819954,0.820184,0.820209,0.819954
7,0.6634,1.585216,0.818807,0.818746,0.818746,0.818746
8,0.6205,1.602189,0.822248,0.82238,0.821998,0.822101
9,0.5786,1.616435,0.818807,0.819186,0.819125,0.818806
10,0.5515,1.608897,0.821101,0.821041,0.821041,0.821041


[I 2025-03-29 06:42:44,995] Trial 75 finished with value: 0.8244579440359041 and parameters: {'learning_rate': 5.9829333956504966e-05, 'weight_decay': 0.005, 'warmup_steps': 13, 'lambda_param': 0.2, 'temperature': 6.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 76 with params: {'learning_rate': 2.4333546860341735e-05, 'weight_decay': 0.005, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9326,2.057061,0.774083,0.774567,0.773607,0.773718
2,2.0763,1.744299,0.792431,0.792359,0.792382,0.79237
3,1.7015,1.610021,0.799312,0.799351,0.799097,0.799172
4,1.4713,1.577093,0.803899,0.804952,0.803307,0.803465
5,1.3111,1.515049,0.81422,0.814669,0.813821,0.81397
6,1.1946,1.482825,0.811927,0.81191,0.811779,0.811828
7,1.1146,1.483147,0.806193,0.80627,0.806359,0.806186
8,1.0435,1.487179,0.803899,0.80383,0.803854,0.803841
9,0.9931,1.484397,0.809633,0.809633,0.809737,0.809617
10,0.9518,1.493232,0.815367,0.815425,0.815157,0.815238


[I 2025-03-29 06:46:53,671] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 4.753316387686598e-05, 'weight_decay': 0.007, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5593,1.764487,0.794725,0.794885,0.794424,0.794528
2,1.6147,1.551103,0.802752,0.803558,0.802223,0.802377
3,1.2439,1.483976,0.813073,0.813017,0.812989,0.813002
4,1.045,1.513168,0.818807,0.81927,0.81841,0.818563
5,0.9083,1.474082,0.817661,0.817612,0.817704,0.817632
6,0.824,1.510372,0.816514,0.816563,0.816662,0.816505
7,0.7565,1.567732,0.815367,0.815348,0.815452,0.815347
8,0.7103,1.566961,0.818807,0.818972,0.818536,0.818646
9,0.6669,1.597602,0.815367,0.815691,0.815663,0.815367
10,0.6366,1.572272,0.825688,0.825682,0.825545,0.825596


[I 2025-03-29 06:53:04,712] Trial 77 finished with value: 0.8267526114341277 and parameters: {'learning_rate': 4.753316387686598e-05, 'weight_decay': 0.007, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 7.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 78 with params: {'learning_rate': 2.0050664717347798e-05, 'weight_decay': 0.007, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0222,2.199641,0.762615,0.763293,0.762051,0.762139
2,2.2253,1.803446,0.794725,0.794705,0.794803,0.794703
3,1.8349,1.676316,0.791284,0.791402,0.791004,0.791099
4,1.6039,1.616002,0.801606,0.802333,0.801097,0.801248
5,1.443,1.558649,0.805046,0.805718,0.804559,0.804713


[I 2025-03-29 06:55:07,966] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.00010177804621038942, 'weight_decay': 0.007, 'warmup_steps': 6, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1314,1.566302,0.815367,0.815691,0.815663,0.815367
2,1.1664,1.499518,0.811927,0.813956,0.811148,0.811306
3,0.859,1.493815,0.827982,0.827946,0.82805,0.827959
4,0.7085,1.564139,0.816514,0.819666,0.815568,0.815698
5,0.6057,1.612581,0.824541,0.825104,0.824924,0.824536
6,0.5421,1.638556,0.811927,0.813006,0.812453,0.811891
7,0.4792,1.725177,0.811927,0.811859,0.811905,0.811878
8,0.445,1.737262,0.817661,0.817786,0.81741,0.817511
9,0.4058,1.792651,0.81422,0.815304,0.814747,0.814185
10,0.3827,1.796987,0.806193,0.806144,0.806233,0.806162


[I 2025-03-29 06:59:16,925] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 4.55376183641086e-05, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5814,1.775707,0.797018,0.797266,0.796676,0.796794
2,1.6411,1.564165,0.799312,0.800512,0.798676,0.798823
3,1.2705,1.485334,0.81422,0.814153,0.8142,0.814172
4,1.0683,1.511792,0.816514,0.816858,0.816157,0.816296
5,0.93,1.473731,0.821101,0.821044,0.821125,0.821067
6,0.8433,1.506541,0.813073,0.813151,0.813242,0.813067
7,0.7754,1.562114,0.81422,0.81422,0.814326,0.814205
8,0.7278,1.55797,0.821101,0.821197,0.820872,0.820965
9,0.6842,1.590648,0.817661,0.817985,0.817957,0.81766
10,0.6533,1.565774,0.826835,0.826852,0.826671,0.826734


[I 2025-03-29 07:05:31,142] Trial 80 finished with value: 0.8255963283437546 and parameters: {'learning_rate': 4.55376183641086e-05, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 6.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 81 with params: {'learning_rate': 4.0978781233050886e-05, 'weight_decay': 0.005, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6402,1.804502,0.794725,0.794885,0.794424,0.794528
2,1.7086,1.59623,0.793578,0.794654,0.792961,0.793098
3,1.3385,1.493788,0.807339,0.807304,0.807401,0.807314
4,1.1283,1.511215,0.811927,0.812259,0.811569,0.811704
5,0.9856,1.475657,0.817661,0.817612,0.817704,0.817632
6,0.8932,1.498717,0.818807,0.818856,0.818957,0.818799
7,0.8245,1.548983,0.81422,0.814494,0.814494,0.81422
8,0.7732,1.538921,0.816514,0.816546,0.816326,0.816397
9,0.7285,1.572933,0.816514,0.816891,0.816831,0.816513
10,0.6958,1.552133,0.818807,0.818765,0.818704,0.81873


[I 2025-03-29 07:11:40,752] Trial 81 finished with value: 0.8175547376579559 and parameters: {'learning_rate': 4.0978781233050886e-05, 'weight_decay': 0.005, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 82 with params: {'learning_rate': 5.711389093170868e-05, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4518,1.712198,0.793578,0.793775,0.793256,0.793365
2,1.5003,1.504575,0.805046,0.805267,0.804728,0.804845
3,1.1356,1.476628,0.813073,0.813041,0.812947,0.812985
4,0.9509,1.529591,0.825688,0.826603,0.825166,0.825356
5,0.8223,1.489312,0.818807,0.818856,0.818957,0.818799
6,0.7469,1.524795,0.819954,0.819977,0.820083,0.819943
7,0.6818,1.581362,0.817661,0.817595,0.81762,0.817607
8,0.6385,1.597538,0.819954,0.820083,0.819704,0.819806
9,0.596,1.611858,0.815367,0.815799,0.815705,0.815365
10,0.5683,1.599393,0.825688,0.825631,0.825714,0.825655


[I 2025-03-29 07:17:49,944] Trial 82 finished with value: 0.8255963283437546 and parameters: {'learning_rate': 5.711389093170868e-05, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 83 with params: {'learning_rate': 2.3043021487289137e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 11, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9651,2.100938,0.774083,0.774688,0.773564,0.773675
2,2.1202,1.760385,0.795872,0.795812,0.795887,0.795833
3,1.7403,1.628387,0.799312,0.799409,0.799055,0.799147
4,1.5093,1.586274,0.802752,0.803715,0.802181,0.802336
5,1.3483,1.525536,0.813073,0.813462,0.812695,0.812837
6,1.2299,1.487974,0.808486,0.808451,0.808359,0.808395
7,1.1481,1.48529,0.806193,0.80627,0.806359,0.806186
8,1.0752,1.487578,0.805046,0.804977,0.805022,0.804996
9,1.0241,1.482174,0.811927,0.811927,0.812032,0.811911
10,0.9823,1.489975,0.81422,0.814377,0.813947,0.814055


[I 2025-03-29 07:21:56,115] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 6.225359863207699e-05, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.403,1.680715,0.799312,0.799409,0.799055,0.799147
2,1.4463,1.487781,0.803899,0.804004,0.803644,0.803738
3,1.0878,1.470281,0.81422,0.814206,0.814073,0.814122
4,0.9097,1.539558,0.825688,0.827161,0.82504,0.825243
5,0.7853,1.505474,0.819954,0.819977,0.820083,0.819943
6,0.7132,1.532899,0.819954,0.820101,0.820167,0.819952
7,0.6488,1.597291,0.817661,0.817607,0.817578,0.817591
8,0.6061,1.614076,0.819954,0.820162,0.819662,0.819781
9,0.5648,1.625202,0.819954,0.82039,0.820293,0.819952
10,0.5385,1.618982,0.821101,0.821041,0.821041,0.821041


[I 2025-03-29 07:28:15,547] Trial 84 finished with value: 0.823301543190383 and parameters: {'learning_rate': 6.225359863207699e-05, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 5.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 85 with params: {'learning_rate': 6.39995377037567e-05, 'weight_decay': 0.007, 'warmup_steps': 7, 'lambda_param': 0.4, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3924,1.669903,0.798165,0.798138,0.798013,0.798059
2,1.4296,1.482578,0.805046,0.805066,0.804854,0.804922
3,1.0729,1.469148,0.816514,0.816501,0.816368,0.816417
4,0.8968,1.542256,0.826835,0.828421,0.826166,0.826372
5,0.7738,1.509087,0.821101,0.821101,0.821209,0.821086
6,0.7026,1.537112,0.821101,0.821377,0.821377,0.821101
7,0.6381,1.601884,0.818807,0.818765,0.818704,0.81873
8,0.596,1.619634,0.819954,0.820162,0.819662,0.819781
9,0.555,1.631596,0.819954,0.820513,0.820336,0.819948
10,0.529,1.625702,0.821101,0.821041,0.821041,0.821041


[I 2025-03-29 07:34:35,733] Trial 85 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 6.39995377037567e-05, 'weight_decay': 0.007, 'warmup_steps': 7, 'lambda_param': 0.4, 'temperature': 5.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 86 with params: {'learning_rate': 7.37531771471803e-05, 'weight_decay': 0.006, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2972,1.629826,0.809633,0.809565,0.809611,0.809584
2,1.3447,1.470178,0.807339,0.807362,0.807148,0.807217
3,1.0016,1.463001,0.817661,0.817612,0.817704,0.817632
4,0.8351,1.564903,0.823394,0.825291,0.822661,0.822858
5,0.719,1.543746,0.819954,0.820101,0.820167,0.819952
6,0.6503,1.571406,0.81422,0.814711,0.814579,0.814216
7,0.5871,1.641222,0.81422,0.81425,0.814031,0.814102
8,0.5477,1.652279,0.815367,0.815657,0.815031,0.815163
9,0.5071,1.684139,0.815367,0.816057,0.815789,0.815355
10,0.4822,1.675113,0.816514,0.816456,0.816536,0.816479


[I 2025-03-29 07:38:48,472] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 4.6539544844268765e-05, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5691,1.769944,0.795872,0.796074,0.79555,0.795661
2,1.6273,1.557476,0.799312,0.800338,0.798718,0.798867
3,1.2569,1.484533,0.813073,0.813007,0.813031,0.813018
4,1.0564,1.512564,0.817661,0.818062,0.817283,0.81743
5,0.9189,1.473955,0.818807,0.81875,0.818831,0.818773
6,0.8335,1.508472,0.816514,0.816563,0.816662,0.816505
7,0.7658,1.564695,0.81422,0.81422,0.814326,0.814205
8,0.7189,1.562494,0.819954,0.820083,0.819704,0.819806
9,0.6754,1.594252,0.815367,0.815691,0.815663,0.815367
10,0.6448,1.569012,0.826835,0.826852,0.826671,0.826734


[I 2025-03-29 07:45:01,550] Trial 87 finished with value: 0.8255963283437546 and parameters: {'learning_rate': 4.6539544844268765e-05, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 88 with params: {'learning_rate': 8.939389629581151e-05, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1974,1.596941,0.81422,0.814269,0.814368,0.814211
2,1.2339,1.490361,0.811927,0.81208,0.811653,0.811759
3,0.9142,1.506668,0.821101,0.821212,0.821293,0.821097
4,0.7569,1.602089,0.817661,0.820124,0.81682,0.816984
5,0.6497,1.63675,0.819954,0.82039,0.820293,0.819952
6,0.5838,1.641056,0.817661,0.818502,0.818125,0.817641
7,0.5216,1.705021,0.809633,0.809682,0.809779,0.809624
8,0.4838,1.706805,0.818807,0.8189,0.818578,0.81867
9,0.4444,1.746377,0.815367,0.817151,0.816042,0.815279
10,0.4218,1.766512,0.813073,0.813096,0.8132,0.813061


[I 2025-03-29 07:49:08,763] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 4.155106814683351e-05, 'weight_decay': 0.007, 'warmup_steps': 7, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6388,1.800937,0.795872,0.796074,0.79555,0.795661
2,1.7017,1.59244,0.794725,0.795724,0.794129,0.79427
3,1.3303,1.492045,0.808486,0.808438,0.808527,0.808456
4,1.1205,1.509287,0.81078,0.81106,0.810442,0.81057
5,0.9781,1.474445,0.817661,0.817612,0.817704,0.817632
6,0.8863,1.499032,0.816514,0.816563,0.816662,0.816505
7,0.8176,1.552233,0.813073,0.813301,0.813326,0.813073
8,0.7669,1.54057,0.816514,0.816546,0.816326,0.816397
9,0.7224,1.576254,0.817661,0.817985,0.817957,0.81766
10,0.6899,1.554185,0.818807,0.818765,0.818704,0.81873


[I 2025-03-29 07:55:21,032] Trial 89 finished with value: 0.8175739418412338 and parameters: {'learning_rate': 4.155106814683351e-05, 'weight_decay': 0.007, 'warmup_steps': 7, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 90 with params: {'learning_rate': 6.142202590452693e-05, 'weight_decay': 0.008, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4176,1.686678,0.797018,0.797183,0.796718,0.796824
2,1.4558,1.490049,0.805046,0.805187,0.80477,0.804872
3,1.095,1.471593,0.811927,0.81191,0.811779,0.811828
4,0.9157,1.535391,0.826835,0.828213,0.826208,0.826413
5,0.7907,1.501442,0.821101,0.82115,0.821251,0.821092
6,0.7183,1.53363,0.821101,0.821377,0.821377,0.821101
7,0.6535,1.589901,0.817661,0.817607,0.817578,0.817591
8,0.6108,1.607838,0.821101,0.82127,0.82083,0.820942
9,0.5693,1.620875,0.819954,0.82039,0.820293,0.819952
10,0.5426,1.614677,0.822248,0.822196,0.822167,0.82218


[I 2025-03-29 08:01:29,237] Trial 90 finished with value: 0.8244579440359041 and parameters: {'learning_rate': 6.142202590452693e-05, 'weight_decay': 0.008, 'warmup_steps': 8, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 91 with params: {'learning_rate': 3.992873798941882e-05, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.656,1.812441,0.792431,0.792587,0.79213,0.792232
2,1.7262,1.604275,0.793578,0.794654,0.792961,0.793098
3,1.3561,1.497048,0.807339,0.807281,0.807359,0.807303
4,1.144,1.511758,0.811927,0.812259,0.811569,0.811704
5,1.0,1.476001,0.816514,0.816478,0.816578,0.81649
6,0.9063,1.495912,0.819954,0.819977,0.820083,0.819943
7,0.8374,1.544939,0.81422,0.814406,0.814452,0.814219
8,0.785,1.534059,0.818807,0.818841,0.81862,0.818692
9,0.74,1.567282,0.816514,0.816891,0.816831,0.816513
10,0.7068,1.548522,0.818807,0.818765,0.818704,0.81873


[I 2025-03-29 08:07:43,946] Trial 91 finished with value: 0.8187119728836396 and parameters: {'learning_rate': 3.992873798941882e-05, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 92 with params: {'learning_rate': 0.00038371788124985846, 'weight_decay': 0.003, 'warmup_steps': 43, 'lambda_param': 0.9, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6191,1.468707,0.801606,0.801683,0.80177,0.801599
2,0.739,1.815433,0.807339,0.807275,0.807275,0.807275
3,0.5073,1.872901,0.809633,0.809818,0.809864,0.809632
4,0.3753,1.925328,0.805046,0.80553,0.805401,0.805042
5,0.2994,2.010037,0.809633,0.809585,0.809527,0.809552


[I 2025-03-29 08:09:47,162] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 3.0725128262141896e-05, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8058,1.915222,0.783257,0.783473,0.78291,0.783017
2,1.9071,1.675085,0.794725,0.794967,0.794382,0.794497
3,1.5369,1.544749,0.806193,0.806144,0.806233,0.806162
4,1.3119,1.539647,0.809633,0.810472,0.809106,0.809271
5,1.1567,1.479654,0.81078,0.810783,0.810611,0.81067
6,1.0503,1.469167,0.81422,0.814153,0.8142,0.814172
7,0.9766,1.492795,0.808486,0.808632,0.808695,0.808484
8,0.914,1.494526,0.813073,0.813041,0.812947,0.812985
9,0.8661,1.508781,0.815367,0.815514,0.815578,0.815365
10,0.828,1.51136,0.815367,0.815425,0.815157,0.815238


[I 2025-03-29 08:13:52,750] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 6.128220142314674e-05, 'weight_decay': 0.007, 'warmup_steps': 10, 'lambda_param': 0.5, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4213,1.687904,0.797018,0.797183,0.796718,0.796824
2,1.4577,1.490467,0.805046,0.805187,0.80477,0.804872
3,1.0964,1.471673,0.811927,0.81191,0.811779,0.811828
4,0.9167,1.534281,0.827982,0.829269,0.827376,0.827582
5,0.7917,1.501329,0.818807,0.818856,0.818957,0.818799
6,0.7192,1.534172,0.822248,0.822575,0.822546,0.822247
7,0.6543,1.588726,0.817661,0.817607,0.817578,0.817591
8,0.6116,1.606837,0.821101,0.82127,0.82083,0.820942
9,0.5701,1.620381,0.819954,0.82039,0.820293,0.819952
10,0.5433,1.61421,0.822248,0.822196,0.822167,0.82218


[I 2025-03-29 08:20:03,123] Trial 94 finished with value: 0.8244579440359041 and parameters: {'learning_rate': 6.128220142314674e-05, 'weight_decay': 0.007, 'warmup_steps': 10, 'lambda_param': 0.5, 'temperature': 5.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 95 with params: {'learning_rate': 2.0235786228099924e-05, 'weight_decay': 0.003, 'warmup_steps': 35, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0402,2.213183,0.762615,0.763293,0.762051,0.762139
2,2.2312,1.804215,0.794725,0.794747,0.794845,0.794712
3,1.8334,1.675681,0.790138,0.790221,0.789877,0.789965
4,1.6005,1.612577,0.799312,0.800029,0.798802,0.79895
5,1.4386,1.555229,0.806193,0.806801,0.805727,0.80588
6,1.3167,1.505264,0.811927,0.81191,0.811779,0.811828
7,1.2309,1.49494,0.803899,0.803922,0.804022,0.803886
8,1.1535,1.492118,0.808486,0.808451,0.808359,0.808395
9,1.1005,1.480692,0.806193,0.806144,0.806233,0.806162
10,1.0577,1.485487,0.811927,0.812163,0.811611,0.811732


[I 2025-03-29 08:24:11,190] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 4.2762128019163814e-05, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6194,1.792682,0.797018,0.797183,0.796718,0.796824
2,1.682,1.583147,0.794725,0.795895,0.794087,0.794224
3,1.311,1.489393,0.81078,0.810715,0.810779,0.810738
4,1.1036,1.509974,0.813073,0.813358,0.812737,0.812866
5,0.9627,1.474243,0.817661,0.817597,0.817662,0.81762


[I 2025-03-29 08:26:15,379] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 3.5631162174406863e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.717,1.850968,0.791284,0.791477,0.790962,0.791069
2,1.8019,1.637876,0.791284,0.792521,0.790625,0.790752
3,1.4327,1.515974,0.805046,0.805046,0.805149,0.805029
4,1.2144,1.521315,0.808486,0.809103,0.808022,0.808177
5,1.0652,1.476221,0.816514,0.81647,0.81641,0.816436
6,0.9662,1.481581,0.817661,0.817739,0.817831,0.817655
7,0.8959,1.520633,0.81422,0.814406,0.814452,0.814219
8,0.839,1.514855,0.811927,0.811954,0.811737,0.811807
9,0.7926,1.53819,0.816514,0.816891,0.816831,0.816513
10,0.7572,1.529584,0.817661,0.817631,0.817536,0.817574


[I 2025-03-29 08:32:25,508] Trial 97 finished with value: 0.8152967721140121 and parameters: {'learning_rate': 3.5631162174406863e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 6.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 98 with params: {'learning_rate': 0.0001425495875889341, 'weight_decay': 0.01, 'warmup_steps': 10, 'lambda_param': 0.2, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9621,1.488798,0.801606,0.803337,0.802275,0.801511
2,1.0083,1.487937,0.815367,0.816486,0.814779,0.814958
3,0.7395,1.588777,0.818807,0.819302,0.819167,0.818804
4,0.5982,1.588491,0.813073,0.813708,0.812611,0.812772
5,0.5053,1.718367,0.811927,0.813006,0.812453,0.811891
6,0.4399,1.793903,0.807339,0.80878,0.807948,0.807275
7,0.3818,1.944325,0.799312,0.799629,0.799602,0.799312
8,0.3452,1.911878,0.807339,0.807566,0.807022,0.807141
9,0.3128,1.928295,0.809633,0.81174,0.810369,0.809512
10,0.2929,1.953779,0.800459,0.800423,0.800518,0.800432


[I 2025-03-29 08:36:33,427] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 3.796915148861297e-05, 'weight_decay': 0.002, 'warmup_steps': 13, 'lambda_param': 0.2, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6954,1.8309,0.791284,0.791477,0.790962,0.791069
2,1.764,1.621585,0.791284,0.792521,0.790625,0.790752
3,1.3918,1.504078,0.807339,0.807304,0.807401,0.807314
4,1.1759,1.512359,0.813073,0.813462,0.812695,0.812837
5,1.0289,1.474977,0.816514,0.816456,0.816536,0.816479


[I 2025-03-29 08:38:38,164] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.0004540061556428691, 'weight_decay': 0.003, 'warmup_steps': 24, 'lambda_param': 1.0, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5422,1.430372,0.81422,0.814406,0.814452,0.814219
2,0.704,1.885834,0.806193,0.806562,0.805812,0.805947
3,0.4731,1.874916,0.806193,0.806739,0.806569,0.806186
4,0.3516,1.981315,0.800459,0.8004,0.800476,0.800421
5,0.2829,1.991686,0.808486,0.808564,0.808653,0.80848
6,0.2287,2.111898,0.800459,0.801577,0.799844,0.799995
7,0.1953,2.092685,0.788991,0.788955,0.789046,0.788963
8,0.1658,2.045467,0.801606,0.80175,0.801812,0.801603
9,0.1477,2.076681,0.801606,0.802932,0.802191,0.801547
10,0.1314,2.042845,0.802752,0.802683,0.802728,0.802701


[I 2025-03-29 08:42:46,673] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 9.122504721242308e-05, 'weight_decay': 0.003, 'warmup_steps': 25, 'lambda_param': 0.9, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2161,1.597898,0.811927,0.8122,0.8122,0.811927
2,1.2251,1.484043,0.815367,0.815425,0.815157,0.815238
3,0.9048,1.515131,0.823394,0.823506,0.823588,0.823391
4,0.7482,1.592366,0.821101,0.822978,0.820367,0.820557
5,0.6423,1.647316,0.819954,0.820649,0.820378,0.819943
6,0.5768,1.651143,0.816514,0.817278,0.816957,0.816498
7,0.5138,1.724512,0.81422,0.814269,0.814368,0.814211
8,0.4776,1.712223,0.815367,0.815489,0.815115,0.815215
9,0.4381,1.75526,0.815367,0.817151,0.816042,0.815279
10,0.4158,1.772547,0.815367,0.815514,0.815578,0.815365


[I 2025-03-29 08:46:55,145] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 5.658976697281868e-05, 'weight_decay': 0.004, 'warmup_steps': 17, 'lambda_param': 0.9, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4747,1.716415,0.793578,0.793775,0.793256,0.793365
2,1.5093,1.506161,0.805046,0.805187,0.80477,0.804872
3,1.1414,1.477235,0.811927,0.81191,0.811779,0.811828
4,0.9553,1.521725,0.826835,0.827385,0.826419,0.826587
5,0.8264,1.486412,0.816514,0.816625,0.816705,0.81651
6,0.7504,1.527364,0.818807,0.818918,0.818999,0.818804
7,0.6848,1.580441,0.818807,0.818746,0.818746,0.818746
8,0.6417,1.595293,0.822248,0.822221,0.822125,0.822163
9,0.5993,1.611765,0.816514,0.816891,0.816831,0.816513
10,0.5711,1.599348,0.826835,0.826787,0.826882,0.826807


[I 2025-03-29 08:53:08,138] Trial 102 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 5.658976697281868e-05, 'weight_decay': 0.004, 'warmup_steps': 17, 'lambda_param': 0.9, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 103 with params: {'learning_rate': 8.487287964854836e-05, 'weight_decay': 0.005, 'warmup_steps': 7, 'lambda_param': 0.5, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2322,1.60643,0.811927,0.811891,0.81199,0.811902
2,1.2624,1.483992,0.813073,0.813078,0.812905,0.812965
3,0.9367,1.504104,0.816514,0.816563,0.816662,0.816505
4,0.7767,1.59997,0.824541,0.826815,0.823745,0.823939
5,0.6675,1.628715,0.816514,0.816891,0.816831,0.816513


[I 2025-03-29 08:55:14,688] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 4.471564007079742e-05, 'weight_decay': 0.007, 'warmup_steps': 8, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5983,1.780846,0.799312,0.799481,0.799013,0.799119
2,1.6546,1.570291,0.799312,0.800512,0.798676,0.798823
3,1.2828,1.486344,0.81422,0.814162,0.814242,0.814185
4,1.0786,1.509243,0.81422,0.814461,0.813905,0.814028
5,0.9393,1.473041,0.818807,0.81875,0.818831,0.818773
6,0.8514,1.504877,0.81422,0.814331,0.81441,0.814216
7,0.7833,1.560949,0.815367,0.81539,0.815494,0.815355
8,0.7352,1.55385,0.821101,0.821137,0.820914,0.820987
9,0.6915,1.589,0.818807,0.819186,0.819125,0.818806
10,0.6603,1.563592,0.826835,0.826852,0.826671,0.826734


[I 2025-03-29 09:01:28,305] Trial 104 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 4.471564007079742e-05, 'weight_decay': 0.007, 'warmup_steps': 8, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 105 with params: {'learning_rate': 3.5168576861786074e-05, 'weight_decay': 0.01, 'warmup_steps': 14, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7396,1.859886,0.788991,0.789178,0.788667,0.788773
2,1.8166,1.642497,0.788991,0.790397,0.788288,0.788402
3,1.4442,1.517478,0.805046,0.805046,0.805149,0.805029
4,1.2241,1.519579,0.808486,0.809103,0.808022,0.808177
5,1.0736,1.474528,0.817661,0.817607,0.817578,0.817591


[I 2025-03-29 09:03:31,726] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 7.122930804692333e-05, 'weight_decay': 0.007, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3211,1.637565,0.807339,0.807304,0.807401,0.807314
2,1.3641,1.468762,0.806193,0.806191,0.806022,0.80608
3,1.0184,1.462655,0.818807,0.81875,0.818831,0.818773
4,0.8497,1.560095,0.825688,0.827161,0.82504,0.825243
5,0.732,1.534887,0.821101,0.82115,0.821251,0.821092
6,0.6629,1.56104,0.817661,0.81789,0.817915,0.81766
7,0.5995,1.632099,0.818807,0.818841,0.81862,0.818692
8,0.5595,1.64369,0.817661,0.817956,0.817326,0.817459
9,0.5186,1.66752,0.815367,0.815921,0.815747,0.815361
10,0.4938,1.66043,0.816514,0.816447,0.816494,0.816466


[I 2025-03-29 09:09:44,474] Trial 106 finished with value: 0.8210670314637483 and parameters: {'learning_rate': 7.122930804692333e-05, 'weight_decay': 0.007, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 5.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 107 with params: {'learning_rate': 0.00031471170796524536, 'weight_decay': 0.008, 'warmup_steps': 8, 'lambda_param': 0.5, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6024,1.449678,0.81078,0.810723,0.810695,0.810708
2,0.7691,1.786569,0.809633,0.809959,0.809274,0.809407
3,0.5311,1.823716,0.819954,0.819889,0.819915,0.819901
4,0.3997,1.822692,0.808486,0.808862,0.808106,0.808244
5,0.3281,1.952712,0.809633,0.809597,0.809695,0.809608


[I 2025-03-29 09:11:47,744] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 1.7064665691774607e-05, 'weight_decay': 0.007, 'warmup_steps': 7, 'lambda_param': 0.2, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0931,2.33128,0.74656,0.746938,0.746074,0.746151
2,2.362,1.872413,0.787844,0.788066,0.788088,0.787844
3,1.9514,1.736415,0.783257,0.783395,0.782952,0.783049
4,1.7166,1.659683,0.797018,0.79747,0.796592,0.796727
5,1.5562,1.601991,0.798165,0.798949,0.797634,0.797781


[I 2025-03-29 09:13:51,737] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 2.2890061022371275e-05, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9621,2.100765,0.774083,0.774688,0.773564,0.773675
2,2.1221,1.761682,0.795872,0.795812,0.795887,0.795833
3,1.7436,1.630664,0.799312,0.799409,0.799055,0.799147
4,1.5131,1.58814,0.802752,0.803715,0.802181,0.802336
5,1.3523,1.527647,0.811927,0.812368,0.811526,0.811673
6,1.2341,1.489479,0.808486,0.808451,0.808359,0.808395
7,1.1523,1.48629,0.806193,0.80627,0.806359,0.806186
8,1.0793,1.488551,0.806193,0.806134,0.806106,0.806119
9,1.0281,1.482728,0.811927,0.811927,0.812032,0.811911
10,0.9862,1.490476,0.813073,0.813192,0.812821,0.81292


[I 2025-03-29 09:18:03,025] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 4.393785846446947e-05, 'weight_decay': 0.007, 'warmup_steps': 9, 'lambda_param': 0.5, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6093,1.785666,0.798165,0.798372,0.797845,0.797957
2,1.6662,1.575632,0.797018,0.798389,0.79634,0.796476
3,1.2942,1.48722,0.811927,0.811859,0.811905,0.811878
4,1.0885,1.508682,0.81422,0.814461,0.813905,0.814028
5,0.9484,1.473304,0.817661,0.817597,0.817662,0.81762
6,0.8595,1.503563,0.816514,0.816563,0.816662,0.816505
7,0.7912,1.559323,0.81422,0.814269,0.814368,0.814211
8,0.7425,1.550496,0.817661,0.817721,0.817452,0.817534
9,0.6987,1.58612,0.818807,0.819186,0.819125,0.818806
10,0.6672,1.56146,0.825688,0.825649,0.825587,0.825614


[I 2025-03-29 09:24:15,490] Trial 110 finished with value: 0.823301543190383 and parameters: {'learning_rate': 4.393785846446947e-05, 'weight_decay': 0.007, 'warmup_steps': 9, 'lambda_param': 0.5, 'temperature': 7.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 111 with params: {'learning_rate': 8.440743911611171e-05, 'weight_decay': 0.005, 'warmup_steps': 17, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2486,1.608647,0.807339,0.807271,0.807317,0.80729
2,1.2673,1.481631,0.813073,0.813041,0.812947,0.812985
3,0.9391,1.505806,0.817661,0.817683,0.817789,0.817649
4,0.7785,1.592082,0.825688,0.827847,0.824914,0.825113
5,0.6693,1.629343,0.818807,0.819302,0.819167,0.818804
6,0.6025,1.623487,0.821101,0.821871,0.821546,0.821086
7,0.5401,1.685704,0.811927,0.811975,0.812074,0.811918
8,0.5021,1.686329,0.817661,0.817786,0.81741,0.817511
9,0.4622,1.711864,0.815367,0.816935,0.815999,0.815297
10,0.4394,1.74027,0.817661,0.817641,0.817746,0.817641


[I 2025-03-29 09:28:24,217] Trial 111 pruned. 


Trial 112 with params: {'learning_rate': 7.958553193484505e-05, 'weight_decay': 0.006, 'warmup_steps': 23, 'lambda_param': 0.9, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2895,1.619868,0.803899,0.804044,0.804107,0.803897
2,1.3086,1.476976,0.809633,0.809569,0.809569,0.809569
3,0.97,1.486483,0.817661,0.817683,0.817789,0.817649
4,0.8042,1.573087,0.825688,0.827847,0.824914,0.825113
5,0.6925,1.595807,0.821101,0.82148,0.82142,0.8211
6,0.6239,1.590353,0.819954,0.820513,0.820336,0.819948
7,0.5611,1.651421,0.816514,0.816456,0.816536,0.816479
8,0.5212,1.657454,0.816514,0.816675,0.816241,0.81635
9,0.4819,1.685459,0.818807,0.820085,0.819378,0.818761
10,0.4575,1.711079,0.817661,0.817612,0.817704,0.817632


[I 2025-03-29 09:34:41,109] Trial 112 finished with value: 0.817648799542307 and parameters: {'learning_rate': 7.958553193484505e-05, 'weight_decay': 0.006, 'warmup_steps': 23, 'lambda_param': 0.9, 'temperature': 4.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 113 with params: {'learning_rate': 4.546434510128894e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 8, 'lambda_param': 0.2, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5889,1.776396,0.797018,0.797183,0.796718,0.796824
2,1.6441,1.565139,0.799312,0.800512,0.798676,0.798823
3,1.2723,1.485608,0.81422,0.814153,0.8142,0.814172
4,1.0694,1.509963,0.816514,0.816858,0.816157,0.816296
5,0.9307,1.472995,0.821101,0.821044,0.821125,0.821067
6,0.8438,1.506364,0.81422,0.814331,0.81441,0.814216
7,0.7758,1.563093,0.81422,0.81422,0.814326,0.814205
8,0.7283,1.557251,0.821101,0.821197,0.820872,0.820965
9,0.6847,1.591369,0.816514,0.816891,0.816831,0.816513
10,0.6538,1.565964,0.826835,0.826852,0.826671,0.826734


[I 2025-03-29 09:40:57,993] Trial 113 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 4.546434510128894e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 8, 'lambda_param': 0.2, 'temperature': 5.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 114 with params: {'learning_rate': 7.806615700810692e-05, 'weight_decay': 0.007, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2691,1.618651,0.813073,0.813009,0.813074,0.813032
2,1.3115,1.483055,0.809633,0.809569,0.809569,0.809569
3,0.9765,1.495875,0.815367,0.81539,0.815494,0.815355
4,0.811,1.597236,0.819954,0.822442,0.819115,0.819287
5,0.6984,1.606961,0.817661,0.81789,0.817915,0.81766
6,0.6301,1.600079,0.817661,0.818094,0.817999,0.817658
7,0.5677,1.661951,0.813073,0.813025,0.813116,0.813044
8,0.528,1.669875,0.81422,0.814461,0.813905,0.814028
9,0.488,1.686978,0.818807,0.819901,0.819336,0.818773
10,0.4638,1.716067,0.81422,0.814162,0.814242,0.814185


[I 2025-03-29 09:45:06,299] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 4.563581482424318e-05, 'weight_decay': 0.008, 'warmup_steps': 8, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5868,1.775466,0.797018,0.797183,0.796718,0.796824
2,1.6417,1.563935,0.799312,0.800512,0.798676,0.798823
3,1.2699,1.485458,0.81422,0.814153,0.8142,0.814172
4,1.0674,1.510179,0.816514,0.816858,0.816157,0.816296
5,0.9288,1.473091,0.821101,0.821044,0.821125,0.821067
6,0.8421,1.506642,0.81422,0.814331,0.81441,0.814216
7,0.7741,1.563589,0.81422,0.81422,0.814326,0.814205
8,0.7268,1.558023,0.821101,0.821197,0.820872,0.820965
9,0.6832,1.592042,0.816514,0.816891,0.816831,0.816513
10,0.6523,1.566443,0.826835,0.826852,0.826671,0.826734


[I 2025-03-29 09:51:20,822] Trial 115 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 4.563581482424318e-05, 'weight_decay': 0.008, 'warmup_steps': 8, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 116 with params: {'learning_rate': 6.643822835431652e-05, 'weight_decay': 0.008, 'warmup_steps': 7, 'lambda_param': 0.2, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3707,1.657216,0.798165,0.798097,0.798097,0.798097
2,1.4067,1.476728,0.801606,0.8016,0.801433,0.80149
3,1.0533,1.467038,0.817661,0.817607,0.817578,0.817591
4,0.8799,1.549585,0.824541,0.826111,0.823872,0.824073
5,0.7589,1.515848,0.823394,0.823394,0.823503,0.82338
6,0.6886,1.543014,0.821101,0.821377,0.821377,0.821101
7,0.6244,1.61192,0.818807,0.818765,0.818704,0.81873
8,0.583,1.628732,0.817661,0.817956,0.817326,0.817459
9,0.5421,1.642737,0.819954,0.820649,0.820378,0.819943
10,0.5167,1.636951,0.819954,0.819889,0.819915,0.819901


[I 2025-03-29 09:57:35,499] Trial 116 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 6.643822835431652e-05, 'weight_decay': 0.008, 'warmup_steps': 7, 'lambda_param': 0.2, 'temperature': 6.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 117 with params: {'learning_rate': 7.15073888525604e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 11, 'lambda_param': 0.1, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.334,1.638669,0.802752,0.802752,0.802854,0.802736
2,1.3642,1.466261,0.808486,0.808536,0.808274,0.808353
3,1.0166,1.463185,0.818807,0.81875,0.818831,0.818773
4,0.848,1.55611,0.825688,0.827161,0.82504,0.825243
5,0.7305,1.53414,0.822248,0.822326,0.822419,0.822242
6,0.6615,1.564158,0.819954,0.82028,0.820251,0.819954
7,0.5976,1.628035,0.817661,0.817669,0.817494,0.817555
8,0.5579,1.642917,0.819954,0.820162,0.819662,0.819781
9,0.5171,1.667777,0.818807,0.819574,0.819251,0.818792
10,0.4923,1.660472,0.817661,0.817597,0.817662,0.81762


[I 2025-03-29 10:01:43,938] Trial 117 pruned. 


Trial 118 with params: {'learning_rate': 0.0004838234070984164, 'weight_decay': 0.006, 'warmup_steps': 9, 'lambda_param': 0.0, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4776,1.52579,0.809633,0.811987,0.810411,0.809489
2,0.6932,1.851123,0.808486,0.808976,0.808064,0.808212
3,0.4649,1.953027,0.819954,0.819891,0.819957,0.819914
4,0.347,1.892196,0.803899,0.805523,0.803181,0.803328
5,0.2762,2.184135,0.801606,0.801831,0.801854,0.801605


[I 2025-03-29 10:03:47,272] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 1.0704036787379217e-05, 'weight_decay': 0.003, 'warmup_steps': 35, 'lambda_param': 0.4, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2241,2.685822,0.698394,0.703721,0.696756,0.69522
2,2.7725,2.172384,0.771789,0.773406,0.772449,0.771681
3,2.3266,1.920143,0.779817,0.779753,0.7797,0.779723
4,2.0633,1.814999,0.783257,0.78333,0.782994,0.783079
5,1.8967,1.761667,0.784404,0.78499,0.78391,0.784036
6,1.7667,1.691937,0.792431,0.792386,0.792298,0.792333
7,1.68,1.64998,0.798165,0.798112,0.798055,0.798079
8,1.594,1.622757,0.803899,0.803943,0.803686,0.803763
9,1.5353,1.591915,0.808486,0.808451,0.808359,0.808395
10,1.485,1.58251,0.802752,0.802969,0.802433,0.802549


[I 2025-03-29 10:07:57,733] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 3.931465750764011e-05, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6658,1.81733,0.791284,0.791402,0.791004,0.791099
2,1.7369,1.60955,0.793578,0.794654,0.792961,0.793098
3,1.3667,1.499202,0.808486,0.808438,0.808527,0.808456
4,1.1535,1.512133,0.81422,0.814669,0.813821,0.81397
5,1.0087,1.476215,0.817661,0.817612,0.817704,0.817632
6,0.9142,1.494146,0.819954,0.819977,0.820083,0.819943
7,0.8451,1.542271,0.81422,0.814406,0.814452,0.814219
8,0.7921,1.531135,0.819954,0.820017,0.819746,0.819829
9,0.7469,1.563786,0.815367,0.815799,0.815705,0.815365
10,0.7134,1.546119,0.819954,0.819926,0.81983,0.819869


[I 2025-03-29 10:14:12,399] Trial 120 finished with value: 0.816417187730268 and parameters: {'learning_rate': 3.931465750764011e-05, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 121 with params: {'learning_rate': 5.4996185653935916e-05, 'weight_decay': 0.008, 'warmup_steps': 22, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4965,1.726882,0.793578,0.793775,0.793256,0.793365
2,1.5287,1.513049,0.803899,0.804163,0.803559,0.803682
3,1.1584,1.480211,0.811927,0.81191,0.811779,0.811828
4,0.9697,1.518595,0.823394,0.823871,0.822998,0.823156
5,0.8395,1.485498,0.817661,0.817683,0.817789,0.817649
6,0.7623,1.527864,0.819954,0.820032,0.820125,0.819948
7,0.6961,1.581702,0.818807,0.818741,0.818788,0.818761
8,0.6527,1.593463,0.822248,0.822221,0.822125,0.822163
9,0.6101,1.61349,0.816514,0.816891,0.816831,0.816513
10,0.5815,1.598017,0.825688,0.825653,0.825756,0.825665


[I 2025-03-29 10:20:24,371] Trial 121 finished with value: 0.8244745722574152 and parameters: {'learning_rate': 5.4996185653935916e-05, 'weight_decay': 0.008, 'warmup_steps': 22, 'lambda_param': 0.2, 'temperature': 5.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 122 with params: {'learning_rate': 5.013479350078533e-05, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5281,1.750355,0.794725,0.794815,0.794466,0.794556
2,1.5813,1.534846,0.800459,0.800982,0.800013,0.800155
3,1.2114,1.48262,0.811927,0.81188,0.811821,0.811846
4,1.0165,1.516977,0.818807,0.819396,0.818367,0.818531
5,0.8823,1.476889,0.817661,0.817641,0.817746,0.817641
6,0.8008,1.514669,0.816514,0.816563,0.816662,0.816505
7,0.7339,1.572461,0.818807,0.81875,0.818831,0.818773
8,0.689,1.577845,0.821101,0.821197,0.820872,0.820965
9,0.6458,1.603978,0.816514,0.816891,0.816831,0.816513
10,0.6162,1.579607,0.826835,0.826785,0.826755,0.826769


[I 2025-03-29 10:26:34,467] Trial 122 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 5.013479350078533e-05, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 5.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 123 with params: {'learning_rate': 7.183646508196148e-05, 'weight_decay': 0.004, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3182,1.637196,0.809633,0.809597,0.809695,0.809608
2,1.3612,1.467794,0.807339,0.807362,0.807148,0.807217
3,1.0152,1.461297,0.818807,0.81875,0.818831,0.818773
4,0.8464,1.561988,0.823394,0.825064,0.822704,0.822902
5,0.7292,1.537138,0.822248,0.822326,0.822419,0.822242
6,0.6598,1.564632,0.817661,0.81789,0.817915,0.81766
7,0.5965,1.633972,0.818807,0.818841,0.81862,0.818692
8,0.5567,1.644416,0.816514,0.816858,0.816157,0.816296
9,0.5159,1.67379,0.816514,0.817278,0.816957,0.816498
10,0.491,1.665089,0.817661,0.817597,0.817662,0.81762


[I 2025-03-29 10:32:47,733] Trial 123 finished with value: 0.8210670314637483 and parameters: {'learning_rate': 7.183646508196148e-05, 'weight_decay': 0.004, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 124 with params: {'learning_rate': 4.126689200396419e-05, 'weight_decay': 0.008, 'warmup_steps': 11, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6467,1.803706,0.794725,0.794885,0.794424,0.794528
2,1.7076,1.59535,0.794725,0.795724,0.794129,0.79427
3,1.3354,1.492834,0.807339,0.807304,0.807401,0.807314
4,1.1249,1.509075,0.81078,0.81106,0.810442,0.81057
5,0.982,1.474674,0.818807,0.818807,0.818915,0.818792
6,0.8897,1.498329,0.817661,0.817739,0.817831,0.817655
7,0.8209,1.552159,0.81422,0.814494,0.814494,0.81422
8,0.7699,1.539411,0.816514,0.816546,0.816326,0.816397
9,0.7254,1.576073,0.818807,0.819083,0.819083,0.818807
10,0.6927,1.553741,0.818807,0.818765,0.818704,0.81873


[I 2025-03-29 10:36:53,558] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 7.41519116745938e-05, 'weight_decay': 0.008, 'warmup_steps': 12, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.315,1.632233,0.805046,0.805156,0.805233,0.805042
2,1.3472,1.466106,0.809633,0.809615,0.809485,0.809533
3,1.0011,1.460024,0.818807,0.818772,0.818873,0.818784
4,0.8333,1.558449,0.824541,0.826332,0.82383,0.82403
5,0.7178,1.545991,0.817661,0.817808,0.817873,0.817658
6,0.6484,1.572841,0.818807,0.819302,0.819167,0.818804
7,0.5853,1.632493,0.817661,0.817721,0.817452,0.817534
8,0.5458,1.649063,0.817661,0.817956,0.817326,0.817459
9,0.5052,1.689548,0.818807,0.819731,0.819294,0.818784
10,0.4803,1.67889,0.817661,0.817612,0.817704,0.817632


[I 2025-03-29 10:43:06,260] Trial 125 finished with value: 0.8245022789605572 and parameters: {'learning_rate': 7.41519116745938e-05, 'weight_decay': 0.008, 'warmup_steps': 12, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 126 with params: {'learning_rate': 3.994436916390084e-05, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6569,1.812512,0.792431,0.792587,0.79213,0.792232
2,1.7264,1.604366,0.793578,0.794654,0.792961,0.793098
3,1.356,1.49696,0.807339,0.807281,0.807359,0.807303
4,1.1438,1.511541,0.811927,0.812259,0.811569,0.811704
5,0.9998,1.475889,0.817661,0.817612,0.817704,0.817632
6,0.9061,1.495647,0.819954,0.819977,0.820083,0.819943
7,0.8371,1.544983,0.81422,0.814406,0.814452,0.814219
8,0.7848,1.53388,0.818807,0.818841,0.81862,0.818692
9,0.7398,1.56733,0.816514,0.816891,0.816831,0.816513
10,0.7066,1.54848,0.819954,0.819926,0.81983,0.819869


[I 2025-03-29 10:49:18,596] Trial 126 finished with value: 0.8175739418412338 and parameters: {'learning_rate': 3.994436916390084e-05, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 3.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 127 with params: {'learning_rate': 6.024366240428301e-05, 'weight_decay': 0.008, 'warmup_steps': 11, 'lambda_param': 0.2, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4323,1.694331,0.795872,0.795997,0.795592,0.79569
2,1.4687,1.494145,0.803899,0.804004,0.803644,0.803738
3,1.1058,1.472286,0.811927,0.81191,0.811779,0.811828
4,0.9248,1.532044,0.826835,0.828213,0.826208,0.826413
5,0.7989,1.497658,0.818807,0.818856,0.818957,0.818799
6,0.7258,1.53301,0.822248,0.822478,0.822504,0.822247
7,0.6607,1.586144,0.818807,0.818746,0.818746,0.818746
8,0.618,1.603615,0.821101,0.82127,0.82083,0.820942
9,0.5762,1.61752,0.818807,0.819186,0.819125,0.818806
10,0.5492,1.610259,0.822248,0.822196,0.822167,0.82218


[I 2025-03-29 10:55:25,913] Trial 127 finished with value: 0.8244579440359041 and parameters: {'learning_rate': 6.024366240428301e-05, 'weight_decay': 0.008, 'warmup_steps': 11, 'lambda_param': 0.2, 'temperature': 4.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 128 with params: {'learning_rate': 5.085042171729124e-05, 'weight_decay': 0.008, 'warmup_steps': 6, 'lambda_param': 0.1, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5235,1.746641,0.794725,0.794815,0.794466,0.794556
2,1.5733,1.531282,0.801606,0.802073,0.801181,0.801321
3,1.203,1.48237,0.811927,0.81188,0.811821,0.811846
4,1.0091,1.516971,0.821101,0.82157,0.820704,0.82086
5,0.8754,1.477245,0.818807,0.818807,0.818915,0.818792
6,0.7947,1.51631,0.819954,0.819977,0.820083,0.819943
7,0.7278,1.573711,0.818807,0.818741,0.818788,0.818761
8,0.6833,1.580082,0.823394,0.823433,0.823209,0.823282
9,0.6402,1.605372,0.816514,0.816789,0.816789,0.816514
10,0.6108,1.581688,0.826835,0.826772,0.826798,0.826784


[I 2025-03-29 11:01:38,657] Trial 128 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 5.085042171729124e-05, 'weight_decay': 0.008, 'warmup_steps': 6, 'lambda_param': 0.1, 'temperature': 6.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 129 with params: {'learning_rate': 4.391795313898165e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 10, 'lambda_param': 0.2, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6105,1.785799,0.798165,0.798372,0.797845,0.797957
2,1.6668,1.575921,0.797018,0.798389,0.79634,0.796476
3,1.2946,1.487191,0.811927,0.811859,0.811905,0.811878
4,1.0888,1.508412,0.81422,0.814461,0.813905,0.814028
5,0.9487,1.473263,0.816514,0.816456,0.816536,0.816479


[I 2025-03-29 11:03:41,600] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 4.693907671796952e-05, 'weight_decay': 0.008, 'warmup_steps': 14, 'lambda_param': 0.1, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.577,1.768784,0.798165,0.798295,0.797887,0.797986
2,1.6257,1.556261,0.799312,0.800338,0.798718,0.798867
3,1.2529,1.48459,0.815367,0.815301,0.815326,0.815312
4,1.0522,1.509662,0.816514,0.816858,0.816157,0.816296
5,0.9148,1.473871,0.816514,0.816478,0.816578,0.81649


[I 2025-03-29 11:05:44,009] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.00010451766015747133, 'weight_decay': 0.007, 'warmup_steps': 8, 'lambda_param': 0.1, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1201,1.562648,0.81422,0.814711,0.814579,0.814216
2,1.1524,1.498303,0.809633,0.811412,0.808895,0.809054
3,0.8486,1.499826,0.825688,0.825688,0.825798,0.825673
4,0.6988,1.569447,0.813073,0.816034,0.812147,0.812271
5,0.5971,1.619542,0.824541,0.825104,0.824924,0.824536
6,0.5337,1.649574,0.813073,0.81407,0.813579,0.813044
7,0.4707,1.737827,0.81078,0.810731,0.810821,0.81075
8,0.4368,1.755246,0.821101,0.82127,0.82083,0.820942
9,0.3982,1.803018,0.81422,0.815304,0.814747,0.814185
10,0.3745,1.81689,0.808486,0.808438,0.808527,0.808456


[I 2025-03-29 11:09:57,380] Trial 131 pruned. 


Trial 132 with params: {'learning_rate': 7.298743902935234e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3092,1.634173,0.808486,0.808467,0.808569,0.808466
2,1.3539,1.468398,0.806193,0.806191,0.806022,0.80608
3,1.0085,1.462088,0.818807,0.818772,0.818873,0.818784
4,0.8401,1.56349,0.824541,0.826111,0.823872,0.824073
5,0.7237,1.544244,0.818807,0.818994,0.819041,0.818806
6,0.6541,1.568282,0.816514,0.816789,0.816789,0.816514
7,0.5913,1.636411,0.81422,0.81425,0.814031,0.814102
8,0.5513,1.647457,0.816514,0.816969,0.816115,0.816266
9,0.5108,1.683766,0.817661,0.818502,0.818125,0.817641
10,0.4858,1.675906,0.816514,0.816456,0.816536,0.816479


[I 2025-03-29 11:14:01,735] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 6.557558100496577e-05, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3741,1.660896,0.798165,0.798095,0.798139,0.798113
2,1.4142,1.479069,0.803899,0.803896,0.803728,0.803785
3,1.0602,1.467108,0.818807,0.818765,0.818704,0.81873
4,0.8859,1.548577,0.825688,0.827375,0.824998,0.825202
5,0.7642,1.513839,0.822248,0.822228,0.822335,0.822229
6,0.6935,1.539357,0.821101,0.821377,0.821377,0.821101
7,0.6293,1.610421,0.818807,0.818765,0.818704,0.81873
8,0.5876,1.626202,0.817661,0.817956,0.817326,0.817459
9,0.5468,1.638969,0.818807,0.819431,0.819209,0.818799
10,0.5211,1.633709,0.821101,0.821035,0.821083,0.821055


[I 2025-03-29 11:20:13,213] Trial 133 finished with value: 0.8256137673341579 and parameters: {'learning_rate': 6.557558100496577e-05, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 4.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 134 with params: {'learning_rate': 2.935107689661603e-05, 'weight_decay': 0.008, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8294,1.937957,0.78555,0.785862,0.785162,0.785279
2,1.9388,1.6903,0.791284,0.791402,0.791004,0.791099
3,1.5696,1.556133,0.807339,0.807271,0.807317,0.80729
4,1.343,1.54686,0.806193,0.807258,0.805601,0.805763
5,1.1864,1.484376,0.81078,0.810832,0.810569,0.810648


[I 2025-03-29 11:22:15,553] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 9.741144451379889e-05, 'weight_decay': 0.01, 'warmup_steps': 6, 'lambda_param': 0.4, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1574,1.58,0.809633,0.810541,0.810116,0.809608
2,1.1947,1.504921,0.808486,0.809746,0.807853,0.808019
3,0.8787,1.48474,0.823394,0.823394,0.823503,0.82338
4,0.7257,1.567223,0.818807,0.821696,0.817904,0.818057
5,0.6209,1.606672,0.823394,0.823892,0.823756,0.823391
6,0.5559,1.638929,0.813073,0.814244,0.813621,0.813032
7,0.4927,1.714998,0.809633,0.809569,0.809569,0.809569
8,0.4581,1.707872,0.819954,0.820083,0.819704,0.819806
9,0.4185,1.777946,0.815367,0.816732,0.815957,0.815312
10,0.3956,1.77322,0.809633,0.809575,0.809653,0.809597


[I 2025-03-29 11:26:19,617] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 7.071981278419155e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3252,1.638848,0.807339,0.807304,0.807401,0.807314
2,1.3679,1.469129,0.807339,0.807362,0.807148,0.807217
3,1.0217,1.462874,0.817661,0.817612,0.817704,0.817632
4,0.8527,1.558796,0.825688,0.827161,0.82504,0.825243
5,0.7346,1.532585,0.821101,0.82115,0.821251,0.821092
6,0.6654,1.558429,0.817661,0.81789,0.817915,0.81766
7,0.602,1.629652,0.817661,0.817669,0.817494,0.817555
8,0.5619,1.641935,0.817661,0.817956,0.817326,0.817459
9,0.521,1.664109,0.816514,0.817135,0.816915,0.816505
10,0.4961,1.656974,0.816514,0.816447,0.816494,0.816466


[I 2025-03-29 11:30:24,660] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 3.388429037629391e-05, 'weight_decay': 0.0, 'warmup_steps': 8, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7551,1.873133,0.786697,0.786965,0.786331,0.786444
2,1.8402,1.651142,0.788991,0.790039,0.788372,0.7885
3,1.4688,1.524654,0.805046,0.805046,0.805149,0.805029
4,1.2473,1.525387,0.809633,0.810472,0.809106,0.809271
5,1.0956,1.476122,0.81422,0.814175,0.814116,0.814141
6,0.9938,1.474852,0.816514,0.816563,0.816662,0.816505
7,0.9225,1.511404,0.81078,0.811007,0.811032,0.81078
8,0.8637,1.506363,0.813073,0.813078,0.812905,0.812965
9,0.8167,1.527655,0.818807,0.819186,0.819125,0.818806
10,0.7804,1.522672,0.819954,0.820017,0.819746,0.819829


[I 2025-03-29 11:36:33,624] Trial 137 finished with value: 0.8152967721140121 and parameters: {'learning_rate': 3.388429037629391e-05, 'weight_decay': 0.0, 'warmup_steps': 8, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 138 with params: {'learning_rate': 4.074831272656962e-05, 'weight_decay': 0.002, 'warmup_steps': 10, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6529,1.80741,0.792431,0.792518,0.792172,0.79226
2,1.7156,1.599133,0.793578,0.794654,0.792961,0.793098
3,1.3437,1.494366,0.808486,0.808438,0.808527,0.808456
4,1.1323,1.509382,0.81078,0.81106,0.810442,0.81057
5,0.9889,1.474949,0.817661,0.817683,0.817789,0.817649
6,0.896,1.49733,0.818807,0.818856,0.818957,0.818799
7,0.8271,1.550206,0.81422,0.814494,0.814494,0.81422
8,0.7756,1.537233,0.816514,0.816501,0.816368,0.816417
9,0.7309,1.573226,0.817661,0.817985,0.817957,0.81766
10,0.698,1.552003,0.818807,0.818765,0.818704,0.81873


[I 2025-03-29 11:42:41,998] Trial 138 finished with value: 0.8187119728836396 and parameters: {'learning_rate': 4.074831272656962e-05, 'weight_decay': 0.002, 'warmup_steps': 10, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 139 with params: {'learning_rate': 6.070275777903919e-05, 'weight_decay': 0.01, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4146,1.690219,0.797018,0.797183,0.796718,0.796824
2,1.4617,1.492798,0.806193,0.806375,0.805896,0.806007
3,1.1017,1.471839,0.811927,0.81191,0.811779,0.811828
4,0.9216,1.536773,0.826835,0.828213,0.826208,0.826413
5,0.7959,1.501627,0.818807,0.818856,0.818957,0.818799
6,0.7231,1.530571,0.819954,0.820101,0.820167,0.819952
7,0.6585,1.589589,0.817661,0.817607,0.817578,0.817591
8,0.6154,1.60734,0.821101,0.82127,0.82083,0.820942
9,0.5738,1.619494,0.818807,0.819186,0.819125,0.818806
10,0.547,1.612498,0.823394,0.823335,0.823335,0.823335


[I 2025-03-29 11:48:52,998] Trial 139 finished with value: 0.8244579440359041 and parameters: {'learning_rate': 6.070275777903919e-05, 'weight_decay': 0.01, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 140 with params: {'learning_rate': 3.533508539495587e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 7, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7302,1.855801,0.791284,0.791477,0.790962,0.791069
2,1.8109,1.640632,0.788991,0.790397,0.788288,0.788402
3,1.4399,1.516704,0.805046,0.805046,0.805149,0.805029
4,1.2204,1.520044,0.808486,0.809103,0.808022,0.808177
5,1.0704,1.475107,0.816514,0.81647,0.81641,0.816436
6,0.9707,1.479162,0.817661,0.817739,0.817831,0.817655
7,0.9002,1.52029,0.81078,0.811007,0.811032,0.81078
8,0.843,1.512543,0.811927,0.811954,0.811737,0.811807
9,0.7965,1.537084,0.817661,0.818094,0.817999,0.817658
10,0.761,1.528209,0.816514,0.81647,0.81641,0.816436


[I 2025-03-29 11:53:03,317] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 4.77700305636362e-05, 'weight_decay': 0.002, 'warmup_steps': 22, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5757,1.765908,0.798165,0.798295,0.797887,0.797986
2,1.6173,1.552809,0.800459,0.801409,0.799886,0.800038
3,1.243,1.48488,0.813073,0.813017,0.812989,0.813002
4,1.0431,1.50958,0.816514,0.816759,0.816199,0.816324
5,0.9064,1.476256,0.815367,0.815348,0.815452,0.815347


[I 2025-03-29 11:55:08,567] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 9.890058814850274e-05, 'weight_decay': 0.001, 'warmup_steps': 10, 'lambda_param': 1.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1562,1.577698,0.809633,0.810388,0.810074,0.809617
2,1.1867,1.502415,0.81422,0.815614,0.813568,0.813746
3,0.8725,1.489269,0.826835,0.826816,0.826924,0.826816
4,0.72,1.575508,0.817661,0.82068,0.816736,0.816878
5,0.6159,1.611318,0.822248,0.822808,0.82263,0.822242
6,0.551,1.658161,0.81422,0.815304,0.814747,0.814185
7,0.4875,1.724104,0.813073,0.813007,0.813031,0.813018
8,0.4536,1.71507,0.819954,0.820083,0.819704,0.819806
9,0.4143,1.797293,0.813073,0.814634,0.813705,0.813002
10,0.3917,1.796623,0.807339,0.807339,0.807443,0.807323


[I 2025-03-29 11:59:17,315] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 4.0177761144064346e-05, 'weight_decay': 0.006, 'warmup_steps': 9, 'lambda_param': 0.1, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6597,1.811335,0.791284,0.791402,0.791004,0.791099
2,1.7246,1.603394,0.793578,0.794654,0.792961,0.793098
3,1.353,1.495936,0.807339,0.807281,0.807359,0.807303
4,1.1407,1.509882,0.81078,0.81106,0.810442,0.81057
5,0.9967,1.475123,0.817661,0.817683,0.817789,0.817649
6,0.9031,1.495888,0.818807,0.818856,0.818957,0.818799
7,0.8341,1.54765,0.815367,0.815596,0.815621,0.815367
8,0.782,1.534855,0.817661,0.817669,0.817494,0.817555
9,0.7371,1.570182,0.817661,0.817985,0.817957,0.81766
10,0.704,1.550089,0.818807,0.818765,0.818704,0.81873


[I 2025-03-29 12:05:31,154] Trial 143 finished with value: 0.8175739418412338 and parameters: {'learning_rate': 4.0177761144064346e-05, 'weight_decay': 0.006, 'warmup_steps': 9, 'lambda_param': 0.1, 'temperature': 6.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 144 with params: {'learning_rate': 2.149685017125216e-05, 'weight_decay': 0.006, 'warmup_steps': 9, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9971,2.152298,0.764908,0.765599,0.764345,0.764437
2,2.1735,1.780923,0.798165,0.798129,0.798223,0.798139
3,1.788,1.652209,0.794725,0.794815,0.794466,0.794556
4,1.5571,1.599891,0.800459,0.801409,0.799886,0.800038
5,1.3958,1.540354,0.809633,0.810324,0.809148,0.809308
6,1.2757,1.496279,0.809633,0.809585,0.809527,0.809552
7,1.192,1.489586,0.805046,0.805156,0.805233,0.805042
8,1.1169,1.489613,0.81078,0.810723,0.810695,0.810708
9,1.0648,1.480757,0.808486,0.808509,0.808611,0.808474
10,1.0223,1.487338,0.811927,0.812163,0.811611,0.811732


[I 2025-03-29 12:09:36,431] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 8.40722463913065e-05, 'weight_decay': 0.006, 'warmup_steps': 16, 'lambda_param': 0.4, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2495,1.609151,0.807339,0.807271,0.807317,0.80729
2,1.2692,1.481645,0.81422,0.814175,0.814116,0.814141
3,0.9409,1.505433,0.816514,0.816514,0.81662,0.816498
4,0.7801,1.592283,0.825688,0.827847,0.824914,0.825113
5,0.6707,1.628766,0.818807,0.819302,0.819167,0.818804
6,0.6039,1.622541,0.821101,0.821871,0.821546,0.821086
7,0.5414,1.684366,0.811927,0.811975,0.812074,0.811918
8,0.5034,1.685328,0.817661,0.817786,0.81741,0.817511
9,0.4635,1.709659,0.816514,0.817982,0.817126,0.816452
10,0.4406,1.738532,0.817661,0.817641,0.817746,0.817641


[I 2025-03-29 12:13:41,732] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 3.16246935104891e-05, 'weight_decay': 0.0, 'warmup_steps': 9, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7951,1.903639,0.784404,0.784582,0.784078,0.784181
2,1.8888,1.667398,0.791284,0.791899,0.790793,0.790928
3,1.5172,1.53803,0.809633,0.809597,0.809695,0.809608
4,1.293,1.534271,0.81078,0.811703,0.810232,0.810401
5,1.1385,1.476598,0.813073,0.813041,0.812947,0.812985
6,1.0333,1.469131,0.81422,0.814162,0.814242,0.814185
7,0.9603,1.497149,0.808486,0.808632,0.808695,0.808484
8,0.8988,1.496816,0.81422,0.814206,0.814073,0.814122
9,0.8511,1.513761,0.815367,0.815596,0.815621,0.815367
10,0.8136,1.513964,0.818807,0.818841,0.81862,0.818692


[I 2025-03-29 12:19:53,881] Trial 146 finished with value: 0.8175739418412338 and parameters: {'learning_rate': 3.16246935104891e-05, 'weight_decay': 0.0, 'warmup_steps': 9, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 147 with params: {'learning_rate': 5.6115221015602515e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 5, 'lambda_param': 0.4, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.466,1.717396,0.793578,0.793775,0.793256,0.793365
2,1.5118,1.507835,0.805046,0.805267,0.804728,0.804845
3,1.1455,1.477247,0.811927,0.81191,0.811779,0.811828
4,0.9593,1.525886,0.823394,0.824142,0.822914,0.823093
5,0.83,1.48516,0.818807,0.818856,0.818957,0.818799
6,0.7539,1.523412,0.819954,0.819977,0.820083,0.819943
7,0.6884,1.579697,0.819954,0.819889,0.819915,0.819901
8,0.6451,1.59465,0.822248,0.822314,0.82204,0.822124
9,0.6025,1.610401,0.815367,0.815799,0.815705,0.815365
10,0.5745,1.595725,0.825688,0.825631,0.825714,0.825655


[I 2025-03-29 12:26:05,392] Trial 147 finished with value: 0.8255963283437546 and parameters: {'learning_rate': 5.6115221015602515e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 5, 'lambda_param': 0.4, 'temperature': 6.5}. Best is trial 37 with value: 0.8267526114341277.


Trial 148 with params: {'learning_rate': 6.974273840162806e-05, 'weight_decay': 0.004, 'warmup_steps': 16, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3539,1.64439,0.800459,0.800459,0.80056,0.800442
2,1.3795,1.468665,0.807339,0.807417,0.807106,0.807193
3,1.0288,1.464613,0.817661,0.817597,0.817662,0.81762
4,0.8586,1.552002,0.824541,0.826111,0.823872,0.824073
5,0.7399,1.527478,0.822248,0.82227,0.822377,0.822236
6,0.6708,1.557127,0.817661,0.818094,0.817999,0.817658
7,0.6066,1.621569,0.821101,0.821059,0.820999,0.821025
8,0.5661,1.638599,0.815367,0.815657,0.815031,0.815163
9,0.5254,1.658874,0.821101,0.821871,0.821546,0.821086
10,0.5003,1.649827,0.818807,0.818741,0.818788,0.818761


[I 2025-03-29 12:32:16,844] Trial 148 finished with value: 0.8221801222215643 and parameters: {'learning_rate': 6.974273840162806e-05, 'weight_decay': 0.004, 'warmup_steps': 16, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 37 with value: 0.8267526114341277.


Trial 149 with params: {'learning_rate': 8.221432133065498e-05, 'weight_decay': 0.006, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.24,1.609083,0.813073,0.813025,0.813116,0.813044
2,1.2802,1.484442,0.81078,0.810723,0.810695,0.810708
3,0.9516,1.500191,0.818807,0.818856,0.818957,0.818799
4,0.7897,1.602838,0.825688,0.828375,0.82483,0.825017
5,0.6791,1.62221,0.816514,0.816789,0.816789,0.816514
6,0.6119,1.617079,0.817661,0.818217,0.818041,0.817655
7,0.5496,1.679772,0.81078,0.810802,0.810906,0.810768
8,0.511,1.683468,0.817661,0.817721,0.817452,0.817534
9,0.4711,1.706438,0.815367,0.816732,0.815957,0.815312
10,0.4475,1.734621,0.816514,0.816456,0.816536,0.816479


[I 2025-03-29 12:38:29,633] Trial 149 finished with value: 0.8210773868712218 and parameters: {'learning_rate': 8.221432133065498e-05, 'weight_decay': 0.006, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 7.0}. Best is trial 37 with value: 0.8267526114341277.


In [40]:
# Show the result object held in best_trial2 (assigned by an earlier cell,
# presumably the hyperparameter search whose log precedes this cell).
print(best_trial2)

BestRun(run_id='37', objective=0.8267526114341277, hyperparameters={'learning_rate': 4.729948829550423e-05, 'weight_decay': 0.002, 'warmup_steps': 9, 'lambda_param': 0.8, 'temperature': 2.0}, run_summary=None)


In [13]:
# Convert epoch counts into step counts: ceil(len(train_aug) / batch_size)
# steps make up one pass over the data.
data_length = len(train_aug)
# Pruner budget bounds: 5 epochs' worth of steps up to the full run.
min_r = 5 * math.ceil(data_length / batch_size)
max_r = num_epochs * math.ceil(data_length / batch_size)
# Upper bound for the sampled warmup_steps: a tenth of one epoch, rounded up.
warm_up = math.ceil((data_length / batch_size) / 10)

In [14]:
# Reset random seeds via the project-local `base` helper before setting up this
# search (what exactly gets seeded is defined in base.reset_seed, not shown here).
base.reset_seed()

In [15]:
# Training arguments for this search, built by the project helper; results and
# logs go to per-dataset "bert-base_aug_hp-search" directories.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base_aug_hp-search",
    logging_dir=f"~/logs/{DATASET}/bert-base_aug_hp-search",
    epochs=num_epochs,
    batch_size=batch_size,
)

In [16]:
def hp_space(trial):
    """Draw one hyperparameter configuration from ``trial`` and return it.

    Sampled values, in this order:
      * ``learning_rate`` - float in [1e-5, 5e-4], sampled with ``log=True``.
      * ``weight_decay``  - float in [0, 1e-2] on a 1e-3 grid.
      * ``warmup_steps``  - int in [0, ``warm_up``], where ``warm_up`` is the
        notebook-level bound computed in an earlier cell.

    The chosen values are also printed together with ``trial.number``.

    Args:
        trial: object exposing ``suggest_float``, ``suggest_int`` and ``number``
            (an Optuna trial when driven by the hyperparameter search).

    Returns:
        dict mapping the three hyperparameter names to the sampled values.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, warm_up)

    params = {
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "warmup_steps": warmup_steps,
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [17]:
# Hyperband pruner whose resource budget runs from min_r to max_r (both are
# step counts computed in an earlier cell).
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)
# TPE sampler with a fixed seed and multivariate sampling enabled.
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)



In [18]:
# Trainer used for the hyperparameter search on train_aug.
# No `model` is passed; the Trainer obtains it from `model_init` instead,
# presumably so each trial starts from freshly initialised weights (the
# "newly initialized" warning repeated per trial in the output is consistent
# with that).
trainer = Trainer(
    args=training_args,
    train_dataset=train_aug,
    # NOTE: `eval` here is the tokenized evaluation split created near the top
    # of the notebook; that name shadows the builtin eval().
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # get_Bert is defined elsewhere in the notebook; wrapping it in a lambda
    # defers the name lookup until the Trainer actually calls model_init.
    model_init = lambda: get_Bert()
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run the Optuna-backed hyperparameter search over hp_space and keep the
# returned result in best_trial3. The objective is maximized.
best_trial3 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    # Objective for each evaluation: the "eval_f1" entry of the metrics dict.
    compute_objective=lambda metrics: metrics["eval_f1"],
    # Pruner and sampler configured in the preceding cell.
    pruner=pruner,
    sampler=sampler,
    study_name="Test-base-aug",
    n_trials=150
)

[I 2025-03-29 17:37:29,505] A new study created in memory with name: Test-base-aug


Trial 0 with params: {'learning_rate': 4.3284502212938785e-05, 'weight_decay': 0.01, 'warmup_steps': 169}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3435,0.427888,0.81078,0.812774,0.811495,0.81067
2,0.2025,0.470815,0.818807,0.818765,0.818704,0.81873
3,0.1645,0.521956,0.802752,0.802823,0.802517,0.802603
4,0.1418,0.555934,0.806193,0.806338,0.806401,0.80619
5,0.1258,0.617021,0.799312,0.799351,0.799097,0.799172


[I 2025-03-29 17:56:03,729] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.00010401663679887307, 'weight_decay': 0.001, 'warmup_steps': 36}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.269,0.482056,0.800459,0.800389,0.800434,0.800407
2,0.1488,0.549983,0.795872,0.796797,0.795298,0.795441
3,0.1123,0.673818,0.805046,0.806021,0.804475,0.804635
4,0.0904,0.763311,0.81078,0.810802,0.810906,0.810768
5,0.0749,0.89284,0.802752,0.80277,0.80256,0.802627
6,0.0653,0.980753,0.803899,0.805523,0.803181,0.803328
7,0.057,1.043335,0.797018,0.797112,0.796761,0.796851
8,0.0502,1.141318,0.78555,0.787026,0.784826,0.784926
9,0.0462,1.168056,0.786697,0.786698,0.786499,0.786561
10,0.0426,1.215207,0.791284,0.791213,0.791256,0.791231


[I 2025-03-29 18:33:02,144] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 1.2551115172973821e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 138}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4549,0.433953,0.803899,0.804878,0.804401,0.803868
2,0.3127,0.419962,0.81078,0.811945,0.811327,0.810738
3,0.2638,0.419399,0.818807,0.819302,0.819167,0.818804
4,0.2341,0.432901,0.824541,0.824564,0.824672,0.82453
5,0.214,0.44055,0.821101,0.821066,0.821167,0.821077


[I 2025-03-29 18:51:34,537] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.00015958573588141273, 'weight_decay': 0.0, 'warmup_steps': 224}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2586,0.520411,0.795872,0.796163,0.795508,0.79563
2,0.1295,0.612431,0.791284,0.792708,0.790583,0.790702
3,0.0928,0.761443,0.803899,0.803896,0.803728,0.803785
4,0.0724,0.895893,0.808486,0.809035,0.808864,0.80848
5,0.0587,0.989196,0.801606,0.8016,0.801433,0.80149
6,0.0502,1.014527,0.801606,0.803211,0.800886,0.801027
7,0.0434,1.132012,0.786697,0.786662,0.786541,0.786585
8,0.0374,1.174615,0.798165,0.798808,0.797676,0.797821
9,0.0339,1.250374,0.794725,0.794653,0.794677,0.794664
10,0.0308,1.315318,0.783257,0.783237,0.783331,0.783234


[I 2025-03-29 19:28:41,123] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00025959425503112657, 'weight_decay': 0.002, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2224,0.603142,0.788991,0.789264,0.788625,0.788741
2,0.1087,0.718453,0.791284,0.791775,0.790835,0.790967
3,0.0753,0.862499,0.791284,0.791466,0.791509,0.791283
4,0.0577,1.00096,0.795872,0.795817,0.795761,0.795785
5,0.0467,1.135036,0.792431,0.792575,0.792635,0.792429


[I 2025-03-29 19:47:16,756] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 121}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4072,0.42648,0.813073,0.814634,0.813705,0.813002
2,0.2652,0.425141,0.823394,0.824498,0.823924,0.823361
3,0.2189,0.436451,0.824541,0.82462,0.824714,0.824536
4,0.1925,0.465637,0.818807,0.818746,0.818746,0.818746
5,0.1755,0.490894,0.813073,0.813054,0.813158,0.813053
6,0.1635,0.513377,0.815367,0.815348,0.815452,0.815347
7,0.153,0.52632,0.81422,0.814494,0.814494,0.81422
8,0.1454,0.554926,0.806193,0.806239,0.80598,0.806058
9,0.1397,0.566202,0.808486,0.808487,0.808316,0.808375
10,0.1356,0.578477,0.81078,0.810858,0.810948,0.810774


[I 2025-03-29 20:43:32,010] Trial 5 finished with value: 0.8015467816601527 and parameters: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 121}. Best is trial 5 with value: 0.8015467816601527.


Trial 6 with params: {'learning_rate': 5.4182823195332406e-05, 'weight_decay': 0.003, 'warmup_steps': 141}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3227,0.434832,0.81422,0.815304,0.814747,0.814185
2,0.1868,0.491836,0.81078,0.81106,0.810442,0.81057
3,0.1505,0.557616,0.806193,0.806462,0.805854,0.805978
4,0.1279,0.603561,0.807339,0.807449,0.807527,0.807335
5,0.1119,0.680761,0.794725,0.794885,0.794424,0.794528


[I 2025-03-29 21:02:30,713] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 1.7258215396625005e-05, 'weight_decay': 0.003, 'warmup_steps': 84}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4216,0.427752,0.811927,0.813187,0.812495,0.811878
2,0.2815,0.421546,0.819954,0.821332,0.820546,0.819901
3,0.2339,0.427592,0.826835,0.827164,0.827134,0.826835
4,0.2062,0.451062,0.822248,0.822185,0.822251,0.822208
5,0.1882,0.470256,0.815367,0.815303,0.815368,0.815326
6,0.1757,0.491746,0.81422,0.814185,0.814284,0.814196
7,0.165,0.501495,0.808486,0.808713,0.808737,0.808486
8,0.1575,0.524967,0.809633,0.809658,0.809443,0.809512
9,0.1516,0.535827,0.808486,0.808451,0.808359,0.808395
10,0.1475,0.544024,0.81078,0.810802,0.810906,0.810768


[I 2025-03-29 21:59:45,562] Trial 7 finished with value: 0.807289775687067 and parameters: {'learning_rate': 1.7258215396625005e-05, 'weight_decay': 0.003, 'warmup_steps': 84}. Best is trial 7 with value: 0.807289775687067.


Trial 8 with params: {'learning_rate': 5.954553793888986e-05, 'weight_decay': 0.008, 'warmup_steps': 46}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3093,0.44164,0.813073,0.814244,0.813621,0.813032
2,0.1806,0.496529,0.807339,0.807767,0.806938,0.80708
3,0.1447,0.576101,0.806193,0.806562,0.805812,0.805947
4,0.1221,0.631857,0.805046,0.80523,0.805275,0.805045
5,0.1062,0.727037,0.795872,0.796645,0.79534,0.795483
6,0.0949,0.766699,0.791284,0.791899,0.790793,0.790928
7,0.085,0.832209,0.795872,0.795803,0.795803,0.795803
8,0.0767,0.939559,0.798165,0.800066,0.797381,0.797499
9,0.072,0.954106,0.791284,0.791227,0.791172,0.791195
10,0.068,0.991315,0.790138,0.790072,0.790046,0.790058


[I 2025-03-29 22:37:54,877] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 7.475992999956501e-05, 'weight_decay': 0.006, 'warmup_steps': 10}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.29,0.455015,0.81078,0.811007,0.811032,0.81078
2,0.1672,0.518119,0.808486,0.809244,0.80798,0.808141
3,0.1314,0.618824,0.800459,0.801111,0.799971,0.800118
4,0.1088,0.683271,0.808486,0.808467,0.808569,0.808466
5,0.0928,0.817841,0.792431,0.793118,0.791919,0.792057


[I 2025-03-29 22:56:29,707] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 1.0829253790120454e-05, 'weight_decay': 0.001, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4649,0.43798,0.800459,0.801351,0.800939,0.800432
2,0.3269,0.420554,0.81422,0.815682,0.814831,0.814158
3,0.2783,0.418483,0.816514,0.816891,0.816831,0.816513
4,0.248,0.427841,0.823394,0.823506,0.823588,0.823391
5,0.2271,0.431849,0.823394,0.823394,0.823503,0.82338
6,0.2129,0.440911,0.827982,0.827982,0.828092,0.827967
7,0.2007,0.449416,0.823394,0.823506,0.823588,0.823391
8,0.1925,0.462048,0.825688,0.825629,0.825629,0.825629
9,0.1859,0.471645,0.819954,0.820017,0.819746,0.819829
10,0.1814,0.473546,0.816514,0.816625,0.816705,0.81651


[I 2025-03-29 23:33:40,624] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 1.3099353602199278e-05, 'weight_decay': 0.002, 'warmup_steps': 145}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4511,0.432882,0.803899,0.804878,0.804401,0.803868
2,0.3086,0.419943,0.81078,0.811945,0.811327,0.810738
3,0.2597,0.419925,0.819954,0.820513,0.820336,0.819948
4,0.2301,0.434665,0.825688,0.825737,0.82584,0.82568
5,0.2103,0.443637,0.821101,0.821066,0.821167,0.821077
6,0.1968,0.459222,0.822248,0.8222,0.822293,0.822219
7,0.1853,0.468314,0.816514,0.816563,0.816662,0.816505
8,0.1775,0.485418,0.816514,0.816501,0.816368,0.816417
9,0.1713,0.495313,0.816514,0.81647,0.81641,0.816436
10,0.1671,0.499449,0.817661,0.817683,0.817789,0.817649


[I 2025-03-30 00:29:28,958] Trial 11 finished with value: 0.8107497010902807 and parameters: {'learning_rate': 1.3099353602199278e-05, 'weight_decay': 0.002, 'warmup_steps': 145}. Best is trial 11 with value: 0.8107497010902807.


Trial 12 with params: {'learning_rate': 1.0500886098732501e-05, 'weight_decay': 0.0, 'warmup_steps': 210}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.476,0.439287,0.798165,0.799216,0.798687,0.798127
2,0.3303,0.420917,0.81422,0.815304,0.814747,0.814185
3,0.2814,0.418668,0.816514,0.816891,0.816831,0.816513
4,0.2509,0.42717,0.823394,0.823506,0.823588,0.823391
5,0.2298,0.430807,0.824541,0.824522,0.82463,0.824523
6,0.2154,0.439175,0.826835,0.826858,0.826966,0.826824
7,0.2031,0.44763,0.824541,0.82462,0.824714,0.824536
8,0.1949,0.459492,0.825688,0.825629,0.825629,0.825629
9,0.1882,0.468992,0.819954,0.820017,0.819746,0.819829
10,0.1836,0.470685,0.816514,0.816625,0.816705,0.81651


[I 2025-03-30 01:05:45,126] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 1.712316550761407e-05, 'weight_decay': 0.005, 'warmup_steps': 60}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4213,0.427899,0.81078,0.812132,0.811369,0.810724
2,0.2822,0.421412,0.821101,0.822385,0.821672,0.821055
3,0.2346,0.427296,0.826835,0.827164,0.827134,0.826835
4,0.2068,0.45044,0.822248,0.822185,0.822251,0.822208
5,0.1888,0.469307,0.815367,0.815319,0.81541,0.815338
6,0.1763,0.490723,0.81422,0.814185,0.814284,0.814196
7,0.1656,0.500328,0.81078,0.810926,0.81099,0.810778
8,0.158,0.523466,0.809633,0.809658,0.809443,0.809512
9,0.1522,0.534463,0.808486,0.808451,0.808359,0.808395
10,0.1481,0.542435,0.811927,0.811975,0.812074,0.811918


[I 2025-03-30 03:01:19,781] Trial 13 finished with value: 0.807289775687067 and parameters: {'learning_rate': 1.712316550761407e-05, 'weight_decay': 0.005, 'warmup_steps': 60}. Best is trial 11 with value: 0.8107497010902807.


Trial 14 with params: {'learning_rate': 1.622732935569823e-05, 'weight_decay': 0.006, 'warmup_steps': 208}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4338,0.428627,0.809633,0.810708,0.810158,0.809597
2,0.2878,0.420898,0.81422,0.815486,0.814789,0.814172
3,0.2396,0.425328,0.823394,0.823892,0.823756,0.823391
4,0.2113,0.446719,0.822248,0.8222,0.822293,0.822219
5,0.1929,0.46352,0.817661,0.817595,0.81762,0.817607
6,0.1802,0.484477,0.815367,0.815319,0.81541,0.815338
7,0.1693,0.494203,0.809633,0.809906,0.809906,0.809633
8,0.1617,0.516274,0.81078,0.810895,0.810527,0.810624
9,0.1558,0.526253,0.811927,0.81188,0.811821,0.811846
10,0.1517,0.533862,0.809633,0.809633,0.809737,0.809617


[I 2025-03-30 03:38:39,857] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 1.071934208458698e-05, 'weight_decay': 0.001, 'warmup_steps': 134}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4703,0.438454,0.799312,0.800282,0.799813,0.79928
2,0.3281,0.420689,0.811927,0.813187,0.812495,0.811878
3,0.2793,0.418641,0.816514,0.816891,0.816831,0.816513
4,0.249,0.427605,0.823394,0.823506,0.823588,0.823391
5,0.228,0.431665,0.823394,0.823394,0.823503,0.82338
6,0.2137,0.440587,0.826835,0.826816,0.826924,0.826816
7,0.2015,0.449027,0.823394,0.823506,0.823588,0.823391
8,0.1933,0.461409,0.825688,0.825629,0.825629,0.825629
9,0.1866,0.471045,0.819954,0.820017,0.819746,0.819829
10,0.1821,0.472896,0.817661,0.817739,0.817831,0.817655


[I 2025-03-30 04:34:22,040] Trial 15 finished with value: 0.8153259275336583 and parameters: {'learning_rate': 1.071934208458698e-05, 'weight_decay': 0.001, 'warmup_steps': 134}. Best is trial 15 with value: 0.8153259275336583.


Trial 16 with params: {'learning_rate': 1.2586176447406365e-05, 'weight_decay': 0.0, 'warmup_steps': 126}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4541,0.433833,0.803899,0.804878,0.804401,0.803868
2,0.3124,0.419957,0.81078,0.811945,0.811327,0.810738
3,0.2635,0.419411,0.818807,0.819302,0.819167,0.818804
4,0.2338,0.433004,0.824541,0.824564,0.824672,0.82453
5,0.2138,0.440757,0.821101,0.821066,0.821167,0.821077
6,0.2001,0.454994,0.823394,0.823338,0.823419,0.823361
7,0.1885,0.463972,0.819954,0.820032,0.820125,0.819948
8,0.1806,0.480178,0.816514,0.816501,0.816368,0.816417
9,0.1743,0.49006,0.817661,0.817631,0.817536,0.817574
10,0.17,0.493691,0.817661,0.817683,0.817789,0.817649


[I 2025-03-30 05:31:34,519] Trial 16 finished with value: 0.8107497010902807 and parameters: {'learning_rate': 1.2586176447406365e-05, 'weight_decay': 0.0, 'warmup_steps': 126}. Best is trial 15 with value: 0.8153259275336583.


Trial 17 with params: {'learning_rate': 0.00023041229790746586, 'weight_decay': 0.008, 'warmup_steps': 186}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2385,0.565906,0.787844,0.787924,0.787583,0.78767
2,0.1144,0.63372,0.788991,0.792339,0.787952,0.787919
3,0.0789,0.8144,0.795872,0.795812,0.795887,0.795833
4,0.0608,1.008509,0.799312,0.799292,0.799392,0.799291
5,0.0492,1.083615,0.783257,0.783207,0.783289,0.783222
6,0.041,1.104995,0.791284,0.791775,0.790835,0.790967
7,0.0351,1.168335,0.78555,0.785531,0.785373,0.785426
8,0.0304,1.187894,0.797018,0.79747,0.796592,0.796727
9,0.0272,1.316508,0.77867,0.778799,0.778364,0.778457
10,0.0242,1.396634,0.780963,0.78104,0.781121,0.780956


[I 2025-03-30 06:08:37,035] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.0002950137270531351, 'weight_decay': 0.01, 'warmup_steps': 78}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2206,0.625377,0.776376,0.776991,0.775859,0.775973
2,0.1046,0.71559,0.77867,0.779172,0.778195,0.778313
3,0.0721,0.875053,0.784404,0.784764,0.78471,0.784403
4,0.0553,0.998958,0.797018,0.798951,0.797729,0.796901
5,0.0445,1.102123,0.794725,0.794967,0.794382,0.794497
6,0.0375,1.35466,0.788991,0.789363,0.788583,0.788706
7,0.0323,1.278281,0.775229,0.776396,0.775785,0.775171
8,0.028,1.145869,0.786697,0.787293,0.786204,0.786333
9,0.0237,1.43658,0.77867,0.779079,0.778995,0.778667
10,0.0217,1.427998,0.771789,0.772006,0.772028,0.771789


[I 2025-03-30 06:45:36,028] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 5.125465771181014e-05, 'weight_decay': 0.0, 'warmup_steps': 130}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3266,0.433107,0.813073,0.814634,0.813705,0.813002
2,0.1904,0.48707,0.81422,0.814377,0.813947,0.814055
3,0.1538,0.548656,0.805046,0.805267,0.804728,0.804845
4,0.1312,0.590688,0.803899,0.804044,0.804107,0.803897
5,0.1152,0.664429,0.794725,0.794885,0.794424,0.794528


[I 2025-03-30 07:04:13,870] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 3.137316955693352e-05, 'weight_decay': 0.002, 'warmup_steps': 208}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3736,0.427496,0.811927,0.813589,0.812579,0.811846
2,0.2276,0.445428,0.819954,0.820184,0.820209,0.819954
3,0.186,0.476633,0.81078,0.810723,0.810695,0.810708
4,0.1623,0.511112,0.81422,0.814269,0.814368,0.814211
5,0.1463,0.55368,0.806193,0.806128,0.806191,0.80615
6,0.135,0.575558,0.81078,0.810713,0.810737,0.810724
7,0.1245,0.608109,0.800459,0.800827,0.80077,0.800458
8,0.1162,0.65171,0.799312,0.800177,0.79876,0.79891
9,0.1108,0.67096,0.800459,0.8004,0.800476,0.800421
10,0.1066,0.694375,0.795872,0.795836,0.795929,0.795845


[I 2025-03-30 07:41:42,122] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 1.1921977502684865e-05, 'weight_decay': 0.0, 'warmup_steps': 99}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4582,0.435255,0.801606,0.80258,0.802107,0.801574
2,0.3177,0.420081,0.811927,0.813381,0.812537,0.811863
3,0.2688,0.418914,0.821101,0.82148,0.82142,0.8211
4,0.2389,0.430994,0.822248,0.822326,0.822419,0.822242
5,0.2185,0.437234,0.821101,0.821066,0.821167,0.821077
6,0.2046,0.449574,0.823394,0.823359,0.823461,0.823371
7,0.1928,0.458465,0.821101,0.82115,0.821251,0.821092
8,0.1848,0.473412,0.816514,0.81647,0.81641,0.816436
9,0.1784,0.483296,0.819954,0.819965,0.819788,0.81985
10,0.1741,0.486183,0.815367,0.81539,0.815494,0.815355


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-30 08:18:47,891] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 1.0587611658805354e-05, 'weight_decay': 0.002, 'warmup_steps': 144}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.472,0.438878,0.798165,0.799216,0.798687,0.798127
2,0.3293,0.420792,0.81422,0.815304,0.814747,0.814185
3,0.2805,0.418652,0.816514,0.816891,0.816831,0.816513
4,0.2501,0.42733,0.823394,0.823506,0.823588,0.823391
5,0.2291,0.431074,0.824541,0.824522,0.82463,0.824523
6,0.2147,0.439606,0.827982,0.827982,0.828092,0.827967
7,0.2025,0.448025,0.823394,0.823506,0.823588,0.823391
8,0.1943,0.460122,0.825688,0.825629,0.825629,0.825629
9,0.1876,0.469706,0.819954,0.820017,0.819746,0.819829
10,0.1831,0.471465,0.817661,0.817739,0.817831,0.817655


[I 2025-03-30 09:14:27,463] Trial 22 finished with value: 0.8153259275336583 and parameters: {'learning_rate': 1.0587611658805354e-05, 'weight_decay': 0.002, 'warmup_steps': 144}. Best is trial 15 with value: 0.8153259275336583.


Trial 23 with params: {'learning_rate': 1.0102129930667866e-05, 'weight_decay': 0.004, 'warmup_steps': 179}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4783,0.44044,0.795872,0.796756,0.79635,0.795845
2,0.334,0.421304,0.815367,0.816368,0.815873,0.815338
3,0.2852,0.418737,0.818807,0.819186,0.819125,0.818806
4,0.2547,0.426334,0.821101,0.821288,0.821335,0.8211
5,0.2334,0.429192,0.827982,0.827982,0.828092,0.827967
6,0.2188,0.436289,0.827982,0.828031,0.828134,0.827974
7,0.2064,0.444459,0.823394,0.823506,0.823588,0.823391
8,0.1981,0.455328,0.824541,0.82449,0.824461,0.824475
9,0.1914,0.464733,0.822248,0.822314,0.82204,0.822124
10,0.1867,0.46595,0.816514,0.816625,0.816705,0.81651


[I 2025-03-30 10:11:30,124] Trial 23 finished with value: 0.8164664530353019 and parameters: {'learning_rate': 1.0102129930667866e-05, 'weight_decay': 0.004, 'warmup_steps': 179}. Best is trial 23 with value: 0.8164664530353019.


Trial 24 with params: {'learning_rate': 1.2968345869509177e-05, 'weight_decay': 0.004, 'warmup_steps': 203}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.455,0.433194,0.802752,0.803812,0.803275,0.802715
2,0.3097,0.419932,0.81078,0.811945,0.811327,0.810738
3,0.2607,0.41982,0.819954,0.820513,0.820336,0.819948
4,0.231,0.434221,0.823394,0.823443,0.823546,0.823386
5,0.2111,0.443024,0.822248,0.8222,0.822293,0.822219
6,0.1975,0.458447,0.823394,0.823338,0.823419,0.823361
7,0.186,0.467524,0.816514,0.816563,0.816662,0.816505
8,0.1782,0.484406,0.816514,0.816501,0.816368,0.816417
9,0.1719,0.494277,0.816514,0.816501,0.816368,0.816417
10,0.1677,0.498462,0.818807,0.818856,0.818957,0.818799


[I 2025-03-30 11:07:52,669] Trial 24 finished with value: 0.8107497010902807 and parameters: {'learning_rate': 1.2968345869509177e-05, 'weight_decay': 0.004, 'warmup_steps': 203}. Best is trial 23 with value: 0.8164664530353019.


Trial 25 with params: {'learning_rate': 1.126880146707781e-05, 'weight_decay': 0.004, 'warmup_steps': 161}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4666,0.43703,0.802752,0.803812,0.803275,0.802715
2,0.3233,0.420391,0.811927,0.813381,0.812537,0.811863
3,0.2744,0.418744,0.819954,0.82039,0.820293,0.819952
4,0.2441,0.429077,0.825688,0.8258,0.825882,0.825684
5,0.2234,0.434113,0.822248,0.82227,0.822377,0.822236
6,0.2093,0.444739,0.825688,0.825688,0.825798,0.825673
7,0.1973,0.453473,0.821101,0.82115,0.821251,0.821092
8,0.1892,0.46704,0.821101,0.821059,0.820999,0.821025
9,0.1827,0.476886,0.819954,0.819965,0.819788,0.81985
10,0.1782,0.479219,0.815367,0.815445,0.815536,0.815361


[I 2025-03-30 11:44:32,281] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 1.2685258643513274e-05, 'weight_decay': 0.004, 'warmup_steps': 124}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4532,0.433626,0.803899,0.804878,0.804401,0.803868
2,0.3116,0.419951,0.81078,0.811945,0.811327,0.810738
3,0.2628,0.419501,0.818807,0.819302,0.819167,0.818804
4,0.2331,0.433292,0.825688,0.825737,0.82584,0.82568
5,0.2131,0.441267,0.821101,0.821066,0.821167,0.821077
6,0.1994,0.455776,0.824541,0.824479,0.824545,0.824502
7,0.1878,0.464807,0.818807,0.818856,0.818957,0.818799
8,0.18,0.481184,0.816514,0.816501,0.816368,0.816417
9,0.1737,0.491068,0.817661,0.817631,0.817536,0.817574
10,0.1694,0.494792,0.817661,0.817683,0.817789,0.817649


[I 2025-03-30 12:40:50,657] Trial 26 finished with value: 0.8107497010902807 and parameters: {'learning_rate': 1.2685258643513274e-05, 'weight_decay': 0.004, 'warmup_steps': 124}. Best is trial 23 with value: 0.8164664530353019.


Trial 27 with params: {'learning_rate': 4.7996786970552803e-05, 'weight_decay': 0.006, 'warmup_steps': 146}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3332,0.431517,0.809633,0.81174,0.810369,0.809512
2,0.195,0.481111,0.815367,0.815489,0.815115,0.815215
3,0.1579,0.538526,0.802752,0.80289,0.802475,0.802577
4,0.1353,0.575645,0.806193,0.806338,0.806401,0.80619
5,0.1193,0.645639,0.795872,0.795997,0.795592,0.79569


[I 2025-03-30 12:58:52,372] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.000319720536453825, 'weight_decay': 0.002, 'warmup_steps': 163}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2246,0.662365,0.77867,0.779907,0.777985,0.778079
2,0.1037,0.736552,0.792431,0.795128,0.791498,0.79154
3,0.0704,0.820939,0.794725,0.795532,0.795182,0.794703
4,0.0541,1.078004,0.786697,0.789428,0.787552,0.786477
5,0.0434,1.1007,0.792431,0.792518,0.792172,0.79226


[I 2025-03-30 13:18:00,527] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 1.2526936444247373e-05, 'weight_decay': 0.001, 'warmup_steps': 161}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4562,0.434087,0.803899,0.804878,0.804401,0.803868
2,0.313,0.420021,0.81078,0.811945,0.811327,0.810738
3,0.264,0.41941,0.818807,0.819302,0.819167,0.818804
4,0.2342,0.432796,0.824541,0.824564,0.824672,0.82453
5,0.2141,0.440474,0.821101,0.821066,0.821167,0.821077
6,0.2004,0.454612,0.824541,0.824479,0.824545,0.824502
7,0.1888,0.463589,0.819954,0.820032,0.820125,0.819948
8,0.1809,0.47972,0.816514,0.816501,0.816368,0.816417
9,0.1746,0.489675,0.817661,0.817631,0.817536,0.817574
10,0.1703,0.493238,0.817661,0.817683,0.817789,0.817649


[I 2025-03-30 14:13:55,073] Trial 29 finished with value: 0.8107497010902807 and parameters: {'learning_rate': 1.2526936444247373e-05, 'weight_decay': 0.001, 'warmup_steps': 161}. Best is trial 23 with value: 0.8164664530353019.


Trial 30 with params: {'learning_rate': 1.1656700835830033e-05, 'weight_decay': 0.003, 'warmup_steps': 184}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4644,0.436052,0.801606,0.80258,0.802107,0.801574
2,0.32,0.420167,0.811927,0.813381,0.812537,0.811863
3,0.271,0.418839,0.819954,0.82028,0.820251,0.819954
4,0.2409,0.430135,0.824541,0.82462,0.824714,0.824536
5,0.2204,0.43601,0.819954,0.819935,0.820041,0.819935
6,0.2064,0.447824,0.824541,0.824522,0.82463,0.824523
7,0.1945,0.456636,0.821101,0.82115,0.821251,0.821092
8,0.1865,0.471045,0.821101,0.821059,0.820999,0.821025
9,0.18,0.480902,0.821101,0.821137,0.820914,0.820987
10,0.1756,0.483657,0.816514,0.816514,0.81662,0.816498


[I 2025-03-30 15:09:43,515] Trial 30 finished with value: 0.8153375871244556 and parameters: {'learning_rate': 1.1656700835830033e-05, 'weight_decay': 0.003, 'warmup_steps': 184}. Best is trial 23 with value: 0.8164664530353019.


Trial 31 with params: {'learning_rate': 1.074137206210587e-05, 'weight_decay': 0.003, 'warmup_steps': 187}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4726,0.438513,0.799312,0.800282,0.799813,0.79928
2,0.328,0.420655,0.811927,0.813187,0.812495,0.811878
3,0.2791,0.418635,0.816514,0.816891,0.816831,0.816513
4,0.2487,0.427704,0.823394,0.823506,0.823588,0.823391
5,0.2277,0.431771,0.823394,0.823394,0.823503,0.82338
6,0.2134,0.440889,0.826835,0.826816,0.826924,0.826816
7,0.2012,0.449339,0.823394,0.823506,0.823588,0.823391
8,0.193,0.461794,0.825688,0.825629,0.825629,0.825629
9,0.1864,0.471395,0.819954,0.820017,0.819746,0.819829
10,0.1819,0.473331,0.817661,0.817739,0.817831,0.817655


[I 2025-03-30 16:06:11,237] Trial 31 finished with value: 0.8164664530353019 and parameters: {'learning_rate': 1.074137206210587e-05, 'weight_decay': 0.003, 'warmup_steps': 187}. Best is trial 23 with value: 0.8164664530353019.


Trial 32 with params: {'learning_rate': 1.1817619001250758e-05, 'weight_decay': 0.005, 'warmup_steps': 168}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4623,0.435672,0.801606,0.80258,0.802107,0.801574
2,0.3187,0.420131,0.811927,0.813381,0.812537,0.811863
3,0.2697,0.418921,0.819954,0.82028,0.820251,0.819954
4,0.2396,0.430612,0.823394,0.823443,0.823546,0.823386
5,0.2192,0.436822,0.821101,0.821066,0.821167,0.821077
6,0.2052,0.449067,0.824541,0.824522,0.82463,0.824523
7,0.1934,0.457904,0.821101,0.82115,0.821251,0.821092
8,0.1854,0.472642,0.821101,0.821059,0.820999,0.821025
9,0.179,0.482521,0.819954,0.819965,0.819788,0.81985
10,0.1746,0.485386,0.816514,0.816514,0.81662,0.816498


[I 2025-03-30 17:02:49,192] Trial 32 finished with value: 0.814172283698243 and parameters: {'learning_rate': 1.1817619001250758e-05, 'weight_decay': 0.005, 'warmup_steps': 168}. Best is trial 23 with value: 0.8164664530353019.


Trial 33 with params: {'learning_rate': 1.0808761869111402e-05, 'weight_decay': 0.002, 'warmup_steps': 221}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4737,0.438397,0.800459,0.801351,0.800939,0.800432
2,0.3276,0.420656,0.811927,0.813187,0.812495,0.811878
3,0.2785,0.418667,0.816514,0.816891,0.816831,0.816513
4,0.2481,0.427887,0.823394,0.823506,0.823588,0.823391
5,0.2271,0.432187,0.822248,0.822228,0.822335,0.822229
6,0.2128,0.441517,0.827982,0.827982,0.828092,0.827967
7,0.2007,0.450074,0.822248,0.822326,0.822419,0.822242
8,0.1925,0.462633,0.824541,0.82449,0.824461,0.824475
9,0.1859,0.472294,0.821101,0.821137,0.820914,0.820987
10,0.1814,0.474231,0.815367,0.815445,0.815536,0.815361


[I 2025-03-30 17:40:45,670] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 1.7935920764592027e-05, 'weight_decay': 0.003, 'warmup_steps': 183}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.423,0.42723,0.811927,0.813381,0.812537,0.811863
2,0.2781,0.42217,0.818807,0.820085,0.819378,0.818761
3,0.2306,0.429243,0.825688,0.825966,0.825966,0.825688
4,0.203,0.454092,0.821101,0.821035,0.821083,0.821055
5,0.1853,0.474544,0.815367,0.815303,0.815368,0.815326
6,0.1729,0.496566,0.81422,0.814185,0.814284,0.814196
7,0.1622,0.507239,0.808486,0.808713,0.808737,0.808486
8,0.1547,0.531869,0.808486,0.808536,0.808274,0.808353
9,0.1488,0.542176,0.809633,0.809615,0.809485,0.809533
10,0.1447,0.5517,0.808486,0.808509,0.808611,0.808474


[I 2025-03-30 18:18:36,053] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 1.7316542519300505e-05, 'weight_decay': 0.003, 'warmup_steps': 189}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4266,0.427696,0.813073,0.814244,0.813621,0.813032
2,0.2815,0.421616,0.819954,0.821332,0.820546,0.819901
3,0.2337,0.427755,0.826835,0.827164,0.827134,0.826835
4,0.2059,0.451368,0.822248,0.822185,0.822251,0.822208
5,0.1879,0.470524,0.816514,0.816447,0.816494,0.816466


[I 2025-03-30 18:37:08,349] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.0004180301872969493, 'weight_decay': 0.006, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.206,0.657687,0.791284,0.791663,0.790877,0.791003
2,0.0986,0.718008,0.772936,0.773259,0.772523,0.77263
3,0.0682,0.96554,0.775229,0.779963,0.776375,0.774707
4,0.0521,1.068864,0.769495,0.770131,0.769902,0.769481
5,0.042,1.015832,0.786697,0.786745,0.786836,0.786687
6,0.0347,1.184265,0.783257,0.783562,0.782868,0.782983
7,0.0295,1.164445,0.799312,0.800282,0.799813,0.79928
8,0.0255,1.361584,0.770642,0.771059,0.770186,0.770293
9,0.0211,1.400639,0.78211,0.783666,0.782752,0.782017
10,0.0193,1.339002,0.788991,0.788994,0.788793,0.788856


[I 2025-03-30 19:14:11,651] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 1.1598919146420157e-05, 'weight_decay': 0.003, 'warmup_steps': 173}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4643,0.436209,0.801606,0.80258,0.802107,0.801574
2,0.3205,0.420187,0.811927,0.813381,0.812537,0.811863
3,0.2715,0.418849,0.819954,0.82028,0.820251,0.819954
4,0.2414,0.429948,0.824541,0.82462,0.824714,0.824536
5,0.2208,0.435755,0.821101,0.821066,0.821167,0.821077
6,0.2068,0.447332,0.823394,0.823359,0.823461,0.823371
7,0.1949,0.456154,0.821101,0.82115,0.821251,0.821092
8,0.1869,0.470434,0.821101,0.821059,0.820999,0.821025
9,0.1804,0.480262,0.819954,0.819965,0.819788,0.81985
10,0.176,0.482919,0.816514,0.816514,0.81662,0.816498


[I 2025-03-30 19:50:33,095] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 1.001349090137674e-05, 'weight_decay': 0.004, 'warmup_steps': 224}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4814,0.440904,0.795872,0.796756,0.79635,0.795845
2,0.335,0.421464,0.815367,0.816368,0.815873,0.815338
3,0.2861,0.418802,0.818807,0.819186,0.819125,0.818806
4,0.2555,0.42617,0.821101,0.821288,0.821335,0.8211
5,0.2342,0.428971,0.827982,0.827982,0.828092,0.827967
6,0.2196,0.435812,0.827982,0.828093,0.828176,0.827978
7,0.2071,0.443929,0.824541,0.82462,0.824714,0.824536
8,0.1988,0.454669,0.823394,0.823335,0.823335,0.823335
9,0.192,0.463936,0.823394,0.823493,0.823167,0.823261
10,0.1873,0.465181,0.815367,0.815514,0.815578,0.815365


[I 2025-03-30 20:27:36,420] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.00019841930077213002, 'weight_decay': 0.006, 'warmup_steps': 86}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2368,0.560124,0.792431,0.793261,0.791877,0.792015
2,0.1182,0.650924,0.794725,0.796706,0.793919,0.79402
3,0.083,0.844828,0.805046,0.805024,0.804896,0.804943
4,0.064,0.95059,0.809633,0.810388,0.810074,0.809617
5,0.0521,1.043822,0.788991,0.789172,0.789214,0.78899
6,0.044,1.122687,0.791284,0.792036,0.790751,0.790887
7,0.0381,1.206624,0.792431,0.792364,0.792424,0.792385
8,0.0326,1.274352,0.791284,0.791214,0.791214,0.791214
9,0.0294,1.296659,0.77867,0.778875,0.778322,0.778425
10,0.0262,1.340388,0.779817,0.779817,0.779911,0.779798


[I 2025-03-30 21:04:11,232] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 1.0014689401833803e-05, 'weight_decay': 0.005, 'warmup_steps': 144}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4775,0.440663,0.795872,0.796756,0.79635,0.795845
2,0.3347,0.421394,0.815367,0.816543,0.815915,0.815326
3,0.2861,0.418771,0.818807,0.819186,0.819125,0.818806
4,0.2556,0.426129,0.821101,0.821288,0.821335,0.8211
5,0.2342,0.428861,0.827982,0.827982,0.828092,0.827967
6,0.2197,0.435613,0.827982,0.828093,0.828176,0.827978
7,0.2072,0.443758,0.823394,0.823443,0.823546,0.823386
8,0.1989,0.454425,0.824541,0.82449,0.824461,0.824475
9,0.1921,0.463755,0.823394,0.823493,0.823167,0.823261
10,0.1874,0.464977,0.816514,0.816625,0.816705,0.81651


[I 2025-03-30 22:01:37,609] Trial 40 finished with value: 0.8153259275336583 and parameters: {'learning_rate': 1.0014689401833803e-05, 'weight_decay': 0.005, 'warmup_steps': 144}. Best is trial 23 with value: 0.8164664530353019.


Trial 41 with params: {'learning_rate': 1.2775087742305899e-05, 'weight_decay': 0.001, 'warmup_steps': 157}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4541,0.433555,0.803899,0.804878,0.804401,0.803868
2,0.311,0.41996,0.81078,0.811945,0.811327,0.810738
3,0.2621,0.419574,0.818807,0.819302,0.819167,0.818804
4,0.2324,0.4336,0.825688,0.825737,0.82584,0.82568
5,0.2125,0.441854,0.821101,0.821066,0.821167,0.821077
6,0.1988,0.456657,0.823394,0.823338,0.823419,0.823361
7,0.1873,0.4657,0.818807,0.818856,0.818957,0.818799
8,0.1794,0.48223,0.816514,0.816501,0.816368,0.816417
9,0.1731,0.492106,0.817661,0.817631,0.817536,0.817574
10,0.1689,0.496024,0.817661,0.817683,0.817789,0.817649


[I 2025-03-30 22:58:15,340] Trial 41 finished with value: 0.8107497010902807 and parameters: {'learning_rate': 1.2775087742305899e-05, 'weight_decay': 0.001, 'warmup_steps': 157}. Best is trial 23 with value: 0.8164664530353019.


Trial 42 with params: {'learning_rate': 1.0890360023105325e-05, 'weight_decay': 0.002, 'warmup_steps': 119}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.468,0.437943,0.800459,0.801351,0.800939,0.800432
2,0.3265,0.420551,0.813073,0.814432,0.813663,0.813018
3,0.2777,0.418631,0.816514,0.816891,0.816831,0.816513
4,0.2474,0.428055,0.823394,0.823506,0.823588,0.823391
5,0.2265,0.432392,0.821101,0.821101,0.821209,0.821086
6,0.2123,0.441739,0.826835,0.826816,0.826924,0.826816
7,0.2001,0.450301,0.822248,0.822326,0.822419,0.822242
8,0.192,0.46309,0.825688,0.825629,0.825629,0.825629


In [None]:
# NOTE(review): the recorded output of this cell is a NameError, i.e. `best_trial3`
# was not defined when it last ran — presumably the hyperparameter search that
# assigns it did not complete in that kernel session. TODO: confirm and re-run
# that search before executing this cell.
print(best_trial3)

NameError: name 'best_trial3' is not defined

In [None]:
# Project helper from `base`; judging by its name it re-seeds the random number
# generators so the search configured below starts from a known state —
# TODO confirm what `base.reset_seed` actually seeds.
base.reset_seed()

In [None]:
# Training configuration for the "bert-distill_aug_hp-search" run; results and
# logs are written to per-dataset directories keyed by DATASET.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill_aug_hp-search",
    logging_dir=f"~/logs/{DATASET}/bert-distill_aug_hp-search",
    # NOTE(review): presumably disabled so that extra dataset columns (e.g. the
    # stored teacher logits) still reach the trainer — confirm in base.DistilTrainer.
    remove_unused_columns=False,
    epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
def hp_space(trial):
    """Sample one hyperparameter configuration from ``trial`` and log it.

    The five values are drawn in a fixed order (learning rate, weight decay,
    warm-up steps, ``lambda_param``, ``temperature``), printed together with
    the trial number, and returned.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and a
            ``number`` attribute (an Optuna trial when this function is
            passed as ``hp_space`` to a hyperparameter search).

    Returns:
        dict: Hyperparameter name -> sampled value, in sampling order.
    """
    # Learning rate is sampled on a log scale between 1e-5 and 5e-4.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    # Weight decay is restricted to a grid of 1e-3 steps in [0, 1e-2].
    weight_decay = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    # Upper bound is the notebook-level `warm_up` value defined elsewhere.
    warmup_steps = trial.suggest_int("warmup_steps", 0, warm_up)
    # Distillation-specific knobs, each on a discrete grid.
    lambda_param = trial.suggest_float("lambda_param", 0, 1, step=.1)
    temperature = trial.suggest_float("temperature", 2, 7, step=.5)

    params = {
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "warmup_steps": warmup_steps,
        "lambda_param": lambda_param,
        "temperature": temperature,
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [None]:
# Hyperband pruner; the resource bounds come from the notebook-level
# `min_r` / `max_r` values defined elsewhere.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)
# Multivariate TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)



In [None]:
# Distillation trainer for the search on the augmented training set.
# `eval` is the tokenized evaluation split created near the top of the notebook
# (it shadows the Python builtin of the same name).
# `model_init` stays a zero-argument lambda so `get_Bert` is looked up and
# called with no arguments each time a model is requested.
trainer = base.DistilTrainer(
    model_init=lambda: get_Bert(),
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)
  

NameError: name 'BertForSequenceClassification' is not defined

In [None]:
# Run the Optuna-backed search and keep its result in `best_trial4`.
# The objective is the "eval_f1" entry of the evaluation metrics, maximized.
# NOTE(review): `pruner`, `sampler` and `study_name` are presumably forwarded
# to the Optuna study by the trainer — confirm for the installed version.
best_trial4 = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    backend="optuna",
    n_trials=150,
    study_name="Test-Distill-aug",
    sampler=sampler,
    pruner=pruner,
)

In [None]:
# Print the result of the `trainer.hyperparameter_search(...)` call that is
# assigned to `best_trial4` in the preceding cell.
# NOTE(review): the output recorded for this cell is a NameError, so that search
# had not produced `best_trial4` in the kernel when this cell last ran.
print(best_trial4)

NameError: name 'best_trial4' is not defined

In [None]:
# Summary cell: print the stored best-run objects of the individual searches
# one after another so they can be compared.
# `best_trial` and `best_trial2` are assigned outside this section of the
# notebook; `best_trial4` comes from the augmented distillation search.
# NOTE(review): the labels say "score" but the whole result object is printed,
# and "distilation trianing" is misspelled; the strings are left untouched so
# the printed text keeps matching the recorded output.
print("Best normal training score: ", best_trial)
print("Best distilation trianing score: ", best_trial2)
# Disabled: `best_trial3` was undefined when its own print cell last ran.
#print("Best normal training score with augmentations: ", best_trial3)
print("Best distilation trianing score with augmentations: ",best_trial4)

Best normal training score:  BestRun(run_id='132', objective=0.49578714001038604, hyperparameters={'learning_rate': 0.0004675471848767979, 'weight_decay': 0.01, 'warmup_steps': 4}, run_summary=None)
Best distilation trianing score:  BestRun(run_id='86', objective=0.4778879458794155, hyperparameters={'learning_rate': 0.00048481023093695626, 'weight_decay': 0.003, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.5}, run_summary=None)
Best distilation trianing score with augmentations:  BestRun(run_id='92', objective=0.7644517643387146, hyperparameters={'learning_rate': 0.0004922578519032032, 'weight_decay': 0.008, 'warmup_steps': 6, 'lambda_param': 1.0, 'temperature': 4.0}, run_summary=None)
