In [15]:
from transformers import Trainer, BertTokenizer, BertForSequenceClassification
from datasets import load_from_disk
import optuna
import torch
import math
import base

In [16]:
# Pick CUDA when torch reports it as available, otherwise fall back to the CPU,
# and print which of the two was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [17]:
# Dataset name; it is interpolated into the data, results and logs paths used by the cells below.
DATASET = "trec"

In [18]:
# Splits previously saved to disk under ~/data/<DATASET>/.
train_data = load_from_disk(
    f"~/data/{DATASET}/train-logits_fine"
)
eval_data = load_from_disk(
    f"~/data/{DATASET}/eval-logits_fine"
)
test_data = load_from_disk(
    f"~/data/{DATASET}/test-logits_fine"
)

# Augmented variant of the training split, stored next to the plain splits.
all_train_data = load_from_disk(
    f"~/data/{DATASET}/train-logits-augmented_fine"
)

# Tokenizer loaded from the checkpoint named below; it is used to tokenize every split.
tokenizer = BertTokenizer.from_pretrained(
    "ndavid/autotrain-trec-fine-bert-739422530"
)

In [19]:
def tokenize_batch(batch):
    """Tokenize the "sentence" column of a batch of examples.

    Every example is truncated or padded to exactly 300 tokens, so all rows
    end up with the same length.

    Args:
        batch: Mapping handed over by ``Dataset.map(..., batched=True)``; only
            its "sentence" entry is read.

    Returns:
        The tokenizer output for the whole batch.
    """
    # NOTE(review): return_tensors="pt" is probably redundant inside Dataset.map;
    # it is kept so the stored columns stay exactly as before — confirm before removing.
    return tokenizer(batch["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300)


# The same tokenization is applied to every split; only the progress-bar text differs.
train = train_data.map(tokenize_batch, batched=True, desc="Tokenizing the train dataset")
# NOTE: `eval` shadows the Python builtin; the name is kept because the Trainer cell passes it as eval_dataset.
eval = eval_data.map(tokenize_batch, batched=True, desc="Tokenizing the eval dataset")
test = test_data.map(tokenize_batch, batched=True, desc="Tokenizing the test dataset")

train_aug = all_train_data.map(tokenize_batch, batched=True, desc="Tokenizing the augmented dataset")

In [20]:
# Training length and batch size; both are passed to base.get_training_args and
# are also used to convert epochs into training steps.
num_epochs = 30
batch_size = 128

In [21]:
# Convert epochs to training steps (one step per batch of `batch_size` examples is assumed).
data_length = len(train_data)
steps_per_epoch = math.ceil(data_length / batch_size)

# Step counts equivalent to 5 epochs and to `num_epochs` epochs; used as the
# pruner's minimum and maximum resource.
min_r = steps_per_epoch * 5
max_r = steps_per_epoch * num_epochs

# Upper bound of the warm-up search range: a tenth of one epoch's batches, rounded up.
warm_up = math.ceil(data_length / batch_size / 10)

In [22]:
def hp_space(trial):
    """Sample one hyperparameter configuration from the given trial.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and ``number``
            (an Optuna trial when called by the hyperparameter search).

    Returns:
        Dict with the keys "learning_rate", "weight_decay", "adam_beta1" and
        "warmup_steps", in that order.
    """
    # The suggest calls are made in a fixed order: learning rate on a log scale,
    # the next two on a fixed grid, warm-up as an integer bounded by the global `warm_up`.
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 5e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    adam_beta1 = trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01)
    warmup_steps = trial.suggest_int("warmup_steps", 0, warm_up)

    params = dict(
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        adam_beta1=adam_beta1,
        warmup_steps=warmup_steps,
    )
    # Log the sampled configuration so each trial's output can be matched to its parameters.
    print(f"Trial {trial.number} with params: {params}")
    return params

In [23]:
# Hyperband pruner whose resource bounds are the step counts computed earlier (min_r / max_r).
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)

# TPE sampler with a fixed seed so the sequence of suggested configurations is repeatable.
sampler = optuna.samplers.TPESampler(
    multivariate=True,
    seed=42,
)



In [24]:
def get_Bert(model_name="google/bert_uncased_L-2_H-128_A-2", num_labels=50):
    """Load a BERT sequence-classification model from a pretrained checkpoint.

    Called without arguments it behaves exactly as before; the parameters only
    make the checkpoint and the number of classes configurable.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
            Defaults to the checkpoint this notebook has been using.
        num_labels: Number of labels forwarded to ``from_pretrained``.
            Defaults to 50.

    Returns:
        The model returned by ``BertForSequenceClassification.from_pretrained``.
    """
    # NOTE(review): keep wrapping this in a zero-argument callable when handing it to
    # Trainer(model_init=...); Trainer presumably inspects the argument count — confirm.
    return BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [25]:
# Reset the random seed(s) through the project helper before the training objects are built.
# NOTE(review): presumably seeds torch / numpy / random — confirm in base.py.
base.reset_seed()

In [26]:
# Training configuration is built by the project helper in `base`; results and logs
# go to per-dataset directories under the home folder.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base_fine_hp-search",
    logging_dir=f"~/logs/{DATASET}/bert-base_fine_hp-search",
    epochs=num_epochs,
    batch_size=batch_size,
)

In [27]:
# Trainer that drives the hyperparameter search below.
# A model_init callable is passed instead of a fixed model so a fresh model can be
# created through get_Bert() whenever the Trainer needs one.
# No early-stopping callback is registered (EarlyStoppingCallback with patience 4 is disabled).
trainer = Trainer(
    model_init=lambda: get_Bert(),
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
def eval_f1_objective(metrics):
    """Return the value the search optimizes: the "eval_f1" entry of the metrics dict."""
    return metrics["eval_f1"]


# Run the Optuna-backed search: 150 trials, maximizing the objective above, with the
# pruner and sampler configured earlier in the notebook.
best_trial = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=eval_f1_objective,
    direction="maximize",
    backend="optuna",
    n_trials=150,
    study_name="Test-base",
    pruner=pruner,
    sampler=sampler,
)

[I 2025-03-15 09:38:21,587] A new study created in memory with name: Test-base


Trial 0 with params: {'learning_rate': 1.0253509690168497e-05, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8673,3.826696,0.042163,0.008459,0.025886,0.006274
2,3.8083,3.777743,0.157654,0.010138,0.019207,0.00945
3,3.7698,3.734266,0.186984,0.033938,0.023454,0.011374
4,3.7285,3.698701,0.185151,0.011023,0.022636,0.01012
5,3.704,3.666467,0.186984,0.016542,0.023014,0.010992
6,3.6668,3.635868,0.181485,0.017853,0.02137,0.008555
7,3.6379,3.607637,0.180568,0.019561,0.021096,0.008097
8,3.6148,3.580955,0.180568,0.019561,0.021096,0.008097
9,3.5884,3.555389,0.180568,0.023558,0.021096,0.008119
10,3.5663,3.530838,0.180568,0.023558,0.021096,0.008119


[I 2025-03-15 09:39:12,558] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 2.636875533972305e-06, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8874,3.868106,0.010082,0.004258,0.022089,0.002417
2,3.8647,3.852914,0.014665,0.004418,0.022256,0.00287
3,3.8561,3.839477,0.027498,0.004833,0.023356,0.003981
4,3.8408,3.826836,0.049496,0.035579,0.027371,0.0082
5,3.8335,3.814686,0.074244,0.029712,0.029648,0.008795


[I 2025-03-15 09:39:39,287] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 4.191711516695204e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8091,3.696986,0.185151,0.012607,0.022466,0.009983
2,3.6426,3.55753,0.179652,0.023548,0.020822,0.007605
3,3.5204,3.427715,0.189734,0.043584,0.023764,0.012656
4,3.393,3.304562,0.31714,0.073325,0.061969,0.054421
5,3.2964,3.193207,0.386801,0.070009,0.082251,0.065953


[I 2025-03-15 09:40:08,612] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0001764971584817573, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6041,3.324278,0.176902,0.003538,0.02,0.006012
2,3.1489,2.925156,0.410632,0.07081,0.089797,0.065725
3,2.7886,2.575201,0.453712,0.103001,0.112565,0.083142
4,2.4498,2.284197,0.545371,0.207764,0.174382,0.158651
5,2.1915,2.049921,0.598533,0.2631,0.217085,0.202227
6,1.9271,1.856178,0.650779,0.275616,0.264156,0.249957
7,1.7139,1.705789,0.676444,0.337832,0.288408,0.276678
8,1.5588,1.589029,0.702108,0.366951,0.332002,0.318553
9,1.3999,1.489767,0.708524,0.344747,0.336943,0.319753
10,1.264,1.410418,0.718607,0.358795,0.352657,0.335164


[I 2025-03-15 09:41:09,905] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 6.624310605949985e-06, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8757,3.844629,0.018332,0.003887,0.02232,0.002922
2,3.8329,3.811327,0.074244,0.007133,0.028811,0.007069
3,3.8075,3.779107,0.153987,0.00988,0.018793,0.009386
4,3.7757,3.749092,0.183318,0.018724,0.022869,0.011068
5,3.755,3.722877,0.190651,0.015503,0.02437,0.012025


[I 2025-03-15 09:41:37,718] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 4.480975918214949e-05, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7971,3.682824,0.183318,0.014353,0.021918,0.009335
2,3.627,3.539651,0.179652,0.023548,0.020822,0.007605
3,3.5007,3.405065,0.193401,0.043597,0.02486,0.014219
4,3.3686,3.278097,0.333639,0.070008,0.066459,0.057701
5,3.269,3.164247,0.394134,0.079038,0.084201,0.066301
6,3.1496,3.061096,0.410632,0.095822,0.089549,0.069288
7,3.0448,2.964983,0.418882,0.092273,0.094191,0.073619
8,2.9623,2.877033,0.442713,0.087469,0.105601,0.08324
9,2.8728,2.796171,0.448213,0.08448,0.107805,0.081959
10,2.7974,2.722031,0.464711,0.104504,0.119127,0.094073


[I 2025-03-15 09:42:28,646] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 1.7018418817029176e-05, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8523,3.793196,0.12374,0.009476,0.034929,0.008628
2,3.7638,3.716041,0.185151,0.018036,0.022906,0.010897
3,3.7045,3.6557,0.181485,0.012675,0.02137,0.00846
4,3.6446,3.601826,0.185151,0.021577,0.022466,0.010407
5,3.6033,3.545932,0.180568,0.023564,0.021096,0.008128


[I 2025-03-15 09:42:56,365] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 3.971084710792477e-05, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8019,3.699815,0.191567,0.012598,0.024554,0.012295
2,3.6514,3.577038,0.176902,0.003538,0.02,0.006012
3,3.545,3.464563,0.177819,0.023541,0.020274,0.006558
4,3.4325,3.359885,0.219982,0.073712,0.032764,0.025181
5,3.3497,3.257524,0.329973,0.071322,0.06553,0.057387
6,3.2451,3.165879,0.384968,0.080572,0.081672,0.065432
7,3.1528,3.0818,0.407883,0.075603,0.087804,0.06601
8,3.0797,3.002302,0.415215,0.073261,0.090896,0.067756
9,3.0003,2.928501,0.421632,0.072187,0.09575,0.074489
10,2.932,2.860335,0.437214,0.087341,0.102646,0.078603


[I 2025-03-15 09:43:48,729] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.4982086432155468e-06, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8904,3.874964,0.007333,0.003625,0.021778,0.002019
2,3.8745,3.865734,0.009166,0.003681,0.021634,0.002025
3,3.8712,3.85742,0.013749,0.004444,0.022153,0.002741
4,3.8604,3.849826,0.015582,0.003999,0.022009,0.002666
5,3.857,3.842901,0.019248,0.004041,0.022423,0.003047
6,3.8458,3.836313,0.029331,0.025005,0.024049,0.004821
7,3.8391,3.830123,0.035747,0.030279,0.025295,0.006044
8,3.8346,3.824315,0.047663,0.029865,0.026643,0.007005
9,3.8294,3.818778,0.054995,0.007966,0.026986,0.006512
10,3.8237,3.813504,0.073327,0.007138,0.028708,0.007053


[I 2025-03-15 09:44:40,043] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 6.639623079859462e-06, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8762,3.845111,0.018332,0.004052,0.02232,0.002962
2,3.8333,3.811925,0.073327,0.007152,0.028708,0.007044
3,3.808,3.780201,0.150321,0.008878,0.018027,0.008642
4,3.7767,3.750859,0.183318,0.022054,0.022869,0.011081
5,3.7564,3.724987,0.189734,0.015193,0.024096,0.011719


[I 2025-03-15 09:45:06,538] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.0003327590120039615, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4372,3.010705,0.394134,0.057087,0.08445,0.063515
2,2.7394,2.418249,0.494042,0.169154,0.136597,0.114692
3,2.2009,1.966449,0.600367,0.267899,0.237507,0.226177
4,1.7704,1.675867,0.686526,0.338317,0.309582,0.292793
5,1.4598,1.470566,0.71769,0.340215,0.352231,0.32962
6,1.2085,1.319039,0.725023,0.386151,0.365073,0.349412
7,1.0077,1.240175,0.734189,0.39726,0.376775,0.361396
8,0.8667,1.186004,0.756187,0.426146,0.423945,0.405376
9,0.7407,1.132536,0.753437,0.469212,0.431798,0.426584
10,0.6317,1.094384,0.764436,0.480821,0.439009,0.443816


[I 2025-03-15 09:46:57,615] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.0003522178034287917, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4189,2.975949,0.402383,0.055383,0.086363,0.064005
2,2.6956,2.367807,0.505958,0.176992,0.144579,0.126175
3,2.1428,1.911507,0.626031,0.29906,0.255965,0.245454
4,1.7078,1.624111,0.692026,0.329687,0.314608,0.297772
5,1.3962,1.432057,0.72044,0.350946,0.358072,0.335169
6,1.1519,1.282835,0.733272,0.38285,0.360638,0.344969
7,0.9527,1.211352,0.737855,0.41562,0.391923,0.378423
8,0.8144,1.161197,0.757104,0.454024,0.432154,0.419232
9,0.6895,1.111417,0.757104,0.470745,0.44491,0.442304
10,0.5825,1.072799,0.764436,0.485935,0.444087,0.449272


[I 2025-03-15 09:48:51,142] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.0001253755316943676, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.667,3.443404,0.176902,0.003538,0.02,0.006012
2,3.3072,3.120686,0.384968,0.060333,0.081479,0.063911
3,3.0225,2.843782,0.433547,0.068327,0.100877,0.075838
4,2.7471,2.59674,0.462878,0.098269,0.117706,0.091629
5,2.5342,2.376079,0.522456,0.167433,0.159024,0.140469
6,2.3039,2.198604,0.572869,0.227967,0.19554,0.180611
7,2.1163,2.052144,0.595784,0.244274,0.214195,0.202294
8,1.9724,1.926363,0.649863,0.309583,0.271876,0.261446
9,1.8198,1.811438,0.663611,0.285827,0.277576,0.264113
10,1.687,1.716246,0.688359,0.361282,0.314251,0.304716


[I 2025-03-15 09:50:37,096] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.0004449518806372288, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3817,2.872418,0.415215,0.068809,0.096303,0.074193
2,2.5518,2.203707,0.550871,0.216713,0.186537,0.174408
3,1.9304,1.716975,0.669111,0.299072,0.286316,0.267099
4,1.4686,1.445356,0.702108,0.326728,0.322378,0.304091
5,1.1531,1.296354,0.734189,0.360052,0.3787,0.35178
6,0.9262,1.172018,0.747021,0.432753,0.388149,0.379087
7,0.7496,1.129712,0.75527,0.489186,0.440263,0.43742
8,0.6271,1.08887,0.75802,0.473226,0.456779,0.449153
9,0.5139,1.049461,0.761687,0.498873,0.472802,0.47308
10,0.4236,1.032657,0.765353,0.478292,0.45232,0.453588


[I 2025-03-15 09:53:22,300] Trial 13 finished with value: 0.7034076415908637 and parameters: {'learning_rate': 0.0004449518806372288, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 1}. Best is trial 13 with value: 0.7034076415908637.


Trial 14 with params: {'learning_rate': 0.0002223123214912636, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5922,3.266318,0.180568,0.043551,0.021024,0.008028
2,3.06,2.801454,0.424381,0.0615,0.099022,0.072819
3,2.6372,2.405925,0.495875,0.129852,0.138585,0.115885
4,2.2572,2.093859,0.572869,0.183873,0.198074,0.179959
5,1.9646,1.846426,0.647113,0.302119,0.259143,0.248374
6,1.6803,1.643362,0.696609,0.364683,0.316996,0.309421
7,1.4515,1.503017,0.705775,0.340973,0.327517,0.315294
8,1.2919,1.402086,0.726856,0.362094,0.367392,0.345679
9,1.1388,1.325498,0.734189,0.358743,0.373929,0.356387
10,1.007,1.268316,0.750687,0.425207,0.402843,0.394821


[I 2025-03-15 09:55:12,564] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.0004839141884869, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3562,2.822894,0.420715,0.066254,0.099744,0.075367
2,2.4869,2.13344,0.557287,0.198798,0.190561,0.174529
3,1.8435,1.641774,0.676444,0.286341,0.29634,0.275298
4,1.3804,1.382836,0.708524,0.335211,0.330472,0.314049
5,1.0698,1.256555,0.736022,0.374673,0.375889,0.35137
6,0.8504,1.133198,0.752521,0.453194,0.404045,0.40124
7,0.6796,1.113636,0.759853,0.469173,0.45305,0.446436
8,0.5669,1.070914,0.765353,0.496792,0.473603,0.4703
9,0.4591,1.031723,0.766269,0.484484,0.481521,0.474505
10,0.3683,1.010424,0.768103,0.478903,0.474066,0.470531


[I 2025-03-15 09:57:01,059] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.00019200962492670843, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6136,3.31654,0.176902,0.003538,0.02,0.006012
2,3.1303,2.893987,0.412466,0.070942,0.091925,0.06933
3,2.7483,2.525989,0.461962,0.125173,0.118583,0.090627
4,2.3927,2.224229,0.557287,0.203924,0.182503,0.165845
5,2.1199,1.983265,0.607699,0.293279,0.227151,0.215968
6,1.8467,1.783638,0.668194,0.303654,0.278497,0.265349
7,1.6268,1.63573,0.688359,0.341924,0.30086,0.291776
8,1.4683,1.520844,0.713107,0.354929,0.341462,0.322491
9,1.3101,1.429763,0.71494,0.353042,0.350833,0.335567
10,1.1738,1.359206,0.730522,0.36916,0.374041,0.359385


[I 2025-03-15 09:58:52,253] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.0003518674002568535, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.463,3.053541,0.353804,0.066554,0.071733,0.056794
2,2.7848,2.478107,0.454629,0.115564,0.116799,0.089047
3,2.2607,2.035408,0.56462,0.215085,0.191624,0.174802
4,1.8211,1.728583,0.665445,0.309581,0.288001,0.272
5,1.5027,1.501738,0.708524,0.36682,0.350635,0.330457
6,1.2254,1.32779,0.726856,0.36091,0.357714,0.340822
7,1.0021,1.24213,0.734189,0.376558,0.392067,0.372448
8,0.8555,1.190875,0.753437,0.428164,0.423477,0.40737
9,0.7287,1.138039,0.752521,0.466428,0.438964,0.433579
10,0.6171,1.118648,0.758937,0.477567,0.45045,0.448462


[I 2025-03-15 10:00:45,379] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.00044774926371395345, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3891,2.875515,0.416132,0.072124,0.094344,0.072598
2,2.5525,2.201509,0.549954,0.209289,0.185004,0.170074
3,1.9317,1.724058,0.659945,0.322797,0.288172,0.272777
4,1.4729,1.451723,0.701192,0.31504,0.314717,0.29728
5,1.161,1.319734,0.733272,0.377535,0.385675,0.360683
6,0.9263,1.199748,0.746104,0.422343,0.409283,0.396751
7,0.7514,1.141133,0.752521,0.466819,0.434935,0.43154
8,0.6304,1.099949,0.762603,0.463691,0.461363,0.448306
9,0.5136,1.04296,0.768103,0.50416,0.47667,0.476533
10,0.4228,1.031526,0.769936,0.482453,0.470611,0.468023


[I 2025-03-15 10:03:26,266] Trial 18 finished with value: 0.6721048077147119 and parameters: {'learning_rate': 0.00044774926371395345, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 13 with value: 0.7034076415908637.


Trial 19 with params: {'learning_rate': 0.00025421502789618744, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5474,3.184589,0.292392,0.072515,0.05412,0.045142
2,2.9506,2.662975,0.449129,0.082464,0.110887,0.079648
3,2.4782,2.236907,0.541705,0.195753,0.173261,0.157039
4,2.0734,1.922567,0.640697,0.310003,0.267859,0.255542
5,1.7647,1.687317,0.695692,0.335174,0.317789,0.304163
6,1.4893,1.497694,0.715857,0.354108,0.341318,0.326613
7,1.2699,1.383525,0.724106,0.348619,0.352036,0.337568
8,1.1198,1.315917,0.745188,0.402964,0.405178,0.381929
9,0.9875,1.239282,0.750687,0.42481,0.407654,0.396401
10,0.8653,1.193924,0.753437,0.435644,0.406938,0.401546


[I 2025-03-15 10:05:15,647] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 1.6173144067582056e-06, 'weight_decay': 0.008, 'adam_beta1': 0.97, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8894,3.873489,0.007333,0.003577,0.021778,0.002029
2,3.8727,3.86368,0.010082,0.003815,0.021738,0.002153
3,3.869,3.854964,0.013749,0.004285,0.022153,0.00274
4,3.8577,3.846903,0.017415,0.003983,0.022216,0.002845
5,3.8541,3.839314,0.028414,0.005268,0.023811,0.004425
6,3.8424,3.832316,0.035747,0.028239,0.025476,0.006349
7,3.8352,3.825656,0.051329,0.033938,0.027227,0.007711
8,3.8303,3.819247,0.057745,0.029732,0.027953,0.007819
9,3.8246,3.813191,0.078827,0.008716,0.02968,0.007978
10,3.8186,3.807405,0.092576,0.00968,0.031586,0.008861


[I 2025-03-15 10:06:08,959] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.00019320445253174156, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6261,3.359946,0.176902,0.003538,0.02,0.006012
2,3.2072,3.029256,0.36022,0.060941,0.074539,0.056072
3,2.9228,2.761611,0.427131,0.064416,0.098081,0.072022
4,2.6528,2.519797,0.453712,0.09442,0.118262,0.09113
5,2.4529,2.320155,0.48396,0.131432,0.14175,0.120534
6,2.2246,2.138783,0.503208,0.177459,0.153831,0.134565
7,2.0227,1.982812,0.578368,0.203854,0.201065,0.180162
8,1.8658,1.851292,0.595784,0.232196,0.231009,0.203597
9,1.7,1.719997,0.610449,0.223226,0.232581,0.205222
10,1.5503,1.625941,0.667278,0.295175,0.300929,0.269486


[I 2025-03-15 10:07:54,929] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.0001668393524829016, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6374,3.361547,0.176902,0.003538,0.02,0.006012
2,3.1875,2.961573,0.407883,0.093877,0.088962,0.066628
3,2.8295,2.614855,0.458295,0.103058,0.115369,0.088315
4,2.4974,2.327456,0.536205,0.187387,0.166025,0.149057
5,2.2417,2.092039,0.593034,0.277076,0.214242,0.200573
6,1.9794,1.899177,0.644363,0.284641,0.262266,0.250887
7,1.7689,1.746739,0.67461,0.365632,0.290854,0.281471
8,1.6131,1.628473,0.704858,0.371371,0.334254,0.322368
9,1.4535,1.522167,0.711274,0.362209,0.339215,0.324705
10,1.3167,1.442157,0.716774,0.36801,0.346639,0.3313


[I 2025-03-15 10:08:52,936] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.0004951185712772382, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3544,2.809672,0.425298,0.066495,0.098964,0.075264
2,2.4646,2.107485,0.560953,0.230323,0.201904,0.187669
3,1.8231,1.631352,0.675527,0.291192,0.302115,0.282497
4,1.3645,1.378104,0.724106,0.337756,0.34295,0.325717
5,1.0566,1.256115,0.742438,0.411225,0.404124,0.38499
6,0.8305,1.138978,0.749771,0.46096,0.420699,0.417828
7,0.6617,1.106931,0.749771,0.469086,0.448686,0.443968
8,0.552,1.084435,0.758937,0.502588,0.467873,0.466165
9,0.4384,1.021252,0.777269,0.523088,0.49383,0.495122
10,0.3546,1.020649,0.774519,0.535814,0.487131,0.493944


[I 2025-03-15 10:11:39,645] Trial 23 finished with value: 0.6792130293173753 and parameters: {'learning_rate': 0.0004951185712772382, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 13 with value: 0.7034076415908637.


Trial 24 with params: {'learning_rate': 0.0001946670988041245, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6108,3.311188,0.176902,0.003538,0.02,0.006012
2,3.1232,2.885005,0.415215,0.070305,0.09343,0.070682
3,2.7377,2.514435,0.465628,0.126781,0.121102,0.094455
4,2.3798,2.211711,0.560037,0.205188,0.184043,0.167424
5,2.1053,1.970304,0.610449,0.288733,0.23013,0.218857
6,1.8311,1.770309,0.667278,0.303681,0.27814,0.265187
7,1.6104,1.623082,0.690192,0.356573,0.309569,0.303415
8,1.4521,1.509766,0.713107,0.354663,0.341462,0.322299
9,1.2945,1.420034,0.715857,0.354434,0.352262,0.337342
10,1.1584,1.350623,0.735105,0.373045,0.380803,0.365117


[I 2025-03-15 10:13:25,752] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.0004873379869051783, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3852,2.861707,0.416132,0.070786,0.096518,0.074874
2,2.5206,2.16373,0.540788,0.21583,0.183967,0.171783
3,1.8809,1.668993,0.670027,0.288436,0.291551,0.270628
4,1.4128,1.405193,0.703941,0.323523,0.32269,0.302926
5,1.0904,1.267122,0.738772,0.402925,0.395584,0.371112
6,0.8559,1.156745,0.749771,0.468544,0.412684,0.411409
7,0.6815,1.106336,0.758937,0.480257,0.473015,0.465655
8,0.5708,1.072843,0.762603,0.48638,0.471693,0.465823
9,0.4599,1.034702,0.768103,0.477737,0.47826,0.471279
10,0.3676,1.028865,0.765353,0.461672,0.473162,0.463816


[I 2025-03-15 10:16:06,055] Trial 25 finished with value: 0.713771496767995 and parameters: {'learning_rate': 0.0004873379869051783, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 3}. Best is trial 25 with value: 0.713771496767995.


Trial 26 with params: {'learning_rate': 0.00036485975145723236, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4807,3.03559,0.362053,0.063729,0.076172,0.05765
2,2.7361,2.397801,0.503208,0.152157,0.149534,0.12999
3,2.1595,1.913382,0.616865,0.307896,0.246833,0.234856
4,1.7061,1.624993,0.695692,0.32943,0.316704,0.299107
5,1.3779,1.424653,0.724106,0.357703,0.363074,0.339401
6,1.1205,1.270166,0.741522,0.401348,0.383789,0.367852
7,0.9175,1.198814,0.749771,0.432743,0.406419,0.395643
8,0.7839,1.147538,0.76352,0.463775,0.449789,0.441745
9,0.6598,1.098641,0.765353,0.501285,0.473334,0.474552
10,0.5591,1.082698,0.766269,0.481187,0.468603,0.462072


[I 2025-03-15 10:18:50,650] Trial 26 finished with value: 0.651926342895605 and parameters: {'learning_rate': 0.00036485975145723236, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 4}. Best is trial 25 with value: 0.713771496767995.


Trial 27 with params: {'learning_rate': 0.00048712164237687974, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3899,2.875697,0.408799,0.074989,0.093054,0.072795
2,2.5396,2.182945,0.534372,0.194605,0.179273,0.159774
3,1.9031,1.687192,0.663611,0.279213,0.287394,0.267503
4,1.4331,1.431242,0.696609,0.300685,0.317188,0.290889
5,1.1087,1.285738,0.731439,0.394336,0.383973,0.35891
6,0.8731,1.160569,0.747021,0.442751,0.407728,0.397502
7,0.6929,1.10985,0.753437,0.488289,0.457808,0.452436
8,0.5783,1.074225,0.761687,0.479737,0.464107,0.45852
9,0.4677,1.031456,0.770852,0.485178,0.479871,0.473529
10,0.3768,1.030198,0.76352,0.482655,0.475304,0.469942


[I 2025-03-15 10:21:34,222] Trial 27 finished with value: 0.711448611893853 and parameters: {'learning_rate': 0.00048712164237687974, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 3}. Best is trial 25 with value: 0.713771496767995.


Trial 28 with params: {'learning_rate': 0.00043402962575343795, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.451,3.001331,0.36022,0.063906,0.074836,0.057278
2,2.6984,2.385731,0.474794,0.143524,0.132316,0.107745
3,2.1337,1.907023,0.600367,0.241314,0.237691,0.215368
4,1.6634,1.593915,0.690192,0.306495,0.315368,0.289206
5,1.3193,1.387321,0.71494,0.342811,0.358938,0.33276
6,1.0494,1.236084,0.745188,0.450464,0.418355,0.407533
7,0.8409,1.171244,0.745188,0.418844,0.407558,0.394263
8,0.6991,1.119355,0.76077,0.485049,0.462433,0.459105
9,0.5759,1.07801,0.768103,0.519514,0.490734,0.492011
10,0.4775,1.089035,0.76077,0.487492,0.48185,0.472383


[I 2025-03-15 10:24:14,713] Trial 28 finished with value: 0.677724449478389 and parameters: {'learning_rate': 0.00043402962575343795, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}. Best is trial 25 with value: 0.713771496767995.


Trial 29 with params: {'learning_rate': 0.0002755879459606728, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5483,3.190046,0.224565,0.036334,0.033457,0.025379
2,2.961,2.685336,0.439047,0.080616,0.107454,0.077147
3,2.5009,2.269268,0.516957,0.144051,0.152518,0.131393
4,2.0909,1.944497,0.59945,0.259428,0.223165,0.20583
5,1.7735,1.69049,0.681027,0.336968,0.300691,0.286147
6,1.4804,1.490884,0.713107,0.340555,0.337443,0.323538
7,1.2437,1.374671,0.71494,0.343896,0.337263,0.323111
8,1.0868,1.296757,0.743355,0.408984,0.405945,0.386111
9,0.9437,1.231935,0.745188,0.404205,0.403509,0.38707
10,0.8204,1.193969,0.758937,0.490599,0.433264,0.434065


[I 2025-03-15 10:25:08,501] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.00027086882126714745, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5546,3.18508,0.28231,0.052393,0.050771,0.041793
2,2.9402,2.64244,0.444546,0.088409,0.110123,0.079615
3,2.4499,2.204448,0.547204,0.220815,0.17845,0.164385
4,2.0323,1.884424,0.644363,0.293246,0.271097,0.257291
5,1.7116,1.639735,0.705775,0.343792,0.334846,0.320584
6,1.4292,1.451692,0.716774,0.35099,0.346397,0.331531
7,1.206,1.345203,0.72044,0.345813,0.349176,0.333529
8,1.056,1.288666,0.749771,0.4053,0.411122,0.387876
9,0.9275,1.214523,0.753437,0.431672,0.419365,0.406861
10,0.8056,1.1697,0.75527,0.436788,0.411731,0.404291


[I 2025-03-15 10:26:01,814] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.00023665444713746742, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5647,3.218992,0.256645,0.07665,0.04288,0.035435
2,2.996,2.720794,0.437214,0.061411,0.105498,0.075671
3,2.546,2.309584,0.515124,0.165705,0.152593,0.134927
4,2.156,1.997207,0.609533,0.289805,0.234191,0.221816
5,1.8539,1.754989,0.683776,0.341923,0.307213,0.296206
6,1.5746,1.562187,0.707608,0.356442,0.331172,0.319693
7,1.3521,1.435514,0.71494,0.321935,0.327241,0.310434
8,1.1986,1.356616,0.735105,0.393458,0.384405,0.361924
9,1.0597,1.276526,0.736022,0.376631,0.377866,0.362502
10,0.9343,1.227765,0.750687,0.439124,0.408161,0.402805


[I 2025-03-15 10:27:47,524] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.0003692254257662043, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4628,3.006428,0.381302,0.060096,0.081008,0.061017
2,2.7043,2.364909,0.513291,0.17679,0.154715,0.138435
3,2.1232,1.882494,0.641613,0.318542,0.266771,0.257772
4,1.6693,1.591036,0.696609,0.308896,0.312208,0.294653
5,1.3477,1.412908,0.719523,0.348895,0.360266,0.335635
6,1.0992,1.259933,0.736939,0.40449,0.376749,0.362838
7,0.9033,1.195505,0.743355,0.432458,0.410583,0.396424
8,0.7726,1.145831,0.756187,0.467427,0.444078,0.435122
9,0.6486,1.084206,0.767186,0.50298,0.474033,0.473657
10,0.5454,1.070478,0.770852,0.473553,0.467874,0.461314


[I 2025-03-15 10:30:29,236] Trial 32 finished with value: 0.6527034994036484 and parameters: {'learning_rate': 0.0003692254257662043, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 3}. Best is trial 25 with value: 0.713771496767995.


Trial 33 with params: {'learning_rate': 1.2161047690501487e-06, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8912,3.876736,0.008249,0.004204,0.022233,0.002464
2,3.877,3.869148,0.010082,0.00439,0.022089,0.002458
3,3.8753,3.862123,0.010082,0.003545,0.021738,0.002126
4,3.8656,3.855793,0.013749,0.004369,0.022153,0.002745
5,3.8633,3.84991,0.015582,0.003936,0.022009,0.002648
6,3.8533,3.844429,0.020165,0.004216,0.022527,0.003167
7,3.8474,3.839283,0.026581,0.005103,0.023603,0.004237
8,3.844,3.83445,0.032081,0.025387,0.024711,0.00538
9,3.8397,3.829863,0.036664,0.030243,0.025399,0.006082
10,3.8349,3.825515,0.044913,0.030504,0.026332,0.006682


[I 2025-03-15 10:32:18,248] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 3.6875829250628446e-05, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8125,3.717633,0.197067,0.020232,0.026287,0.013269
2,3.674,3.610057,0.176902,0.003538,0.02,0.006012
3,3.5828,3.516364,0.176902,0.003538,0.02,0.006012
4,3.4867,3.431469,0.176902,0.003538,0.02,0.006012
5,3.4207,3.346731,0.178735,0.023545,0.020548,0.007089
6,3.3345,3.271289,0.228231,0.075042,0.035176,0.028247
7,3.2581,3.203288,0.31989,0.069716,0.062808,0.053385
8,3.1993,3.138176,0.370302,0.063325,0.077249,0.062111
9,3.1359,3.078069,0.389551,0.059374,0.082475,0.06421
10,3.0827,3.022057,0.394134,0.055802,0.083573,0.063255


[I 2025-03-15 10:33:10,117] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.00028825028916564503, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.535,3.161306,0.288726,0.072758,0.05272,0.04335
2,2.9174,2.625308,0.444546,0.085219,0.110123,0.078588
3,2.4316,2.197621,0.536205,0.180434,0.165777,0.145873
4,2.0167,1.881277,0.612282,0.267668,0.237511,0.22472
5,1.6982,1.630718,0.697525,0.354632,0.327383,0.311242
6,1.4067,1.437995,0.719523,0.350835,0.350178,0.334491
7,1.1745,1.333214,0.72044,0.342199,0.345976,0.330475
8,1.0204,1.260923,0.745188,0.400939,0.406162,0.384668
9,0.8858,1.20383,0.749771,0.416582,0.415569,0.399949
10,0.7628,1.17213,0.75527,0.496358,0.438631,0.443034


[I 2025-03-15 10:35:52,395] Trial 35 finished with value: 0.5656240822367657 and parameters: {'learning_rate': 0.00028825028916564503, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 3}. Best is trial 25 with value: 0.713771496767995.


Trial 36 with params: {'learning_rate': 0.0004455699071177728, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3908,2.878551,0.415215,0.072015,0.093699,0.071787
2,2.5566,2.206064,0.549038,0.210915,0.18455,0.170415
3,1.9368,1.728372,0.660862,0.324055,0.288275,0.272936
4,1.4781,1.455531,0.701192,0.315265,0.314717,0.297387
5,1.1659,1.322307,0.732356,0.386549,0.384342,0.359881
6,0.9309,1.201065,0.747938,0.433711,0.409824,0.396876
7,0.7554,1.141269,0.753437,0.466652,0.435173,0.431408
8,0.6338,1.101549,0.76352,0.464723,0.466265,0.451055
9,0.5172,1.043104,0.768103,0.508115,0.480296,0.479539
10,0.4268,1.031891,0.769019,0.483167,0.466513,0.465721


[I 2025-03-15 10:38:40,977] Trial 36 finished with value: 0.6740922743074372 and parameters: {'learning_rate': 0.0004455699071177728, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 2}. Best is trial 25 with value: 0.713771496767995.


Trial 37 with params: {'learning_rate': 0.0004242582047930815, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4318,2.954758,0.373052,0.078597,0.080304,0.059518
2,2.643,2.29981,0.522456,0.176465,0.165034,0.146096
3,2.0429,1.806176,0.641613,0.281055,0.263426,0.246867
4,1.5747,1.528529,0.690192,0.313108,0.325986,0.301925
5,1.2422,1.368002,0.719523,0.363637,0.362768,0.336324
6,0.9919,1.210244,0.744271,0.453239,0.410007,0.404067
7,0.8036,1.147501,0.758937,0.481636,0.453899,0.444928
8,0.6796,1.099383,0.76352,0.484251,0.466965,0.460502
9,0.5603,1.067924,0.757104,0.471616,0.4718,0.462725
10,0.4674,1.056194,0.76077,0.455922,0.469015,0.457141


[I 2025-03-15 10:40:28,271] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 1.1626268513139648e-06, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8911,3.876778,0.008249,0.004173,0.022233,0.002446
2,3.8773,3.869459,0.010082,0.004468,0.022089,0.002463
3,3.8759,3.862891,0.010082,0.003595,0.021738,0.002132
4,3.8665,3.856837,0.014665,0.004439,0.022256,0.002816
5,3.8646,3.851101,0.019248,0.004978,0.022774,0.003478
6,3.855,3.845855,0.023831,0.004915,0.023293,0.003906
7,3.8491,3.840852,0.029331,0.00925,0.024265,0.005224
8,3.8459,3.836077,0.032081,0.00837,0.024576,0.005353
9,3.8417,3.831641,0.04033,0.027906,0.025994,0.006626
10,3.8371,3.827292,0.051329,0.036069,0.027578,0.008371


[I 2025-03-15 10:42:16,540] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 2.744905812550553e-06, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8871,3.867419,0.009166,0.003849,0.021634,0.002023
2,3.8637,3.851833,0.014665,0.004378,0.022256,0.002884
3,3.8546,3.837835,0.028414,0.005229,0.023811,0.004423
4,3.839,3.824835,0.053162,0.035998,0.027786,0.00853
5,3.8314,3.812328,0.082493,0.009595,0.030446,0.008714


[I 2025-03-15 10:42:44,539] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 8.598520700165698e-06, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8719,3.835581,0.030247,0.025419,0.024503,0.00526
2,3.82,3.793098,0.119157,0.009668,0.03441,0.008785
3,3.7863,3.752241,0.179652,0.017818,0.021944,0.009991
4,3.7476,3.717832,0.187901,0.035121,0.023728,0.011848
5,3.7239,3.687949,0.188818,0.012386,0.023732,0.011482
6,3.6901,3.660423,0.185151,0.015594,0.022466,0.010184
7,3.664,3.634884,0.183318,0.015251,0.021918,0.009373
8,3.6434,3.611047,0.181485,0.020231,0.02137,0.008582
9,3.6198,3.588417,0.181485,0.020231,0.02137,0.008582
10,3.5999,3.567006,0.180568,0.019561,0.021096,0.008097


[I 2025-03-15 10:43:40,916] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.000270092496664244, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5653,3.214044,0.179652,0.023548,0.020714,0.007406
2,2.9967,2.733759,0.44088,0.061075,0.106052,0.07588
3,2.5545,2.324817,0.497709,0.162196,0.146949,0.124265
4,2.147,1.991963,0.595784,0.285229,0.223418,0.20722
5,1.8274,1.733214,0.67461,0.32145,0.296882,0.28284
6,1.5336,1.534032,0.702108,0.318003,0.328068,0.308938
7,1.2942,1.406061,0.716774,0.365208,0.345588,0.330761
8,1.1338,1.322978,0.743355,0.40106,0.401448,0.381774
9,0.987,1.24769,0.746104,0.413072,0.400195,0.38579
10,0.8611,1.207762,0.75527,0.450265,0.432089,0.424327


[I 2025-03-15 10:45:31,604] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.00026846937217687056, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.572,3.233528,0.176902,0.003538,0.02,0.006012
2,3.0334,2.799177,0.423465,0.063873,0.097923,0.072537
3,2.6348,2.418626,0.479377,0.138312,0.138124,0.113598
4,2.2507,2.100898,0.571036,0.211709,0.200204,0.178503
5,1.9429,1.834127,0.619615,0.290084,0.246393,0.225341
6,1.6476,1.6274,0.681027,0.334435,0.304472,0.292816
7,1.402,1.483749,0.700275,0.372979,0.327732,0.31298
8,1.2232,1.37495,0.722273,0.388163,0.368156,0.349959
9,1.0662,1.306632,0.740605,0.418329,0.407713,0.385673
10,0.9362,1.258739,0.736939,0.438596,0.403635,0.388351


[I 2025-03-15 10:47:23,913] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.00012302824092650286, 'weight_decay': 0.005, 'adam_beta1': 0.97, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7075,3.505467,0.176902,0.003538,0.02,0.006012
2,3.3811,3.221679,0.253896,0.05451,0.042242,0.035472
3,3.1331,2.982766,0.39505,0.055152,0.084405,0.062279
4,2.8947,2.762021,0.43538,0.064514,0.102611,0.076198
5,2.7088,2.560732,0.460128,0.104242,0.118392,0.088547
6,2.4929,2.380018,0.495875,0.163917,0.147098,0.126869
7,2.3066,2.225447,0.547204,0.197027,0.186542,0.171587
8,2.1616,2.094768,0.582035,0.245911,0.218821,0.206099
9,2.0047,1.966284,0.611366,0.287361,0.246473,0.231195
10,1.8633,1.860889,0.64528,0.324754,0.275698,0.265741


[I 2025-03-15 10:48:18,541] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.0003092173892725669, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5338,3.157379,0.243813,0.036014,0.038673,0.029922
2,2.9172,2.63315,0.446379,0.064466,0.109139,0.077243
3,2.4402,2.208831,0.525206,0.160527,0.159821,0.138363
4,2.0135,1.871538,0.623281,0.315534,0.256655,0.24497
5,1.6781,1.61566,0.691109,0.337499,0.324622,0.305431
6,1.3806,1.425485,0.719523,0.361841,0.355482,0.335627
7,1.1444,1.320124,0.729606,0.369525,0.375064,0.358796
8,0.9905,1.253955,0.746104,0.416426,0.415359,0.39688
9,0.8518,1.187218,0.750687,0.444059,0.423121,0.416298
10,0.733,1.166965,0.76077,0.504934,0.465564,0.464268


[I 2025-03-15 10:50:02,494] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 0.0003090706082000009, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4894,3.088002,0.347388,0.068093,0.070863,0.057688
2,2.8292,2.51915,0.455545,0.104213,0.114052,0.084344
3,2.311,2.071147,0.571952,0.255636,0.200038,0.185815
4,1.8807,1.768602,0.668194,0.341544,0.292178,0.276907
5,1.5629,1.542407,0.714024,0.351093,0.352726,0.332423
6,1.2923,1.370013,0.724106,0.352439,0.362786,0.343873
7,1.0805,1.279246,0.724106,0.343033,0.36293,0.341672
8,0.9372,1.234309,0.749771,0.411174,0.41589,0.393618
9,0.8141,1.168012,0.751604,0.458538,0.429501,0.42113
10,0.6987,1.125639,0.758937,0.470863,0.425773,0.424154


[I 2025-03-15 10:51:52,250] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 3.3046602676148886e-06, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8842,3.862834,0.010082,0.003595,0.021738,0.002132
2,3.8579,3.844604,0.018332,0.004123,0.02232,0.002979
3,3.8465,3.828222,0.039413,0.030304,0.02571,0.006277
4,3.8287,3.812831,0.07791,0.009796,0.029928,0.008533
5,3.8192,3.797659,0.112741,0.00924,0.033685,0.008847
6,3.7994,3.782979,0.147571,0.009884,0.037964,0.00966
7,3.7855,3.768638,0.171402,0.013066,0.021011,0.010097
8,3.7742,3.75553,0.178735,0.015965,0.022011,0.010326
9,3.763,3.743652,0.186068,0.022177,0.023351,0.011354
10,3.7525,3.732893,0.187901,0.023793,0.023728,0.011787


[I 2025-03-15 10:53:40,340] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.0003334709121687755, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4902,3.062112,0.36572,0.065367,0.076712,0.060397
2,2.7781,2.449457,0.483043,0.151907,0.133945,0.112452
3,2.2226,1.976173,0.598533,0.291192,0.232137,0.221737
4,1.7805,1.68055,0.687443,0.305205,0.303414,0.287139
5,1.4608,1.472287,0.72044,0.370137,0.359064,0.339404
6,1.2028,1.315924,0.729606,0.383769,0.373325,0.355785
7,1.001,1.239117,0.732356,0.37667,0.370789,0.355261
8,0.8626,1.193823,0.756187,0.450659,0.437614,0.421388
9,0.7393,1.134481,0.75802,0.462675,0.44921,0.441759
10,0.6294,1.101507,0.764436,0.471946,0.446325,0.446922


[I 2025-03-15 10:56:23,540] Trial 47 finished with value: 0.6033148350383309 and parameters: {'learning_rate': 0.0003334709121687755, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 3}. Best is trial 25 with value: 0.713771496767995.


Trial 48 with params: {'learning_rate': 8.153679865827414e-06, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8704,3.835476,0.028414,0.005631,0.024162,0.004819
2,3.821,3.796159,0.114574,0.01007,0.033712,0.008305
3,3.7908,3.760752,0.177819,0.021158,0.021907,0.010355
4,3.7563,3.73104,0.187901,0.016059,0.023548,0.011072
5,3.7364,3.70585,0.193401,0.018713,0.025192,0.0131
6,3.7073,3.683157,0.188818,0.014024,0.023562,0.011443
7,3.6849,3.661191,0.184235,0.015888,0.022192,0.009795
8,3.667,3.640113,0.179652,0.018551,0.020822,0.00759
9,3.6474,3.620883,0.179652,0.018554,0.020822,0.007594
10,3.6303,3.602739,0.178735,0.016884,0.020548,0.007084


[I 2025-03-15 10:57:18,492] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.00048122908288791095, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3572,2.914744,0.380385,0.077559,0.079426,0.060781
2,2.6538,2.39806,0.459212,0.112354,0.120477,0.095868
3,2.1908,2.004489,0.543538,0.173122,0.187297,0.163363
4,1.7777,1.704744,0.635197,0.253388,0.283047,0.247977
5,1.4703,1.500384,0.669111,0.328182,0.320135,0.28676
6,1.2021,1.342596,0.705775,0.353697,0.354584,0.324434
7,0.9798,1.243715,0.714024,0.345907,0.347833,0.322668
8,0.8208,1.178197,0.731439,0.390248,0.405868,0.383413
9,0.6861,1.14783,0.724106,0.387361,0.407827,0.386347
10,0.5794,1.136222,0.738772,0.452949,0.420549,0.406524


[I 2025-03-15 10:59:58,552] Trial 49 finished with value: 0.6657001579407984 and parameters: {'learning_rate': 0.00048122908288791095, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 0}. Best is trial 25 with value: 0.713771496767995.


Trial 50 with params: {'learning_rate': 9.107811820095339e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7142,3.536433,0.176902,0.003538,0.02,0.006012
2,3.429,3.273173,0.304308,0.073363,0.058041,0.051202
3,3.1985,3.047832,0.407883,0.074066,0.087804,0.064932
4,2.9744,2.842883,0.43538,0.088358,0.101605,0.079107
5,2.8039,2.65544,0.464711,0.10447,0.118435,0.094138
6,2.6104,2.494031,0.496792,0.122075,0.136841,0.111744
7,2.4471,2.360298,0.52154,0.185666,0.161207,0.145845
8,2.3232,2.23844,0.585701,0.261894,0.211301,0.199376
9,2.1862,2.123917,0.591201,0.261217,0.2144,0.20338
10,2.0655,2.032358,0.635197,0.311936,0.256433,0.250014


[I 2025-03-15 11:01:47,042] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.0004834730103326943, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.369,2.861857,0.4033,0.070218,0.091584,0.070947
2,2.5388,2.198837,0.544455,0.173113,0.185508,0.164532
3,1.9209,1.714238,0.652612,0.291153,0.271372,0.256125
4,1.4545,1.443116,0.698442,0.289624,0.326541,0.296017
5,1.1238,1.286849,0.735105,0.376348,0.383276,0.358556
6,0.8866,1.158201,0.748854,0.474337,0.417139,0.413708
7,0.7107,1.125081,0.747021,0.460633,0.424482,0.422232
8,0.5935,1.06547,0.764436,0.469753,0.451588,0.447138
9,0.4787,1.033097,0.768103,0.498372,0.476462,0.474471
10,0.3945,1.026721,0.769019,0.466678,0.475832,0.466241


[I 2025-03-15 11:04:27,844] Trial 51 finished with value: 0.7084684707060201 and parameters: {'learning_rate': 0.0004834730103326943, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1}. Best is trial 25 with value: 0.713771496767995.


Trial 52 with params: {'learning_rate': 0.00039887994036349827, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3949,2.955765,0.384968,0.057676,0.080665,0.06065
2,2.6739,2.365986,0.493126,0.137401,0.13847,0.118278
3,2.1345,1.924014,0.588451,0.250127,0.216575,0.199654
4,1.6951,1.617447,0.68011,0.314709,0.303873,0.281029
5,1.3739,1.424857,0.707608,0.344543,0.345095,0.323087
6,1.1134,1.256131,0.732356,0.377087,0.368546,0.348366
7,0.899,1.18564,0.744271,0.41857,0.401823,0.390949
8,0.7544,1.128993,0.76077,0.447812,0.439078,0.428191
9,0.6311,1.073737,0.769936,0.489256,0.464884,0.461887
10,0.528,1.069554,0.772686,0.486316,0.483673,0.475507


[I 2025-03-15 11:07:11,058] Trial 52 finished with value: 0.6549264952515926 and parameters: {'learning_rate': 0.00039887994036349827, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}. Best is trial 25 with value: 0.713771496767995.


Trial 53 with params: {'learning_rate': 0.0002928943933735547, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.484,3.11115,0.332722,0.068052,0.066877,0.052022
2,2.8683,2.576427,0.447296,0.062754,0.110333,0.078479
3,2.3873,2.161392,0.541705,0.183576,0.169522,0.14978
4,1.9788,1.853513,0.627864,0.275747,0.246069,0.231218
5,1.667,1.609065,0.700275,0.346683,0.337253,0.319111
6,1.3857,1.423308,0.724106,0.344229,0.350176,0.331828
7,1.1589,1.317231,0.719523,0.338989,0.350847,0.331712
8,1.0063,1.250302,0.750687,0.401559,0.406404,0.383116
9,0.8744,1.193455,0.748854,0.417354,0.415687,0.402902
10,0.7528,1.151004,0.75802,0.446563,0.417804,0.412619


[I 2025-03-15 11:09:57,464] Trial 53 finished with value: 0.5752588429704385 and parameters: {'learning_rate': 0.0002928943933735547, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 0}. Best is trial 25 with value: 0.713771496767995.


Trial 54 with params: {'learning_rate': 0.00035364062130837363, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4517,3.015185,0.387718,0.062319,0.082513,0.06439
2,2.7334,2.408134,0.496792,0.161183,0.140126,0.121293
3,2.1782,1.941123,0.592117,0.288665,0.223054,0.211805
4,1.7339,1.653352,0.688359,0.336346,0.32197,0.296584
5,1.4125,1.449905,0.72044,0.360553,0.360047,0.337106
6,1.1566,1.291341,0.730522,0.399144,0.376377,0.360926
7,0.9544,1.217312,0.739688,0.402603,0.390184,0.374349
8,0.8192,1.172167,0.756187,0.438635,0.435674,0.419814
9,0.6976,1.124901,0.76077,0.479875,0.457959,0.454604
10,0.5909,1.091669,0.767186,0.47842,0.459984,0.4563


[I 2025-03-15 11:11:51,391] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 0.00047233886380038565, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3791,2.864584,0.413382,0.072201,0.092149,0.070052
2,2.537,2.187127,0.549954,0.216462,0.193287,0.178052
3,1.9119,1.708606,0.657195,0.281888,0.283391,0.267012
4,1.4506,1.438772,0.710357,0.33411,0.343002,0.318887
5,1.1265,1.301271,0.735105,0.382255,0.39917,0.373238
6,0.8951,1.179702,0.745188,0.432423,0.402893,0.393187
7,0.7203,1.136972,0.748854,0.479599,0.43005,0.431893
8,0.6024,1.088824,0.761687,0.461792,0.454568,0.444536
9,0.4859,1.050336,0.769019,0.49566,0.483888,0.479549
10,0.4,1.041816,0.769019,0.531984,0.4835,0.491223


[I 2025-03-15 11:14:35,634] Trial 55 finished with value: 0.6801901055477038 and parameters: {'learning_rate': 0.00047233886380038565, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 2}. Best is trial 25 with value: 0.713771496767995.


Trial 56 with params: {'learning_rate': 0.0004469265177335313, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4069,2.930498,0.394134,0.056332,0.084279,0.063051
2,2.6227,2.293921,0.514207,0.160987,0.15812,0.141506
3,2.0331,1.823972,0.603116,0.249364,0.237096,0.219629
4,1.5703,1.530018,0.692942,0.313555,0.322719,0.293859
5,1.2421,1.341606,0.722273,0.369722,0.375273,0.353406
6,0.9828,1.219474,0.744271,0.452724,0.402559,0.394954
7,0.7944,1.164233,0.744271,0.477247,0.430299,0.426788
8,0.6683,1.108343,0.76352,0.477189,0.452408,0.450872
9,0.5475,1.073026,0.76077,0.498157,0.479913,0.476481
10,0.4525,1.069025,0.769019,0.470225,0.480435,0.465463


[I 2025-03-15 11:17:23,023] Trial 56 finished with value: 0.669096956577762 and parameters: {'learning_rate': 0.0004469265177335313, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2}. Best is trial 25 with value: 0.713771496767995.


Trial 57 with params: {'learning_rate': 0.00011590846097366166, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6956,3.494214,0.176902,0.003538,0.02,0.006012
2,3.3679,3.197479,0.336389,0.067889,0.068009,0.055456
3,3.1069,2.94398,0.412466,0.071563,0.089932,0.065472
4,2.8535,2.709832,0.449129,0.102978,0.108624,0.081199
5,2.6548,2.498431,0.486709,0.121235,0.132121,0.107027
6,2.4328,2.319435,0.537122,0.225758,0.170258,0.156071
7,2.2476,2.168369,0.570119,0.227944,0.199877,0.187
8,2.1051,2.039421,0.609533,0.292241,0.240557,0.227617
9,1.9551,1.924816,0.649863,0.319565,0.275214,0.267304
10,1.8225,1.824992,0.670027,0.351439,0.286024,0.277963


[I 2025-03-15 11:19:10,438] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.0004914006445390331, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3656,2.839089,0.418882,0.070999,0.09516,0.072715
2,2.5033,2.15017,0.55912,0.219634,0.203513,0.187443
3,1.8686,1.671841,0.661778,0.292147,0.2886,0.270923
4,1.4076,1.407047,0.711274,0.332893,0.331176,0.311515
5,1.088,1.280976,0.736939,0.40411,0.404992,0.380806
6,0.863,1.166589,0.741522,0.4371,0.401828,0.394801
7,0.6886,1.12312,0.752521,0.484075,0.436432,0.437656
8,0.5756,1.074963,0.762603,0.472172,0.45192,0.447358
9,0.4558,1.04512,0.776352,0.518741,0.498317,0.494165
10,0.3754,1.037243,0.768103,0.529697,0.476034,0.485881


[I 2025-03-15 11:21:57,363] Trial 58 finished with value: 0.6812307962500159 and parameters: {'learning_rate': 0.0004914006445390331, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 2}. Best is trial 25 with value: 0.713771496767995.


Trial 59 with params: {'learning_rate': 0.00039949817551036056, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.442,3.002767,0.36297,0.064639,0.075035,0.058433
2,2.7111,2.399761,0.476627,0.144013,0.129614,0.104316
3,2.1572,1.9356,0.581118,0.242791,0.209735,0.194341
4,1.7045,1.630519,0.676444,0.312064,0.303557,0.286388
5,1.3713,1.406162,0.713107,0.383137,0.348552,0.329649
6,1.0951,1.255681,0.734189,0.40038,0.37559,0.363731
7,0.8845,1.183387,0.741522,0.416288,0.411393,0.396639
8,0.7469,1.142109,0.757104,0.460805,0.453001,0.444008
9,0.6245,1.091348,0.75802,0.496621,0.475193,0.473621
10,0.5214,1.086735,0.76352,0.46885,0.473215,0.461521


[I 2025-03-15 11:23:45,052] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.0004116756042903734, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4291,2.969216,0.386801,0.060523,0.082521,0.063287
2,2.67,2.343322,0.508708,0.167556,0.153097,0.136394
3,2.0899,1.865589,0.603116,0.251531,0.229937,0.215786
4,1.6279,1.571006,0.689276,0.331803,0.326744,0.298787
5,1.2935,1.378654,0.72319,0.350822,0.365323,0.340278
6,1.0332,1.232098,0.740605,0.409552,0.3927,0.378027
7,0.8387,1.174715,0.750687,0.454435,0.425334,0.417051
8,0.7096,1.121793,0.762603,0.455974,0.456346,0.445931
9,0.5934,1.073797,0.769936,0.495983,0.483221,0.477557
10,0.4916,1.077328,0.764436,0.468425,0.4818,0.469029


[I 2025-03-15 11:26:33,448] Trial 60 finished with value: 0.652094443743539 and parameters: {'learning_rate': 0.0004116756042903734, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 2}. Best is trial 25 with value: 0.713771496767995.


Trial 61 with params: {'learning_rate': 5.2665021348115615e-05, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7897,3.662167,0.180568,0.019554,0.021096,0.008087
2,3.5969,3.496139,0.179652,0.023548,0.020822,0.007605
3,3.4493,3.339842,0.258478,0.073419,0.043838,0.038293
4,3.2972,3.196322,0.380385,0.066072,0.080003,0.063746
5,3.1829,3.067702,0.410632,0.096582,0.089142,0.068519
6,3.0465,2.949612,0.432631,0.0895,0.100648,0.079728
7,2.9296,2.845383,0.442713,0.085636,0.105862,0.083764
8,2.8386,2.746836,0.462878,0.104347,0.11683,0.092684
9,2.7369,2.655128,0.472961,0.103823,0.122671,0.096759
10,2.6501,2.573918,0.482126,0.102708,0.127921,0.101672


[I 2025-03-15 11:27:27,571] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.00047095570456479925, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3758,2.8543,0.417965,0.071747,0.094663,0.072529
2,2.5233,2.170348,0.55912,0.218082,0.201488,0.185668
3,1.8944,1.693023,0.659945,0.304668,0.289229,0.274904
4,1.4323,1.421276,0.707608,0.328292,0.322277,0.306825
5,1.1187,1.299342,0.738772,0.391791,0.396256,0.370805
6,0.8905,1.184991,0.741522,0.429955,0.400688,0.391347
7,0.7196,1.128783,0.747938,0.471472,0.433114,0.43285
8,0.6026,1.090915,0.766269,0.468813,0.462279,0.452417
9,0.4833,1.039922,0.777269,0.518075,0.495817,0.4954
10,0.3959,1.033286,0.771769,0.534079,0.481058,0.49071


[I 2025-03-15 11:30:17,321] Trial 62 finished with value: 0.6805202979398901 and parameters: {'learning_rate': 0.00047095570456479925, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 2}. Best is trial 25 with value: 0.713771496767995.


Trial 63 with params: {'learning_rate': 0.00046468365490173844, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3804,2.862822,0.417049,0.071643,0.094448,0.072352
2,2.5349,2.183012,0.554537,0.220558,0.196845,0.182979
3,1.9091,1.705838,0.657195,0.31878,0.287225,0.273388
4,1.4477,1.432134,0.704858,0.32381,0.319172,0.302125
5,1.1327,1.307429,0.734189,0.376053,0.387236,0.359178
6,0.9026,1.18999,0.742438,0.419966,0.400794,0.390974
7,0.7303,1.134468,0.747938,0.452014,0.430361,0.428153
8,0.6125,1.091539,0.766269,0.469102,0.460802,0.451696
9,0.4939,1.043919,0.774519,0.522432,0.488546,0.489914
10,0.4051,1.033424,0.769936,0.528804,0.479153,0.487763


[I 2025-03-15 11:33:06,407] Trial 63 finished with value: 0.6797903358336547 and parameters: {'learning_rate': 0.00046468365490173844, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 2}. Best is trial 25 with value: 0.713771496767995.


Trial 64 with params: {'learning_rate': 1.6488779238415127e-06, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8899,3.873888,0.007333,0.003531,0.021778,0.002002
2,3.8731,3.863904,0.010082,0.003756,0.021738,0.002147
3,3.8692,3.85498,0.014665,0.004337,0.022256,0.002845
4,3.8578,3.846702,0.022915,0.00503,0.023189,0.00386
5,3.854,3.838834,0.029331,0.008993,0.024265,0.005147
6,3.8421,3.831679,0.042163,0.035361,0.026191,0.007329
7,3.8347,3.824736,0.055912,0.035758,0.028096,0.008695
8,3.8296,3.81813,0.060495,0.030959,0.028615,0.008545
9,3.8237,3.811769,0.087076,0.009767,0.030964,0.008995
10,3.8175,3.805781,0.097159,0.009385,0.032104,0.008989


[I 2025-03-15 11:34:55,632] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.0003634999059172566, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4622,3.028636,0.352887,0.062938,0.073004,0.05435
2,2.7408,2.41941,0.512374,0.184093,0.157874,0.140131
3,2.1848,1.948323,0.598533,0.259347,0.230548,0.215589
4,1.7298,1.647001,0.681027,0.310253,0.3065,0.287635
5,1.3976,1.441891,0.716774,0.32907,0.35284,0.327327
6,1.1314,1.281734,0.730522,0.41619,0.372216,0.356785
7,0.9286,1.207245,0.748854,0.4692,0.425335,0.417109
8,0.7951,1.157291,0.759853,0.44943,0.441028,0.428803
9,0.6742,1.105191,0.762603,0.493488,0.472916,0.468836
10,0.5683,1.08912,0.767186,0.462096,0.475126,0.46252


[I 2025-03-15 11:36:45,750] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 8.876292630413755e-05, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7289,3.556752,0.176902,0.003538,0.02,0.006012
2,3.4532,3.305208,0.249313,0.073401,0.041764,0.036654
3,3.2303,3.083795,0.39505,0.056053,0.083983,0.062981
4,3.0121,2.882319,0.430797,0.091044,0.099883,0.078209
5,2.8448,2.699159,0.452796,0.084107,0.112012,0.08562
6,2.6538,2.537355,0.487626,0.123143,0.132122,0.107957
7,2.4913,2.400679,0.510541,0.152738,0.149576,0.131549
8,2.3687,2.278979,0.566453,0.253048,0.196275,0.185555
9,2.2332,2.167153,0.587534,0.28457,0.214568,0.205947
10,2.1132,2.072596,0.614115,0.311682,0.241223,0.235218


[I 2025-03-15 11:37:40,536] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.0004377148075584707, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3467,2.839275,0.425298,0.069718,0.099959,0.077373
2,2.5246,2.179063,0.557287,0.214939,0.194352,0.179648
3,1.9225,1.714153,0.662695,0.284322,0.284363,0.264468
4,1.4775,1.447072,0.700275,0.318125,0.320565,0.300572
5,1.1679,1.306155,0.731439,0.344357,0.364608,0.334655
6,0.9465,1.175856,0.749771,0.42788,0.393121,0.380932
7,0.76,1.125531,0.75527,0.468888,0.431212,0.424421
8,0.6362,1.083332,0.767186,0.485685,0.462419,0.457754
9,0.5214,1.042304,0.768103,0.486381,0.476909,0.473124
10,0.4286,1.008617,0.775435,0.482798,0.477918,0.474812


[I 2025-03-15 11:40:23,545] Trial 67 finished with value: 0.6850486432479292 and parameters: {'learning_rate': 0.0004377148075584707, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 25 with value: 0.713771496767995.


Trial 68 with params: {'learning_rate': 0.00024334986693978525, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5272,3.180024,0.297892,0.071245,0.056266,0.046627
2,2.9569,2.677668,0.442713,0.06274,0.106677,0.076937
3,2.5048,2.271835,0.532539,0.185983,0.165309,0.14881
4,2.1126,1.963127,0.626031,0.290409,0.248335,0.230793
5,1.8143,1.726287,0.689276,0.345892,0.312999,0.301317
6,1.5393,1.538023,0.711274,0.347049,0.335377,0.319839
7,1.3227,1.416404,0.716774,0.328737,0.337407,0.318635
8,1.1708,1.33995,0.736022,0.416125,0.392454,0.372696
9,1.0363,1.263529,0.746104,0.393845,0.396704,0.382614
10,0.9113,1.212863,0.753437,0.439292,0.408694,0.400766


[I 2025-03-15 11:42:14,224] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.0004568208696615515, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3404,2.836654,0.417965,0.069412,0.097472,0.075715
2,2.5235,2.179508,0.554537,0.173029,0.190253,0.1691
3,1.9205,1.714251,0.656279,0.283159,0.274885,0.255483
4,1.4755,1.457816,0.704858,0.304425,0.336298,0.303714
5,1.1595,1.296005,0.734189,0.33197,0.373923,0.342362
6,0.927,1.171884,0.744271,0.443897,0.394732,0.387395
7,0.7431,1.118713,0.75802,0.474563,0.433248,0.429618
8,0.6193,1.073712,0.76352,0.462648,0.450368,0.444452
9,0.505,1.051094,0.762603,0.489315,0.475471,0.470106
10,0.416,1.0169,0.771769,0.463876,0.466905,0.459667


[I 2025-03-15 11:45:01,432] Trial 69 finished with value: 0.7058767245019291 and parameters: {'learning_rate': 0.0004568208696615515, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 0}. Best is trial 25 with value: 0.713771496767995.


Trial 70 with params: {'learning_rate': 1.1328698100804768e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8623,3.818775,0.059578,0.008463,0.027324,0.006749
2,3.7986,3.764533,0.170486,0.008203,0.020307,0.008707
3,3.7559,3.717805,0.188818,0.017166,0.023822,0.011553
4,3.7116,3.678671,0.185151,0.013607,0.022466,0.010068
5,3.684,3.642524,0.186068,0.016097,0.02274,0.010592
6,3.6432,3.608618,0.181485,0.020231,0.02137,0.008582
7,3.6111,3.576872,0.181485,0.020231,0.02137,0.008582
8,3.5849,3.546365,0.180568,0.023558,0.021096,0.008119
9,3.555,3.517286,0.180568,0.023558,0.021096,0.008119
10,3.531,3.490431,0.180568,0.023558,0.021096,0.008119


[I 2025-03-15 11:46:52,832] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 9.262456188329795e-05, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7142,3.539336,0.176902,0.003538,0.02,0.006012
2,3.4339,3.283654,0.267644,0.072798,0.047023,0.041189
3,3.2086,3.062359,0.40055,0.055558,0.085719,0.063914
4,2.9893,2.859336,0.436297,0.07077,0.101387,0.078108
5,2.8203,2.674308,0.450962,0.083364,0.110814,0.082061


[I 2025-03-15 11:47:18,886] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.00038768310345292965, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3871,2.915336,0.409716,0.074075,0.091096,0.069985
2,2.6196,2.281554,0.539872,0.200317,0.176798,0.165078
3,2.0417,1.818793,0.641613,0.289496,0.270814,0.257357
4,1.6009,1.536779,0.699358,0.332667,0.314596,0.298794
5,1.2885,1.370311,0.724106,0.33233,0.35692,0.329426
6,1.0555,1.2305,0.747021,0.428875,0.385229,0.372962
7,0.8623,1.165143,0.749771,0.451837,0.406814,0.399504
8,0.73,1.123685,0.761687,0.471871,0.447642,0.442903
9,0.6086,1.070622,0.770852,0.50045,0.469464,0.468154
10,0.5084,1.042529,0.770852,0.499573,0.467136,0.46696


[I 2025-03-15 11:50:05,465] Trial 72 finished with value: 0.6566034200505996 and parameters: {'learning_rate': 0.00038768310345292965, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 25 with value: 0.713771496767995.


Trial 73 with params: {'learning_rate': 4.818236733162463e-06, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8797,3.853867,0.014665,0.004401,0.022256,0.002861
2,3.8457,3.828573,0.035747,0.030303,0.025295,0.006056
3,3.828,3.805074,0.091659,0.008387,0.031131,0.008063
4,3.8036,3.782104,0.148488,0.010114,0.038067,0.009631
5,3.788,3.75952,0.175985,0.03349,0.021834,0.010713
6,3.7622,3.739798,0.186984,0.021585,0.023625,0.011626
7,3.7442,3.722479,0.186984,0.023718,0.023454,0.011488
8,3.7301,3.706725,0.185151,0.015819,0.022726,0.010476
9,3.7161,3.6924,0.185151,0.011454,0.022466,0.009892
10,3.7029,3.679285,0.184235,0.013604,0.022192,0.009703


[I 2025-03-15 11:50:59,251] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.0003878516193309889, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4287,2.974332,0.392301,0.059899,0.083723,0.063907
2,2.6826,2.351691,0.515124,0.168792,0.15441,0.137279
3,2.1123,1.884209,0.608616,0.260426,0.234864,0.223412
4,1.66,1.597839,0.692026,0.308434,0.321418,0.292101
5,1.334,1.400643,0.718607,0.372516,0.357712,0.337845
6,1.0795,1.247879,0.740605,0.439829,0.389756,0.378361
7,0.8828,1.184591,0.746104,0.446223,0.416416,0.406605
8,0.7505,1.137709,0.764436,0.470321,0.449616,0.441536
9,0.6297,1.100069,0.767186,0.492562,0.479474,0.474116
10,0.5307,1.083221,0.768103,0.480929,0.472236,0.465734


[I 2025-03-15 11:52:48,713] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 0.0002224376724307852, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5537,3.235558,0.218148,0.075963,0.03175,0.024391
2,3.0316,2.780749,0.428048,0.064524,0.10036,0.074315
3,2.6191,2.394619,0.487626,0.140014,0.130567,0.108778
4,2.2448,2.088908,0.578368,0.217201,0.199356,0.184077
5,1.9578,1.845086,0.648029,0.264783,0.260424,0.242358


[I 2025-03-15 11:53:15,945] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 0.00035243578128822174, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4313,3.018752,0.382218,0.06147,0.081186,0.06228
2,2.7518,2.445083,0.458295,0.120981,0.120303,0.095641
3,2.2295,2.006727,0.568286,0.247851,0.195555,0.179453
4,1.7948,1.705004,0.672777,0.30706,0.294135,0.279825
5,1.4797,1.478865,0.708524,0.360333,0.345037,0.324209
6,1.2137,1.317816,0.72594,0.34953,0.36141,0.341914
7,0.9966,1.233377,0.736939,0.397017,0.386884,0.373901
8,0.8509,1.181818,0.756187,0.455207,0.422834,0.407044
9,0.7268,1.123993,0.759853,0.47886,0.450786,0.445924
10,0.6157,1.103249,0.761687,0.468251,0.444797,0.442204


[I 2025-03-15 11:55:58,264] Trial 76 finished with value: 0.6210694420665043 and parameters: {'learning_rate': 0.00035243578128822174, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}. Best is trial 25 with value: 0.713771496767995.


Trial 77 with params: {'learning_rate': 0.00048440127575320155, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3598,2.833707,0.418882,0.068113,0.099314,0.07559
2,2.5018,2.152117,0.55912,0.201734,0.192564,0.176427
3,1.8623,1.658626,0.671861,0.289547,0.29502,0.276196
4,1.3977,1.401109,0.708524,0.318462,0.330447,0.309752
5,1.0859,1.265182,0.735105,0.360064,0.382446,0.353205
6,0.86,1.143666,0.751604,0.467127,0.410234,0.407011
7,0.6913,1.118812,0.75802,0.462879,0.442084,0.436215
8,0.5764,1.071232,0.761687,0.478905,0.462289,0.455338
9,0.4647,1.031641,0.767186,0.492218,0.481113,0.476171
10,0.3761,1.01782,0.771769,0.511779,0.481305,0.484285


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-15 11:58:53,955] Trial 77 finished with value: 0.705212680296082 and parameters: {'learning_rate': 0.00048440127575320155, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}. Best is trial 25 with value: 0.713771496767995.


Trial 78 with params: {'learning_rate': 0.00035816540254459545, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4481,3.007997,0.388634,0.061499,0.08284,0.064167
2,2.7243,2.397425,0.498625,0.161494,0.141626,0.123509
3,2.1658,1.92942,0.595784,0.284055,0.224776,0.213204
4,1.7205,1.642945,0.690192,0.332565,0.323094,0.297491
5,1.3992,1.442545,0.721357,0.364189,0.36138,0.33925
6,1.145,1.284952,0.731439,0.409835,0.37881,0.365584
7,0.9439,1.212046,0.739688,0.401225,0.390184,0.374624
8,0.8092,1.167189,0.75527,0.438745,0.436967,0.420512
9,0.688,1.121387,0.766269,0.491385,0.472206,0.467807
10,0.5823,1.088541,0.768103,0.476954,0.460511,0.456484


[I 2025-03-15 11:59:48,883] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.0004790137680264927, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3178,2.785002,0.434464,0.066547,0.105416,0.078596
2,2.4563,2.105116,0.562786,0.191514,0.196126,0.174823
3,1.8328,1.639565,0.670027,0.282471,0.283311,0.261765
4,1.3833,1.380098,0.707608,0.309086,0.322795,0.301175
5,1.076,1.248396,0.736022,0.332272,0.368183,0.337695
6,0.8558,1.142515,0.757104,0.453545,0.414527,0.40748
7,0.6816,1.10944,0.754354,0.481608,0.450676,0.444621
8,0.5685,1.0712,0.76352,0.491991,0.472392,0.467882
9,0.459,1.032886,0.766269,0.475192,0.486057,0.47559
10,0.3721,1.000563,0.783685,0.493476,0.495842,0.488353


[I 2025-03-15 12:02:29,347] Trial 79 finished with value: 0.713990114113548 and parameters: {'learning_rate': 0.0004790137680264927, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 79 with value: 0.713990114113548.


Trial 80 with params: {'learning_rate': 0.0004972963156919285, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3102,2.775468,0.433547,0.065404,0.104771,0.077661
2,2.4431,2.094748,0.56187,0.179758,0.195725,0.173162
3,1.8129,1.621951,0.672777,0.303566,0.293351,0.270766
4,1.3614,1.368449,0.712191,0.309888,0.327186,0.30476
5,1.0537,1.240427,0.734189,0.355522,0.373588,0.346643
6,0.8321,1.131132,0.754354,0.462294,0.41591,0.411208
7,0.6644,1.100976,0.757104,0.491727,0.462523,0.455176
8,0.5551,1.058402,0.761687,0.489122,0.45832,0.452994
9,0.4473,1.026635,0.770852,0.487594,0.484896,0.479528
10,0.3607,1.005184,0.781852,0.515,0.496099,0.493904


[I 2025-03-15 12:05:09,813] Trial 80 finished with value: 0.7071028515372666 and parameters: {'learning_rate': 0.0004972963156919285, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0}. Best is trial 79 with value: 0.713990114113548.


Trial 81 with params: {'learning_rate': 0.0003753563284820992, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4009,2.945196,0.40055,0.054691,0.086193,0.063322
2,2.6571,2.324746,0.526123,0.19434,0.167555,0.155975
3,2.0933,1.866673,0.633364,0.304345,0.265303,0.255791
4,1.6559,1.586268,0.694775,0.331559,0.32482,0.302363
5,1.3386,1.396671,0.724106,0.364842,0.360576,0.338285
6,1.0943,1.24968,0.745188,0.441165,0.3824,0.373341
7,0.8944,1.181851,0.747938,0.453353,0.407758,0.401119
8,0.7585,1.134325,0.757104,0.454808,0.441833,0.431719
9,0.6374,1.084173,0.765353,0.494854,0.468814,0.467029
10,0.5345,1.058503,0.768103,0.471956,0.456196,0.453216


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-15 12:08:30,870] Trial 81 finished with value: 0.6570111498844378 and parameters: {'learning_rate': 0.0003753563284820992, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0}. Best is trial 79 with value: 0.713990114113548.


Trial 82 with params: {'learning_rate': 0.0004777337176508395, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3226,2.798232,0.429881,0.067213,0.10305,0.077942
2,2.4735,2.125284,0.55912,0.171629,0.193921,0.172018
3,1.8531,1.655551,0.665445,0.309153,0.288088,0.26695
4,1.4023,1.396531,0.700275,0.295548,0.316662,0.291222
5,1.0937,1.267681,0.736939,0.358043,0.375947,0.347697
6,0.8703,1.144303,0.753437,0.45594,0.405131,0.398041
7,0.6938,1.111831,0.752521,0.462877,0.446768,0.434937
8,0.5803,1.068822,0.762603,0.473428,0.454274,0.44557
9,0.4697,1.037023,0.768103,0.483511,0.474278,0.467039
10,0.3804,1.007478,0.777269,0.48298,0.484081,0.478643


[I 2025-03-15 12:11:17,244] Trial 82 finished with value: 0.7143632508757508 and parameters: {'learning_rate': 0.0004777337176508395, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 83 with params: {'learning_rate': 9.578195459423425e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7073,3.522892,0.176902,0.003538,0.02,0.006012
2,3.411,3.249518,0.32264,0.070629,0.063402,0.055684
3,3.1718,3.017059,0.409716,0.053035,0.088143,0.063878
4,2.94,2.805521,0.44088,0.086602,0.103927,0.079668
5,2.7635,2.612105,0.470211,0.104646,0.122101,0.097267
6,2.5637,2.447549,0.505041,0.131305,0.142661,0.118231
7,2.3963,2.311339,0.537122,0.221917,0.170629,0.157134
8,2.2689,2.188558,0.593951,0.26301,0.216173,0.205256
9,2.1293,2.073462,0.60495,0.284222,0.226697,0.218458
10,2.006,1.980968,0.646196,0.291521,0.268441,0.258351


[I 2025-03-15 12:12:10,758] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 0.00048741262994873283, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3249,2.81447,0.417965,0.067871,0.097628,0.07513
2,2.496,2.15665,0.562786,0.184513,0.199188,0.177331
3,1.8852,1.685596,0.655362,0.27538,0.272048,0.250965
4,1.4341,1.426448,0.703941,0.303296,0.330769,0.300393
5,1.1135,1.268262,0.736022,0.38222,0.379284,0.357028
6,0.8789,1.146421,0.752521,0.456053,0.423929,0.418647
7,0.7033,1.116112,0.747938,0.446609,0.430465,0.422981
8,0.5875,1.059765,0.762603,0.468792,0.45331,0.444391
9,0.4748,1.035435,0.765353,0.471296,0.473024,0.463767
10,0.3839,1.030957,0.769019,0.476989,0.486531,0.476884


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
[I 2025-03-15 12:15:28,982] Trial 84 finished with value: 0.7076037019926047 and parameters: {'learning_rate': 0.00048741262994873283, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 85 with params: {'learning_rate': 0.0004705937176505686, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3273,2.806662,0.429881,0.069199,0.10305,0.078877
2,2.4846,2.136112,0.55912,0.193336,0.196585,0.177847
3,1.8683,1.668359,0.663611,0.289213,0.286126,0.264684
4,1.4194,1.410944,0.701192,0.298708,0.318091,0.293208
5,1.1104,1.277828,0.735105,0.335808,0.37053,0.33956
6,0.8858,1.152328,0.751604,0.439246,0.403847,0.394528
7,0.7072,1.117365,0.754354,0.477544,0.447988,0.440487
8,0.5913,1.072498,0.762603,0.472906,0.455199,0.445663
9,0.4789,1.046253,0.769936,0.478672,0.474826,0.468644
10,0.3875,1.011477,0.772686,0.48138,0.478271,0.474387


[I 2025-03-15 12:18:12,014] Trial 85 finished with value: 0.7115847713433859 and parameters: {'learning_rate': 0.0004705937176505686, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 86 with params: {'learning_rate': 0.0003490952656150358, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.431,3.012853,0.388634,0.06082,0.082714,0.063518
2,2.7426,2.429785,0.48121,0.138654,0.13141,0.109238
3,2.2128,1.985369,0.572869,0.239632,0.202182,0.186531
4,1.7765,1.685714,0.675527,0.316067,0.304039,0.285023
5,1.4576,1.469721,0.715857,0.332833,0.350993,0.326069
6,1.1927,1.302827,0.734189,0.397005,0.377866,0.363961
7,0.9803,1.225322,0.745188,0.399676,0.396339,0.379057
8,0.8365,1.170874,0.754354,0.428525,0.423897,0.406578
9,0.7135,1.124763,0.76077,0.479469,0.461783,0.453461
10,0.6059,1.104119,0.762603,0.464854,0.446902,0.443378


[I 2025-03-15 12:20:02,585] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.00046950442262754487, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.324,2.796517,0.430797,0.066748,0.103265,0.077836
2,2.4717,2.120886,0.560037,0.211778,0.198086,0.180193
3,1.8533,1.65635,0.670027,0.289704,0.289737,0.267288
4,1.4047,1.394502,0.703025,0.297763,0.318284,0.29523
5,1.0965,1.262043,0.736022,0.332055,0.368317,0.336713
6,0.8774,1.147212,0.75527,0.450278,0.413461,0.405583
7,0.6983,1.111436,0.759853,0.495129,0.452325,0.446791
8,0.5841,1.072403,0.769019,0.490546,0.473971,0.468843
9,0.4712,1.038799,0.769019,0.472348,0.481126,0.471234
10,0.3829,1.008147,0.777269,0.489461,0.487641,0.482664


[I 2025-03-15 12:22:54,610] Trial 87 finished with value: 0.708224076819617 and parameters: {'learning_rate': 0.00046950442262754487, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 88 with params: {'learning_rate': 0.00040487030930757444, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3725,2.888162,0.415215,0.073933,0.094967,0.074608
2,2.5854,2.244433,0.545371,0.180898,0.177886,0.162759
3,1.998,1.779745,0.644363,0.288013,0.274387,0.259589
4,1.5549,1.503566,0.700275,0.332714,0.316227,0.300302
5,1.2437,1.345172,0.732356,0.364568,0.366232,0.340321
6,1.0138,1.207913,0.747938,0.429198,0.389039,0.377835
7,0.823,1.146963,0.749771,0.432071,0.407228,0.396688
8,0.6937,1.106863,0.76352,0.473054,0.450854,0.445211
9,0.5765,1.055347,0.769936,0.504337,0.474075,0.474317
10,0.479,1.031224,0.772686,0.487189,0.467957,0.466098


[I 2025-03-15 12:24:41,495] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 0.00047242989374394394, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.326,2.804405,0.429881,0.06891,0.10305,0.078777
2,2.4817,2.133165,0.55912,0.193428,0.196585,0.177894
3,1.8643,1.664954,0.664528,0.290247,0.287035,0.26582
4,1.4149,1.407189,0.701192,0.298338,0.318091,0.293151
5,1.1062,1.275681,0.734189,0.335337,0.370042,0.339011
6,0.882,1.150402,0.753437,0.438674,0.404573,0.395149
7,0.7037,1.115736,0.754354,0.476537,0.446014,0.439515
8,0.5885,1.071773,0.762603,0.472353,0.453986,0.444843
9,0.4764,1.044453,0.769936,0.481841,0.474826,0.468972
10,0.3854,1.011081,0.772686,0.481221,0.478271,0.474315


[I 2025-03-15 12:27:33,386] Trial 89 finished with value: 0.7116427173122123 and parameters: {'learning_rate': 0.00047242989374394394, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 90 with params: {'learning_rate': 0.0001441843628958433, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6417,3.393258,0.176902,0.003538,0.02,0.006012
2,3.2404,3.037117,0.3978,0.054922,0.085325,0.063413
3,2.9234,2.726604,0.441797,0.082819,0.104958,0.076868
4,2.6209,2.459427,0.500458,0.151289,0.14153,0.119758
5,2.388,2.232408,0.565536,0.226089,0.190803,0.176855


[I 2025-03-15 12:28:01,652] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.0004642583255396689, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3394,2.841142,0.412466,0.070954,0.094876,0.074422
2,2.5298,2.191645,0.551787,0.173751,0.188649,0.168182
3,1.9297,1.722729,0.647113,0.266863,0.264793,0.245135
4,1.4788,1.453552,0.703025,0.309778,0.326763,0.298307
5,1.1574,1.289407,0.730522,0.330781,0.363364,0.334583
6,0.9205,1.169284,0.753437,0.458951,0.423005,0.415725
7,0.7371,1.120841,0.75527,0.463898,0.436043,0.431088
8,0.6166,1.071696,0.759853,0.455643,0.447392,0.440639
9,0.5028,1.048815,0.759853,0.477876,0.468833,0.459939
10,0.4167,1.026127,0.775435,0.464043,0.474022,0.464703


[I 2025-03-15 12:30:44,306] Trial 91 finished with value: 0.7016015758735259 and parameters: {'learning_rate': 0.0004642583255396689, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 92 with params: {'learning_rate': 0.00044653297878477884, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3475,2.850225,0.416132,0.071476,0.096354,0.075404
2,2.5408,2.198291,0.549954,0.173462,0.188201,0.168284
3,1.9433,1.733757,0.650779,0.263211,0.268853,0.250924
4,1.4974,1.472543,0.702108,0.301238,0.33275,0.300752
5,1.1802,1.307922,0.729606,0.324358,0.368861,0.337181
6,0.947,1.180088,0.751604,0.460874,0.408885,0.405511
7,0.758,1.121903,0.75802,0.481962,0.432449,0.428674
8,0.6329,1.077269,0.769019,0.460393,0.453863,0.446257
9,0.5188,1.050235,0.765353,0.486606,0.474155,0.467847
10,0.4301,1.017237,0.769019,0.470072,0.458267,0.451644


[I 2025-03-15 12:33:29,084] Trial 92 finished with value: 0.7042484657254029 and parameters: {'learning_rate': 0.00044653297878477884, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 93 with params: {'learning_rate': 0.00043535613421081, 'weight_decay': 0.003, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.352,2.853487,0.421632,0.070272,0.098669,0.076626
2,2.5433,2.199224,0.552704,0.18527,0.188077,0.170397
3,1.9455,1.734118,0.654445,0.266721,0.279477,0.258597
4,1.5035,1.475904,0.703941,0.306569,0.334274,0.305952
5,1.1898,1.31378,0.732356,0.329763,0.372191,0.340791
6,0.9595,1.185422,0.748854,0.443184,0.396324,0.388273
7,0.7715,1.124083,0.753437,0.47087,0.426932,0.420445
8,0.6449,1.085651,0.768103,0.481082,0.459249,0.452656
9,0.5303,1.049778,0.767186,0.491288,0.479093,0.474244
10,0.4361,1.01332,0.773602,0.492546,0.476312,0.476718


[I 2025-03-15 12:35:20,187] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.0004558842215169161, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3334,2.813756,0.430797,0.069463,0.102858,0.078927
2,2.4932,2.144271,0.560037,0.210593,0.196523,0.179227
3,1.8827,1.680812,0.666361,0.286069,0.28827,0.2676
4,1.4362,1.417723,0.699358,0.316781,0.319431,0.299506
5,1.1282,1.284216,0.734189,0.332225,0.367654,0.336337
6,0.9107,1.161013,0.752521,0.438197,0.403858,0.394519
7,0.7254,1.12181,0.759853,0.474853,0.446708,0.440433
8,0.607,1.08033,0.768103,0.48935,0.462913,0.458651
9,0.4931,1.047243,0.768103,0.47427,0.479884,0.471035
10,0.4035,1.00578,0.778185,0.491102,0.486567,0.483417


[I 2025-03-15 12:38:00,414] Trial 94 finished with value: 0.7114203169346847 and parameters: {'learning_rate': 0.0004558842215169161, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 95 with params: {'learning_rate': 0.0004019951057688599, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3781,2.902429,0.412466,0.074903,0.093313,0.073333
2,2.6039,2.265015,0.538955,0.178774,0.173472,0.15869
3,2.0234,1.803291,0.640697,0.277153,0.268935,0.251522
4,1.5838,1.533855,0.696609,0.330877,0.328912,0.305578
5,1.2672,1.357666,0.733272,0.349782,0.373829,0.345858
6,1.0305,1.21675,0.749771,0.444921,0.395097,0.38698
7,0.8354,1.151662,0.751604,0.454794,0.414391,0.408358
8,0.704,1.109756,0.762603,0.480048,0.448569,0.445737
9,0.5838,1.058818,0.765353,0.498926,0.467691,0.466736
10,0.4859,1.029884,0.771769,0.488249,0.468394,0.466648


[I 2025-03-15 12:39:58,960] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.0002768236517832991, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5169,3.138341,0.311641,0.070356,0.060025,0.048852
2,2.8917,2.590642,0.453712,0.081776,0.112773,0.081386
3,2.3975,2.157327,0.560953,0.23814,0.18618,0.171105
4,1.9818,1.845193,0.656279,0.298529,0.277381,0.263173
5,1.667,1.615433,0.707608,0.356967,0.342363,0.327255
6,1.3938,1.433415,0.715857,0.343283,0.341711,0.323314
7,1.1813,1.334191,0.72319,0.349516,0.353494,0.336844
8,1.0344,1.27982,0.742438,0.381192,0.403538,0.376167
9,0.9073,1.206099,0.747938,0.411168,0.409922,0.398091
10,0.7881,1.158741,0.757104,0.485124,0.42173,0.420705


[I 2025-03-15 12:41:44,831] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.0003121423766556698, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4571,3.048447,0.381302,0.061191,0.080688,0.062555
2,2.7877,2.473094,0.480293,0.137784,0.129172,0.105765
3,2.2649,2.027138,0.585701,0.279752,0.223854,0.213406
4,1.8383,1.729682,0.678277,0.341301,0.302217,0.285955
5,1.5278,1.513347,0.713107,0.333014,0.344993,0.323681
6,1.2701,1.357136,0.722273,0.36485,0.361155,0.342229
7,1.067,1.269866,0.729606,0.373721,0.369742,0.351096
8,0.9241,1.219948,0.750687,0.429872,0.416393,0.399463
9,0.7991,1.152969,0.754354,0.489824,0.428747,0.426355
10,0.684,1.110487,0.761687,0.470753,0.425716,0.42662


[I 2025-03-15 12:43:33,365] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.00020799099595853834, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5678,3.258699,0.205316,0.056533,0.028027,0.019281
2,3.0608,2.815453,0.421632,0.066742,0.097369,0.073137
3,2.6593,2.436573,0.480293,0.139365,0.126757,0.104407
4,2.2948,2.133754,0.572869,0.222372,0.196743,0.180757
5,2.0147,1.894134,0.63978,0.287143,0.254482,0.238017


[I 2025-03-15 12:44:03,720] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.00030690900468464494, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4886,3.082783,0.358387,0.067767,0.073882,0.060385
2,2.822,2.508441,0.462878,0.115407,0.117495,0.089829
3,2.2992,2.058732,0.572869,0.273911,0.203072,0.188941
4,1.8716,1.758078,0.667278,0.319694,0.291644,0.276104
5,1.5563,1.537537,0.71494,0.347039,0.353978,0.333941
6,1.2899,1.368103,0.72594,0.360619,0.364462,0.347372
7,1.0828,1.280535,0.72594,0.347266,0.363344,0.342208
8,0.9412,1.234635,0.749771,0.405506,0.418038,0.393501
9,0.8183,1.166609,0.754354,0.447338,0.422637,0.414985
10,0.7029,1.124783,0.75527,0.472975,0.419082,0.41883


[I 2025-03-15 12:45:53,831] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.0003973352855901667, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.382,2.909615,0.410632,0.074585,0.092022,0.071664
2,2.6128,2.274921,0.538955,0.179491,0.173472,0.158958
3,2.0351,1.813856,0.638863,0.276099,0.265482,0.249104
4,1.596,1.542432,0.697525,0.334698,0.329275,0.306208
5,1.279,1.364062,0.731439,0.347723,0.372041,0.34415
6,1.041,1.222071,0.747938,0.441695,0.390609,0.380972
7,0.8451,1.155713,0.751604,0.454539,0.414391,0.408147
8,0.7129,1.112671,0.761687,0.469578,0.448466,0.442732
9,0.5918,1.060912,0.768103,0.49644,0.468575,0.46678
10,0.493,1.032474,0.770852,0.488506,0.466478,0.4662


[I 2025-03-15 12:48:39,483] Trial 100 finished with value: 0.6655710656038671 and parameters: {'learning_rate': 0.0003973352855901667, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 101 with params: {'learning_rate': 0.00033577375407781135, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4344,3.005298,0.395967,0.056541,0.08488,0.063615
2,2.7325,2.41033,0.494959,0.169261,0.137051,0.115079
3,2.1917,1.957805,0.606783,0.287882,0.242998,0.233485
4,1.7605,1.667919,0.687443,0.338323,0.310491,0.293562
5,1.4498,1.464416,0.718607,0.340816,0.352977,0.330382
6,1.1996,1.313282,0.72594,0.386161,0.366891,0.352002
7,0.999,1.235761,0.736939,0.398725,0.379929,0.363529
8,0.8583,1.181113,0.75527,0.427092,0.424803,0.408445
9,0.7323,1.129041,0.753437,0.469494,0.433123,0.427237
10,0.6241,1.091623,0.766269,0.486332,0.445278,0.450458


[I 2025-03-15 12:50:27,717] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.00027497643387713064, 'weight_decay': 0.004, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4967,3.126118,0.331806,0.068579,0.066831,0.053061
2,2.8865,2.592237,0.449129,0.082566,0.111002,0.079272
3,2.4054,2.17068,0.550871,0.228511,0.178202,0.163302
4,1.9954,1.860813,0.650779,0.279835,0.26766,0.25062
5,1.686,1.623758,0.706691,0.321679,0.333349,0.31166
6,1.4112,1.445042,0.719523,0.340446,0.349996,0.328294
7,1.1952,1.337676,0.72044,0.346097,0.350675,0.333755
8,1.0458,1.277403,0.747021,0.394715,0.404764,0.378471
9,0.9186,1.211984,0.751604,0.430403,0.419228,0.405291
10,0.7977,1.15789,0.753437,0.485583,0.425654,0.425438


[I 2025-03-15 12:52:13,078] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 1.4169288463186063e-05, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8604,3.809244,0.084326,0.008823,0.030302,0.007778
2,3.7837,3.741844,0.181485,0.010079,0.022061,0.009642
3,3.7315,3.688556,0.186068,0.012077,0.02291,0.010564
4,3.6795,3.641033,0.183318,0.01436,0.021918,0.009344
5,3.645,3.595852,0.183318,0.021071,0.021918,0.009516
6,3.5957,3.553714,0.180568,0.023558,0.021096,0.008119
7,3.5552,3.51359,0.180568,0.023558,0.021096,0.008119
8,3.5227,3.476527,0.181485,0.023561,0.02137,0.00861
9,3.4866,3.441344,0.192484,0.063611,0.024563,0.013832
10,3.4573,3.408248,0.208983,0.083687,0.02944,0.020702


[I 2025-03-15 12:53:07,343] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.0004722321097158757, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4128,2.901514,0.40055,0.075729,0.08752,0.06609
2,2.5673,2.208069,0.533456,0.186301,0.171446,0.155301
3,1.9308,1.714222,0.650779,0.293817,0.281854,0.262214
4,1.462,1.449397,0.701192,0.325876,0.331309,0.306138
5,1.1369,1.31062,0.734189,0.430456,0.408886,0.38772
6,0.8963,1.171513,0.750687,0.47708,0.411005,0.407285
7,0.7195,1.14284,0.751604,0.478884,0.445018,0.44558
8,0.6,1.100064,0.75527,0.473989,0.460569,0.450417
9,0.4781,1.051618,0.762603,0.498562,0.480897,0.476387
10,0.3921,1.047729,0.76352,0.491424,0.479433,0.478053


[I 2025-03-15 12:55:54,410] Trial 104 finished with value: 0.687537608681543 and parameters: {'learning_rate': 0.0004722321097158757, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 4}. Best is trial 82 with value: 0.7143632508757508.


Trial 105 with params: {'learning_rate': 1.4771448129559617e-06, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8902,3.87485,0.007333,0.003625,0.021778,0.002019
2,3.8744,3.865748,0.009166,0.003681,0.021634,0.002025
3,3.8713,3.857579,0.013749,0.004389,0.022153,0.002734
4,3.8606,3.850141,0.016499,0.004503,0.022463,0.003121
5,3.8574,3.843163,0.021082,0.004228,0.022631,0.003255
6,3.8463,3.836663,0.029331,0.005097,0.023914,0.00445
7,3.8396,3.830513,0.036664,0.030221,0.025399,0.006073
8,3.8352,3.824734,0.051329,0.030935,0.027057,0.007247
9,3.83,3.819167,0.055912,0.009334,0.02726,0.006915
10,3.8244,3.81395,0.076077,0.00878,0.02937,0.007855


[I 2025-03-15 12:56:46,485] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 0.0004422979949673411, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3433,2.83273,0.425298,0.068927,0.099959,0.077034
2,2.5165,2.169928,0.55637,0.21317,0.194114,0.178849
3,1.9124,1.705527,0.664528,0.286557,0.286007,0.266318
4,1.4672,1.439922,0.700275,0.314613,0.320292,0.300363
5,1.1582,1.30111,0.731439,0.33301,0.364906,0.333941
6,0.9382,1.172266,0.751604,0.43523,0.396082,0.385796
7,0.7517,1.122827,0.757104,0.470741,0.436181,0.430889
8,0.6292,1.081494,0.769019,0.489523,0.462829,0.458259
9,0.5147,1.041723,0.769019,0.489476,0.477272,0.474438
10,0.4227,1.005922,0.775435,0.487336,0.483437,0.480676


[I 2025-03-15 12:59:33,186] Trial 106 finished with value: 0.692708547304787 and parameters: {'learning_rate': 0.0004422979949673411, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 107 with params: {'learning_rate': 0.0004980467405049822, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3055,2.76191,0.436297,0.064144,0.106276,0.077407
2,2.4256,2.073035,0.566453,0.209299,0.199378,0.177179
3,1.7927,1.606267,0.676444,0.279732,0.288879,0.267591
4,1.3447,1.356084,0.714024,0.329054,0.33041,0.312055
5,1.0391,1.225432,0.735105,0.331365,0.368682,0.338287
6,0.8192,1.131178,0.75527,0.449852,0.412132,0.405395
7,0.6528,1.090843,0.75802,0.495028,0.461904,0.455904
8,0.5442,1.056312,0.767186,0.482825,0.469666,0.466106
9,0.4345,1.015088,0.771769,0.511113,0.498309,0.49244
10,0.3475,0.991901,0.780935,0.508422,0.496431,0.491619


[I 2025-03-15 13:02:24,172] Trial 107 finished with value: 0.6956523753244204 and parameters: {'learning_rate': 0.0004980467405049822, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 108 with params: {'learning_rate': 1.5293018718400694e-05, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8581,3.803776,0.099908,0.007981,0.032064,0.008032
2,3.7763,3.731242,0.184235,0.011622,0.022543,0.010142
3,3.7205,3.674609,0.184235,0.013601,0.022192,0.009698
4,3.6648,3.62447,0.186068,0.016097,0.02274,0.010592
5,3.6274,3.574342,0.180568,0.019561,0.021096,0.008097


[I 2025-03-15 13:02:54,350] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.00016595892826902088, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6364,3.373672,0.176902,0.003538,0.02,0.006012
2,3.2125,3.007541,0.394134,0.057122,0.084418,0.063683
3,2.8827,2.683587,0.457379,0.081345,0.11557,0.085452
4,2.561,2.400956,0.506874,0.141377,0.147972,0.125969
5,2.3103,2.15963,0.569203,0.218145,0.193843,0.177672
6,2.0505,1.967952,0.603116,0.273736,0.227649,0.214286
7,1.8419,1.815635,0.653529,0.318187,0.273342,0.261621
8,1.683,1.68745,0.694775,0.369431,0.325625,0.31541
9,1.5199,1.575008,0.699358,0.349163,0.32474,0.309741
10,1.3798,1.491096,0.71494,0.373637,0.349232,0.335777


[I 2025-03-15 13:04:39,065] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.0004264632669406636, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3623,2.877786,0.412466,0.074439,0.09418,0.074381
2,2.5746,2.234995,0.543538,0.176492,0.183563,0.16629
3,1.9879,1.77244,0.63978,0.267605,0.260992,0.245006
4,1.5427,1.505696,0.694775,0.302143,0.325043,0.292802
5,1.2241,1.33342,0.726856,0.34531,0.366846,0.339091
6,0.9872,1.194474,0.749771,0.449986,0.393907,0.385726
7,0.7927,1.133364,0.750687,0.440156,0.415263,0.407005
8,0.6627,1.094494,0.769936,0.489651,0.459098,0.456536
9,0.5456,1.049623,0.767186,0.488607,0.474648,0.469393
10,0.4515,1.021571,0.768103,0.469008,0.45846,0.452554


[I 2025-03-15 13:05:36,252] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 0.00035593962840501514, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4247,3.001115,0.393217,0.059948,0.08379,0.064168
2,2.7273,2.413249,0.490376,0.177948,0.137841,0.11864
3,2.1928,1.966419,0.577452,0.249556,0.210863,0.195603
4,1.755,1.66657,0.678277,0.335364,0.306073,0.287383
5,1.4348,1.453949,0.71494,0.329655,0.350719,0.323923
6,1.1713,1.289436,0.734189,0.389831,0.377724,0.362241
7,0.9604,1.215231,0.747021,0.431154,0.406333,0.394977
8,0.8176,1.159761,0.758937,0.44662,0.430385,0.416787
9,0.6958,1.116768,0.762603,0.490818,0.46584,0.460388
10,0.5897,1.098527,0.76352,0.469907,0.449337,0.447238


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-15 13:08:57,445] Trial 111 finished with value: 0.6319116918647526 and parameters: {'learning_rate': 0.00035593962840501514, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 112 with params: {'learning_rate': 0.0004676508625031947, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3331,2.823088,0.418882,0.068967,0.097985,0.075811
2,2.506,2.160608,0.557287,0.193674,0.193158,0.172319
3,1.897,1.69416,0.660862,0.284402,0.278322,0.258767
4,1.4517,1.442897,0.705775,0.303608,0.336709,0.303954
5,1.1374,1.283657,0.737855,0.364802,0.381661,0.354687
6,0.9038,1.162328,0.747021,0.438902,0.397314,0.389963
7,0.7257,1.115969,0.754354,0.460927,0.432143,0.42683
8,0.6034,1.06958,0.761687,0.463688,0.449096,0.4413
9,0.4903,1.046996,0.764436,0.472166,0.467764,0.458854
10,0.4011,1.013773,0.771769,0.47169,0.47122,0.46561


[I 2025-03-15 13:11:44,873] Trial 112 finished with value: 0.7119012417170726 and parameters: {'learning_rate': 0.0004676508625031947, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 113 with params: {'learning_rate': 0.00048823044269866357, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3119,2.773949,0.43538,0.065291,0.106061,0.078104
2,2.441,2.089547,0.565536,0.212092,0.19793,0.17707
3,1.813,1.623415,0.671861,0.282847,0.285526,0.264839
4,1.3642,1.368361,0.71494,0.332339,0.331485,0.312891
5,1.0577,1.237123,0.736022,0.331167,0.368897,0.338254
6,0.837,1.138285,0.758937,0.454818,0.416316,0.409072
7,0.6674,1.101627,0.754354,0.489505,0.452765,0.447735
8,0.5565,1.060639,0.765353,0.491046,0.473412,0.467539
9,0.4461,1.024054,0.768103,0.481034,0.486821,0.475312
10,0.3598,0.99684,0.778185,0.489724,0.49286,0.485338


[I 2025-03-15 13:14:32,620] Trial 113 finished with value: 0.70118613130093 and parameters: {'learning_rate': 0.00048823044269866357, 'weight_decay': 0.003, 'adam_beta1': 0.9, 'warmup_steps': 0}. Best is trial 82 with value: 0.7143632508757508.


Trial 114 with params: {'learning_rate': 0.0004693590307705194, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3972,2.885262,0.407883,0.073106,0.093269,0.07277
2,2.5506,2.193174,0.530706,0.176215,0.168496,0.151482
3,1.9175,1.695232,0.666361,0.28382,0.288779,0.267697
4,1.4495,1.43257,0.702108,0.314376,0.321409,0.298244
5,1.1249,1.288111,0.731439,0.374735,0.382385,0.357238
6,0.8903,1.163637,0.747021,0.431751,0.401573,0.396215
7,0.7117,1.116448,0.75527,0.467282,0.458743,0.449249
8,0.5967,1.080504,0.765353,0.471542,0.476555,0.463785
9,0.4843,1.036642,0.770852,0.491899,0.483324,0.479738
10,0.3928,1.025573,0.76352,0.461571,0.464228,0.454569


[I 2025-03-15 13:15:24,864] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.00037282715336766075, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.447,3.023668,0.36022,0.065634,0.073944,0.058428
2,2.7456,2.434915,0.468378,0.151214,0.126836,0.101534
3,2.2076,1.986398,0.571952,0.245983,0.200101,0.185597
4,1.7635,1.682265,0.669111,0.302015,0.292017,0.27421
5,1.4401,1.453401,0.711274,0.350769,0.352042,0.329439
6,1.1639,1.291748,0.729606,0.38446,0.368132,0.352599
7,0.9457,1.21373,0.743355,0.411057,0.397661,0.381413
8,0.804,1.166164,0.76077,0.463811,0.442089,0.43167
9,0.6791,1.117752,0.764436,0.493761,0.469643,0.464917
10,0.5719,1.102473,0.768103,0.506589,0.476726,0.475944


[I 2025-03-15 13:18:14,386] Trial 115 finished with value: 0.6042338226625042 and parameters: {'learning_rate': 0.00037282715336766075, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}. Best is trial 82 with value: 0.7143632508757508.


Trial 116 with params: {'learning_rate': 1.274731960791765e-06, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8907,3.876062,0.008249,0.004092,0.022233,0.002443
2,3.8762,3.868097,0.009166,0.003932,0.021634,0.002036
3,3.8742,3.860952,0.010082,0.003545,0.021738,0.002126
4,3.8643,3.854377,0.014665,0.004199,0.022256,0.002804
5,3.862,3.848276,0.020165,0.004674,0.022878,0.003519
6,3.8519,3.842513,0.026581,0.008922,0.023954,0.004898
7,3.8457,3.83706,0.029331,0.008819,0.024265,0.005087
8,3.8421,3.831947,0.038497,0.027794,0.025787,0.006461
9,3.8375,3.826997,0.050412,0.035958,0.027475,0.008263
10,3.8325,3.822375,0.057745,0.032754,0.028304,0.00866


[I 2025-03-15 13:19:09,620] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 0.00041782570595644595, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4073,2.931126,0.396884,0.076443,0.085763,0.064335
2,2.6275,2.288162,0.52429,0.163787,0.158379,0.139147
3,2.0393,1.818049,0.637947,0.269939,0.254892,0.240076
4,1.5824,1.541424,0.693859,0.297561,0.320352,0.292265
5,1.2561,1.361193,0.729606,0.36308,0.372554,0.346358
6,1.0104,1.215136,0.745188,0.458967,0.405063,0.397058
7,0.8222,1.161806,0.748854,0.455614,0.41775,0.410928
8,0.6938,1.113319,0.762603,0.473405,0.451954,0.44519
9,0.5745,1.070449,0.76352,0.49604,0.478682,0.475834
10,0.4799,1.055263,0.767186,0.471885,0.467444,0.460793


[I 2025-03-15 13:21:54,933] Trial 117 finished with value: 0.6879438343160191 and parameters: {'learning_rate': 0.00041782570595644595, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 1}. Best is trial 82 with value: 0.7143632508757508.


Trial 118 with params: {'learning_rate': 0.0004762367229644209, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3318,2.826898,0.416132,0.06868,0.096745,0.074965
2,2.5119,2.172861,0.557287,0.18106,0.194306,0.173241
3,1.9065,1.703373,0.651696,0.257988,0.268859,0.247134
4,1.4558,1.437585,0.703941,0.301886,0.328543,0.298427
5,1.1346,1.27718,0.736022,0.37212,0.379308,0.356024
6,0.8994,1.158073,0.75802,0.466233,0.427509,0.421307
7,0.7193,1.120244,0.751604,0.467244,0.432564,0.427266
8,0.6016,1.069207,0.76077,0.460692,0.444485,0.43637
9,0.4886,1.039061,0.76352,0.482781,0.471679,0.464435
10,0.3995,1.024606,0.766269,0.473203,0.474558,0.467497


[I 2025-03-15 13:24:43,083] Trial 118 finished with value: 0.7156145737360871 and parameters: {'learning_rate': 0.0004762367229644209, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 0}. Best is trial 118 with value: 0.7156145737360871.


Trial 119 with params: {'learning_rate': 0.00048357804883733345, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3362,2.847936,0.395967,0.072376,0.088198,0.069852
2,2.5403,2.216488,0.539872,0.190707,0.175506,0.158603
3,1.9565,1.762036,0.631531,0.243979,0.262791,0.237417
4,1.5055,1.480479,0.692026,0.290492,0.314977,0.289042
5,1.1827,1.297445,0.728689,0.367541,0.375413,0.354476
6,0.9249,1.16488,0.742438,0.39172,0.384994,0.368497
7,0.7397,1.135142,0.759853,0.473757,0.453046,0.443482
8,0.6199,1.085724,0.765353,0.472944,0.472275,0.46188
9,0.5085,1.047664,0.767186,0.480202,0.46753,0.463185
10,0.4156,1.065252,0.76077,0.488894,0.495697,0.485601


[I 2025-03-15 13:27:32,059] Trial 119 finished with value: 0.7142002441346605 and parameters: {'learning_rate': 0.00048357804883733345, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}. Best is trial 118 with value: 0.7156145737360871.


Trial 120 with params: {'learning_rate': 0.00020311718523518122, 'weight_decay': 0.005, 'adam_beta1': 0.96, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5824,3.295966,0.176902,0.003538,0.02,0.006012
2,3.1236,2.916826,0.405133,0.050599,0.087194,0.061937
3,2.7787,2.573864,0.456462,0.083758,0.113326,0.080349
4,2.4328,2.276532,0.528873,0.192056,0.168112,0.149927
5,2.1645,2.034426,0.587534,0.266761,0.219413,0.201701
6,1.8913,1.832912,0.63703,0.295896,0.257241,0.245832
7,1.6681,1.677978,0.683776,0.326011,0.301302,0.28516
8,1.4991,1.548382,0.702108,0.347711,0.328166,0.310006
9,1.3363,1.462773,0.71494,0.34651,0.357031,0.335524
10,1.1961,1.382344,0.730522,0.372149,0.375584,0.355335


[I 2025-03-15 13:29:25,526] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 0.0001384093567788085, 'weight_decay': 0.001, 'adam_beta1': 0.99, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6605,3.447104,0.176902,0.003538,0.02,0.006012
2,3.3222,3.169027,0.256645,0.074824,0.042754,0.034949
3,3.0869,2.951412,0.3978,0.055639,0.084879,0.063586
4,2.8702,2.759159,0.429881,0.063868,0.099563,0.073193
5,2.7114,2.573675,0.445463,0.061975,0.107626,0.077057
6,2.5123,2.414034,0.461962,0.09928,0.121264,0.094104
7,2.3476,2.282103,0.498625,0.201791,0.153044,0.133895
8,2.2168,2.148837,0.509624,0.199889,0.161631,0.144988
9,2.067,2.037517,0.580202,0.214483,0.208771,0.186689
10,1.9381,1.930723,0.589368,0.240395,0.223691,0.200296


[I 2025-03-15 13:30:19,702] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 8.637271504022808e-05, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7248,3.56095,0.176902,0.003538,0.02,0.006012
2,3.4651,3.32974,0.19615,0.063607,0.025753,0.0162
3,3.2598,3.124934,0.382218,0.060724,0.080583,0.063089
4,3.0582,2.936909,0.420715,0.072398,0.092616,0.068923
5,2.9042,2.767869,0.437214,0.063287,0.103901,0.076735
6,2.7227,2.609559,0.461962,0.103439,0.117341,0.089735
7,2.5646,2.47039,0.483043,0.120572,0.128088,0.10363
8,2.4429,2.348713,0.535289,0.203945,0.171295,0.158182
9,2.3103,2.240109,0.565536,0.22565,0.196514,0.183231
10,2.1906,2.145131,0.590284,0.286602,0.218015,0.206209


[I 2025-03-15 13:31:13,901] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.0003058278334151859, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4742,3.09811,0.337305,0.067361,0.067793,0.051943
2,2.8535,2.563045,0.448213,0.082024,0.114333,0.084644
3,2.3724,2.144683,0.543538,0.184655,0.171495,0.151198
4,1.9587,1.834716,0.633364,0.271214,0.249753,0.232462
5,1.6422,1.593393,0.698442,0.327305,0.330619,0.309725
6,1.3614,1.40573,0.721357,0.338661,0.347052,0.327821
7,1.1322,1.3058,0.72319,0.385206,0.370141,0.35688
8,0.9814,1.243044,0.749771,0.423162,0.410122,0.390665
9,0.8502,1.180094,0.747938,0.448885,0.419265,0.410102
10,0.7301,1.142304,0.758937,0.454672,0.426608,0.421639


[I 2025-03-15 13:33:13,914] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.00044950350781444275, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3455,2.846181,0.414299,0.070087,0.095752,0.074678
2,2.5357,2.19288,0.551787,0.17345,0.188871,0.168621
3,1.9368,1.72817,0.651696,0.265671,0.271006,0.253181
4,1.4911,1.468371,0.702108,0.301552,0.332898,0.300722
5,1.1742,1.304418,0.731439,0.328344,0.370369,0.338765
6,0.9415,1.177941,0.748854,0.463397,0.404121,0.399527
7,0.7539,1.120845,0.757104,0.4816,0.431922,0.428056
8,0.6292,1.075808,0.767186,0.459834,0.45187,0.444735
9,0.5149,1.050558,0.76352,0.485525,0.4722,0.466278
10,0.4265,1.01704,0.770852,0.472758,0.464767,0.459147


[I 2025-03-15 13:36:01,162] Trial 124 finished with value: 0.7026151622943182 and parameters: {'learning_rate': 0.00044950350781444275, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 0}. Best is trial 118 with value: 0.7156145737360871.


Trial 125 with params: {'learning_rate': 0.0002808306023385571, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5009,3.151452,0.274977,0.071977,0.047888,0.036941
2,2.9283,2.663586,0.439963,0.063078,0.107863,0.077547
3,2.4875,2.264583,0.534372,0.184451,0.172415,0.154229
4,2.0806,1.938045,0.601283,0.23748,0.230929,0.208023
5,1.7626,1.689424,0.68011,0.321757,0.304361,0.285483
6,1.4759,1.491277,0.712191,0.333017,0.340299,0.318528
7,1.2403,1.371252,0.715857,0.360124,0.346606,0.329013
8,1.0859,1.293081,0.751604,0.422767,0.411116,0.391425
9,0.9472,1.227282,0.742438,0.400978,0.391538,0.375856
10,0.8227,1.181811,0.751604,0.427989,0.412392,0.400039


[I 2025-03-15 13:37:00,118] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.0003066437425122062, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4791,3.117218,0.298808,0.070994,0.054165,0.039407
2,2.8884,2.625527,0.442713,0.062039,0.108939,0.077255
3,2.4453,2.229374,0.540788,0.202925,0.179315,0.162642
4,2.0297,1.894075,0.59945,0.263966,0.233693,0.211562
5,1.7036,1.645789,0.686526,0.312197,0.317941,0.289639
6,1.4161,1.454179,0.712191,0.346966,0.355309,0.334149
7,1.1794,1.34092,0.716774,0.39322,0.36064,0.343259
8,1.0262,1.25639,0.747021,0.431712,0.404167,0.387637
9,0.8923,1.211104,0.742438,0.3909,0.39684,0.375625
10,0.7723,1.16259,0.749771,0.468125,0.421732,0.412672


[I 2025-03-15 13:38:47,982] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.00022812103953476773, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5847,3.250943,0.203483,0.038648,0.027515,0.018527
2,3.0364,2.769402,0.430797,0.060698,0.102545,0.074101
3,2.6,2.367241,0.505958,0.138965,0.144059,0.122543
4,2.2167,2.053342,0.584785,0.247164,0.208946,0.193259
5,1.9187,1.806792,0.673694,0.342009,0.297278,0.289745
6,1.6345,1.608154,0.698442,0.374791,0.320615,0.312727
7,1.4069,1.470955,0.707608,0.329822,0.33001,0.315205
8,1.2489,1.381845,0.734189,0.36803,0.382543,0.360201
9,1.1016,1.302592,0.737855,0.364099,0.380476,0.363449
10,0.9724,1.248597,0.751604,0.424234,0.405285,0.396049


[I 2025-03-15 13:40:34,961] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 0.0004920368163252596, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3266,2.824339,0.405133,0.068196,0.092546,0.071908
2,2.5092,2.176996,0.55637,0.196124,0.192885,0.173585
3,1.9064,1.705657,0.643446,0.250341,0.264807,0.244108
4,1.4546,1.438911,0.696609,0.287662,0.319191,0.289156
5,1.133,1.270812,0.732356,0.362746,0.37273,0.35104
6,0.8882,1.148274,0.747021,0.439766,0.392617,0.382844
7,0.7073,1.115926,0.752521,0.456206,0.427618,0.423715
8,0.5942,1.069764,0.767186,0.485806,0.483412,0.475943
9,0.4777,1.042754,0.76352,0.474712,0.470958,0.463316
10,0.3889,1.032361,0.771769,0.508365,0.507088,0.500936


[I 2025-03-15 13:43:25,766] Trial 128 finished with value: 0.7100760490936798 and parameters: {'learning_rate': 0.0004920368163252596, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}. Best is trial 118 with value: 0.7156145737360871.


Trial 129 with params: {'learning_rate': 0.0001297138581409507, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6869,3.47237,0.176902,0.003538,0.02,0.006012
2,3.3365,3.159709,0.345555,0.064974,0.071212,0.054466
3,3.0615,2.891986,0.419798,0.070418,0.094728,0.07155
4,2.7939,2.64574,0.458295,0.09897,0.115098,0.087404
5,2.5812,2.423324,0.504125,0.172279,0.148422,0.127996


[I 2025-03-15 13:43:51,587] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.00038155303056576956, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4055,2.969987,0.388634,0.056837,0.08212,0.061572
2,2.6892,2.376675,0.500458,0.18231,0.14277,0.122801
3,2.1464,1.928937,0.582951,0.252266,0.210268,0.196601
4,1.7051,1.627403,0.678277,0.300107,0.301766,0.283442
5,1.3864,1.413743,0.72044,0.369355,0.358401,0.339841
6,1.1255,1.264238,0.739688,0.404045,0.38075,0.369388
7,0.9168,1.195619,0.747021,0.437789,0.404065,0.396877
8,0.7744,1.135765,0.762603,0.467261,0.441639,0.431958
9,0.6526,1.088737,0.768103,0.494213,0.476922,0.471852
10,0.5491,1.079813,0.765353,0.481627,0.46286,0.459946


[I 2025-03-15 13:44:45,258] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.0004901839825889945, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3277,2.82627,0.404216,0.068191,0.092182,0.071653
2,2.5117,2.179556,0.555454,0.195885,0.192397,0.173001
3,1.9099,1.708843,0.64253,0.249503,0.263855,0.243243
4,1.458,1.440946,0.698442,0.288348,0.319769,0.289793
5,1.1362,1.272527,0.731439,0.351782,0.371301,0.34621
6,0.8913,1.14906,0.747938,0.447385,0.396617,0.387967
7,0.71,1.116704,0.752521,0.456839,0.427618,0.4234
8,0.5963,1.070194,0.766269,0.465695,0.473412,0.462493
9,0.4801,1.042105,0.76352,0.474712,0.470958,0.463316
10,0.3911,1.032462,0.772686,0.508847,0.507542,0.501802


[I 2025-03-15 13:47:25,688] Trial 131 finished with value: 0.7092676994101437 and parameters: {'learning_rate': 0.0004901839825889945, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}. Best is trial 118 with value: 0.7156145737360871.


Trial 132 with params: {'learning_rate': 0.0003008083994533252, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4846,3.126962,0.292392,0.071148,0.052265,0.038219
2,2.9014,2.641612,0.441797,0.062556,0.108294,0.077395
3,2.4631,2.246101,0.541705,0.20359,0.17996,0.163168
4,2.0495,1.91159,0.594867,0.240169,0.225112,0.200904
5,1.7259,1.664008,0.681943,0.311228,0.313814,0.285513
6,1.4385,1.470469,0.709441,0.329517,0.341486,0.31769
7,1.2014,1.354151,0.71494,0.386183,0.356309,0.337903
8,1.047,1.267196,0.747021,0.431876,0.404167,0.387856
9,0.9115,1.220213,0.741522,0.389604,0.390515,0.369909
10,0.7901,1.170942,0.749771,0.472617,0.421381,0.414038


[I 2025-03-15 13:49:26,534] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 0.00048359108302686323, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3362,2.847959,0.395967,0.072376,0.088198,0.069852
2,2.5403,2.216562,0.540788,0.190729,0.178006,0.160386
3,1.9565,1.762216,0.632447,0.24482,0.263436,0.238085
4,1.5055,1.480443,0.692942,0.291022,0.316516,0.290558
5,1.1826,1.297512,0.728689,0.367645,0.375413,0.354457
6,0.925,1.164794,0.742438,0.39172,0.384994,0.368497
7,0.7397,1.134906,0.759853,0.473757,0.453046,0.443482
8,0.6198,1.08567,0.765353,0.472944,0.472275,0.46188
9,0.5085,1.047502,0.767186,0.479673,0.46753,0.46284
10,0.4155,1.065021,0.759853,0.487943,0.495423,0.484944


[I 2025-03-15 13:52:04,966] Trial 133 finished with value: 0.7139824172173519 and parameters: {'learning_rate': 0.00048359108302686323, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}. Best is trial 118 with value: 0.7156145737360871.


Trial 134 with params: {'learning_rate': 0.0004148912984969679, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4211,2.98007,0.366636,0.062774,0.075747,0.058925
2,2.6908,2.381382,0.472961,0.141099,0.126538,0.099816
3,2.1426,1.930989,0.586618,0.252259,0.212814,0.194065
4,1.6917,1.619503,0.68011,0.312734,0.314054,0.286641
5,1.3619,1.41807,0.708524,0.342508,0.347083,0.321501
6,1.0948,1.251088,0.738772,0.392734,0.38602,0.368869
7,0.8771,1.182856,0.748854,0.42888,0.408947,0.397253
8,0.7334,1.124665,0.761687,0.477103,0.45227,0.445073
9,0.6057,1.081501,0.765353,0.509686,0.478664,0.478025
10,0.5044,1.070312,0.767186,0.486382,0.483878,0.476239


[I 2025-03-15 13:54:44,900] Trial 134 finished with value: 0.6772508813803748 and parameters: {'learning_rate': 0.0004148912984969679, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1}. Best is trial 118 with value: 0.7156145737360871.


Trial 135 with params: {'learning_rate': 0.0003168514525961264, 'weight_decay': 0.004, 'adam_beta1': 0.96, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4697,3.100386,0.31439,0.070477,0.059199,0.044105
2,2.8658,2.597438,0.448213,0.101882,0.115027,0.08609
3,2.4145,2.20037,0.541705,0.206145,0.187802,0.169155
4,1.9953,1.864039,0.605866,0.269157,0.239083,0.217194
5,1.6652,1.61522,0.688359,0.305414,0.320628,0.291279
6,1.3781,1.42691,0.714024,0.341324,0.351859,0.32977
7,1.1429,1.320099,0.716774,0.40637,0.364943,0.35018
8,0.9919,1.238951,0.748854,0.430049,0.406613,0.390278
9,0.861,1.195444,0.748854,0.428242,0.410976,0.394332
10,0.7426,1.147944,0.753437,0.45556,0.42928,0.420079


[I 2025-03-15 13:56:37,513] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.0002758057047481744, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5033,3.151397,0.291476,0.072643,0.053249,0.042615
2,2.924,2.649382,0.439963,0.062403,0.107863,0.077217
3,2.4708,2.241923,0.532539,0.168412,0.165952,0.147332
4,2.0668,1.926888,0.604033,0.2683,0.230502,0.210496
5,1.7557,1.68057,0.681943,0.317829,0.298956,0.281682
6,1.4733,1.486103,0.713107,0.323044,0.330162,0.312097
7,1.2433,1.371599,0.716774,0.33214,0.345073,0.325859
8,1.0897,1.293383,0.750687,0.422519,0.410226,0.390862
9,0.9503,1.231093,0.741522,0.39323,0.401914,0.384912
10,0.8241,1.180663,0.754354,0.468451,0.422722,0.417901


[I 2025-03-15 13:57:28,806] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 0.00016880670250329712, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.66,3.406834,0.176902,0.003538,0.02,0.006012
2,3.2467,3.049823,0.372136,0.060763,0.07856,0.058688
3,2.9317,2.743094,0.436297,0.081914,0.102913,0.075822
4,2.6224,2.464241,0.480293,0.10499,0.132183,0.103325
5,2.3772,2.224661,0.543538,0.221058,0.184113,0.168986


[I 2025-03-15 13:57:54,225] Trial 137 pruned. 


Trial 138 with params: {'learning_rate': 0.00043741356248507994, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4025,2.934013,0.387718,0.059302,0.082773,0.062342
2,2.6287,2.298649,0.51879,0.159594,0.154543,0.134793
3,2.0452,1.834099,0.6022,0.268467,0.22897,0.212457
4,1.5847,1.532031,0.697525,0.313521,0.326942,0.29618
5,1.2513,1.34459,0.72044,0.386303,0.364355,0.343111
6,0.9892,1.193027,0.746104,0.441921,0.403929,0.397785
7,0.8046,1.165455,0.744271,0.403465,0.401139,0.388527
8,0.6746,1.103342,0.767186,0.482339,0.459248,0.456005
9,0.5584,1.061804,0.769019,0.504846,0.487509,0.482465
10,0.4665,1.061798,0.76352,0.452179,0.475223,0.458128


[I 2025-03-15 13:58:48,023] Trial 138 pruned. 


Trial 139 with params: {'learning_rate': 1.1619982946199614e-06, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8914,3.877134,0.008249,0.004173,0.022233,0.002446
2,3.8776,3.869755,0.010082,0.004529,0.022089,0.002456
3,3.8761,3.863133,0.010082,0.003595,0.021738,0.002132
4,3.8667,3.857069,0.013749,0.004254,0.022153,0.002678
5,3.8647,3.851296,0.018332,0.00481,0.022671,0.003353
6,3.8551,3.846063,0.023831,0.004968,0.023293,0.003924
7,3.8492,3.841056,0.027498,0.005195,0.023707,0.004323
8,3.846,3.836272,0.032081,0.009093,0.024576,0.005391
9,3.8418,3.831838,0.038497,0.027817,0.025787,0.006472
10,3.8372,3.82757,0.050412,0.036071,0.027475,0.008323


[I 2025-03-15 13:59:46,426] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.00034495997412051937, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4446,3.055337,0.353804,0.065009,0.072165,0.055845
2,2.8054,2.522484,0.452796,0.094242,0.116413,0.087045
3,2.33,2.116951,0.55912,0.197879,0.190976,0.164466
4,1.9027,1.785079,0.626031,0.313759,0.265209,0.24044
5,1.5645,1.54176,0.691109,0.318718,0.328456,0.301085
6,1.2818,1.363593,0.716774,0.374829,0.358921,0.337609
7,1.0533,1.275015,0.726856,0.390659,0.369216,0.353697
8,0.9064,1.19967,0.749771,0.42752,0.414147,0.399249
9,0.7848,1.153547,0.75527,0.462232,0.439968,0.426765
10,0.6722,1.122587,0.754354,0.457817,0.440736,0.429848


[I 2025-03-15 14:01:33,933] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 0.0004981796332020285, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3275,2.833004,0.3978,0.070364,0.089058,0.069912
2,2.521,2.197813,0.544455,0.189935,0.182521,0.164565
3,1.9319,1.74099,0.63703,0.254894,0.269276,0.245524
4,1.4793,1.461324,0.699358,0.315726,0.324588,0.302047
5,1.1535,1.281775,0.728689,0.348126,0.370314,0.346289
6,0.8969,1.155271,0.743355,0.39562,0.385468,0.370935
7,0.7146,1.128517,0.75527,0.479145,0.438716,0.432861
8,0.5999,1.078983,0.76352,0.469338,0.466084,0.455578
9,0.4885,1.04263,0.767186,0.519007,0.484311,0.484605
10,0.3992,1.059707,0.758937,0.503335,0.497284,0.491072


[I 2025-03-15 14:04:09,047] Trial 141 finished with value: 0.7006908983668994 and parameters: {'learning_rate': 0.0004981796332020285, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0}. Best is trial 118 with value: 0.7156145737360871.


Trial 142 with params: {'learning_rate': 0.0004235336546447731, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4366,2.971219,0.362053,0.080552,0.076551,0.056627
2,2.6636,2.330764,0.523373,0.172005,0.164629,0.144924
3,2.0743,1.843183,0.610449,0.246261,0.240902,0.22148
4,1.6033,1.548545,0.688359,0.297665,0.31038,0.284838
5,1.2649,1.363574,0.727773,0.364356,0.364495,0.339055
6,1.0053,1.212561,0.743355,0.445554,0.401595,0.390659
7,0.8135,1.152853,0.752521,0.464502,0.432454,0.423897
8,0.6902,1.103796,0.76077,0.459739,0.452342,0.442251
9,0.5738,1.072501,0.764436,0.513855,0.489304,0.488059
10,0.4784,1.068956,0.759853,0.456846,0.477856,0.46281


[I 2025-03-15 14:06:54,926] Trial 142 finished with value: 0.6773747428897684 and parameters: {'learning_rate': 0.0004235336546447731, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 3}. Best is trial 118 with value: 0.7156145737360871.


Trial 143 with params: {'learning_rate': 0.0004823107653084038, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3974,2.861158,0.419798,0.070006,0.097244,0.074938
2,2.5169,2.150416,0.558203,0.202552,0.200948,0.188174
3,1.8679,1.664317,0.670027,0.314355,0.306268,0.288934
4,1.4023,1.401335,0.708524,0.327172,0.328221,0.310687
5,1.0899,1.267919,0.737855,0.408233,0.389305,0.369395
6,0.8543,1.17049,0.747021,0.448486,0.42372,0.418097
7,0.6836,1.125839,0.747938,0.512382,0.455442,0.455777
8,0.5689,1.088361,0.758937,0.501956,0.476137,0.472821
9,0.455,1.029726,0.766269,0.507702,0.48277,0.484231
10,0.3716,1.026881,0.772686,0.512441,0.489676,0.490714


[I 2025-03-15 14:09:35,922] Trial 143 finished with value: 0.7070454037090963 and parameters: {'learning_rate': 0.0004823107653084038, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 118 with value: 0.7156145737360871.


Trial 144 with params: {'learning_rate': 5.195169526885676e-05, 'weight_decay': 0.007, 'adam_beta1': 0.99, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7913,3.676977,0.191567,0.015932,0.024384,0.012601
2,3.6186,3.53944,0.176902,0.003538,0.02,0.006012
3,3.497,3.417212,0.176902,0.003538,0.02,0.006012
4,3.3736,3.302738,0.180568,0.043551,0.021024,0.008028
5,3.2887,3.200919,0.304308,0.072081,0.057674,0.048962
6,3.1814,3.10876,0.368469,0.064227,0.076488,0.060652
7,3.0857,3.021619,0.389551,0.057822,0.082521,0.062848
8,3.0126,2.94198,0.410632,0.073941,0.089219,0.066681
9,2.9313,2.867162,0.424381,0.068452,0.096871,0.073963
10,2.8626,2.796312,0.43538,0.065031,0.102032,0.076713


[I 2025-03-15 14:11:23,867] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 0.00015248248744042186, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6351,3.384617,0.176902,0.003538,0.02,0.006012
2,3.2322,3.034082,0.393217,0.057374,0.084226,0.06372
3,2.9173,2.723018,0.441797,0.080298,0.105827,0.077396
4,2.6097,2.452161,0.496792,0.144264,0.140912,0.118963
5,2.3707,2.217189,0.560037,0.22576,0.187475,0.173412
6,2.1204,2.029992,0.593951,0.245687,0.214212,0.196347
7,1.9177,1.878856,0.644363,0.319001,0.265761,0.25679
8,1.7634,1.751952,0.679193,0.336211,0.296826,0.27986
9,1.6025,1.636251,0.691109,0.363841,0.316075,0.306535
10,1.4642,1.549035,0.711274,0.375413,0.340823,0.325391


[I 2025-03-15 14:13:24,029] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.00013897742449221987, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6504,3.412247,0.176902,0.003538,0.02,0.006012
2,3.2674,3.074276,0.390467,0.058628,0.083284,0.063973
3,2.9665,2.778881,0.436297,0.062849,0.102679,0.074858
4,2.674,2.519199,0.477544,0.119665,0.127897,0.102453
5,2.447,2.290409,0.542621,0.207118,0.175492,0.160765
6,2.2062,2.107826,0.586618,0.229013,0.206009,0.187763
7,2.0105,1.958677,0.623281,0.279979,0.237544,0.228771
8,1.8609,1.832364,0.668194,0.328966,0.284279,0.27135
9,1.7035,1.716218,0.68286,0.340903,0.295567,0.286775
10,1.5673,1.625075,0.699358,0.361551,0.323359,0.309028


[I 2025-03-15 14:15:06,620] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 1.4477251200913327e-05, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8588,3.806967,0.096242,0.008228,0.031298,0.007638
2,3.7814,3.740043,0.181485,0.01138,0.022061,0.009656
3,3.7297,3.687729,0.186984,0.012843,0.023184,0.010963
4,3.6783,3.639993,0.181485,0.016071,0.02137,0.008529
5,3.6439,3.595624,0.183318,0.021071,0.021918,0.009516


[I 2025-03-15 14:15:32,498] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.00030333774133548387, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4766,3.10249,0.334555,0.067203,0.066953,0.051488
2,2.8591,2.569742,0.448213,0.062024,0.110548,0.078073
3,2.3804,2.152195,0.543538,0.186185,0.171495,0.151384
4,1.9674,1.841762,0.631531,0.271059,0.248863,0.231577
5,1.6509,1.59964,0.696609,0.331076,0.32721,0.308022
6,1.3698,1.411225,0.721357,0.333157,0.347052,0.326936
7,1.1404,1.310322,0.721357,0.37963,0.365686,0.352327
8,0.9895,1.246827,0.750687,0.413852,0.40894,0.387986
9,0.8577,1.183771,0.746104,0.42746,0.415337,0.404059
10,0.7371,1.145553,0.75802,0.470048,0.426457,0.425042


[I 2025-03-15 14:17:17,413] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.0004499563740490953, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.349,2.858869,0.409716,0.072132,0.093801,0.073905
2,2.5523,2.215647,0.540788,0.16898,0.178683,0.160364
3,1.9588,1.747913,0.640697,0.273792,0.261621,0.246225
4,1.5076,1.4739,0.703025,0.306409,0.330714,0.298926
5,1.1862,1.305043,0.722273,0.315198,0.355448,0.325776
6,0.9455,1.175576,0.748854,0.464182,0.414157,0.406327
7,0.7566,1.12492,0.75527,0.454972,0.432995,0.428937
8,0.6337,1.074999,0.768103,0.481145,0.451322,0.446282
9,0.5198,1.043069,0.764436,0.486491,0.476648,0.468342
10,0.4335,1.024741,0.767186,0.459329,0.461784,0.453779


[W 2025-03-15 14:18:42,874] Trial 149 failed with parameters: {'learning_rate': 0.0004499563740490953, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 0} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 250, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2241, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2553, in _inner_training_loop
    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
KeyboardInterrupt
[W 2025-03-15 14:18:42,878] Trial 149 failed with value None.


KeyboardInterrupt: 

In [29]:
# Print the best run currently stored in `best_trial`.
# NOTE(review): `best_trial` is assigned outside this cell, and the search output
# directly above ends in a KeyboardInterrupt -- confirm this value is still
# produced on a fresh Restart & Run All.
print(best_trial)

BestRun(run_id='118', objective=0.7156145737360871, hyperparameters={'learning_rate': 0.0004762367229644209, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 0}, run_summary=None)


In [36]:
# Reset the seed before setting up the next hyperparameter search.
# NOTE(review): presumably re-seeds the RNGs used for training; the behaviour
# lives in the project-local `base` module -- confirm there.
base.reset_seed()

In [37]:
# Training arguments for the distillation hyperparameter search.
# NOTE(review): the "~" in both paths is passed through unexpanded from this
# cell; confirm that base.get_training_args (or whatever consumes the paths)
# resolves it to the home directory rather than creating a literal "~" folder.
# NOTE(review): remove_unused_columns=False presumably keeps the extra dataset
# columns (e.g. teacher logits) available to the trainer -- verify in `base`.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill_fine_hp-search",
    logging_dir=f"~/logs/{DATASET}/bert-distill_fine_hp-search",
    remove_unused_columns=False,
    epochs=num_epochs,
    batch_size=batch_size,
)

In [38]:
def hp_space(trial):
    """Sample one hyperparameter configuration for a distillation trial.

    Args:
        trial: Optuna trial providing ``suggest_float`` / ``suggest_int`` and
            a ``number`` attribute.

    Returns:
        dict mapping hyperparameter name to the sampled value, with keys in
        this order: ``learning_rate``, ``weight_decay``, ``adam_beta1``,
        ``warmup_steps``, ``lambda_param``, ``temperature``.

    Notes:
        ``warmup_steps`` is bounded above by the notebook-level ``warm_up``.
        ``lambda_param`` and ``temperature`` are presumably consumed by
        ``base.DistilTrainer`` as distillation-loss settings -- TODO confirm.
    """
    # Values are requested one by one in a fixed order so that both the
    # sequence of suggest calls and the resulting dict layout stay stable.
    sampled = {}
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 5e-4, log=True)
    sampled["weight_decay"] = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    sampled["adam_beta1"] = trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01)
    sampled["warmup_steps"] = trial.suggest_int("warmup_steps", 0, warm_up)
    sampled["lambda_param"] = trial.suggest_float("lambda_param", 0, 1, step=0.1)
    sampled["temperature"] = trial.suggest_float("temperature", 2, 7, step=0.5)

    # Log the configuration so each trial's table in the output can be matched
    # to its parameters.
    print(f"Trial {trial.number} with params: {sampled}")
    return sampled

In [39]:
# Hyperband pruner bounded by the notebook-level min_r / max_r resources.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)

# Multivariate TPE sampler with a fixed seed so the sampling sequence is repeatable.
sampler = optuna.samplers.TPESampler(multivariate=True, seed=42)



In [40]:
# Trainer for the distillation hyperparameter search.
trainer = base.DistilTrainer(
    # Kept as a zero-argument wrapper around get_Bert so it is always called
    # without arguments.
    model_init=lambda: get_Bert(),
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # Early stopping is currently disabled:
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Run the Optuna-backed search over `hp_space`, maximising the eval F1 score.
best_trial2 = trainer.hyperparameter_search(
    # What to search and how to score a trial.
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    n_trials=150,
    # Backend selection and the study settings passed along with it.
    backend="optuna",
    study_name="Test-destilace",
    sampler=sampler,
    pruner=pruner,
)

[I 2025-03-15 14:20:09,293] A new study created in memory with name: Test-destilace


Trial 0 with params: {'learning_rate': 1.0253509690168497e-05, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4594,2.429851,0.041247,0.00816,0.026134,0.006414
2,2.4239,2.396845,0.160403,0.009651,0.019518,0.009362
3,2.3961,2.368189,0.189734,0.034879,0.024276,0.012312
4,2.3717,2.344594,0.184235,0.011102,0.022362,0.009811
5,2.3497,2.323391,0.183318,0.014357,0.021918,0.009339


[I 2025-03-15 14:20:35,601] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 1.4347159517201402e-06, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4749,2.462585,0.007333,0.003602,0.021778,0.002005
2,2.4689,2.456624,0.009166,0.003642,0.021634,0.002
3,2.4649,2.451249,0.012832,0.004139,0.022049,0.002535
4,2.4608,2.446311,0.016499,0.004379,0.022463,0.003054
5,2.4527,2.44166,0.022915,0.004733,0.023189,0.003783
6,2.4481,2.437341,0.030247,0.009018,0.024369,0.005226
7,2.4454,2.433274,0.036664,0.03286,0.02575,0.006695
8,2.4406,2.429358,0.049496,0.032154,0.027201,0.007719
9,2.4378,2.425663,0.058662,0.011842,0.028624,0.008521
10,2.4342,2.422149,0.07791,0.008839,0.029577,0.007975


[I 2025-03-15 14:22:19,764] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 0.0001764971584817573, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2917,2.118388,0.178735,0.023545,0.020476,0.006952
2,2.0205,1.875414,0.417049,0.066579,0.092705,0.068026
3,1.7935,1.648747,0.483043,0.142685,0.135698,0.114761
4,1.574,1.461999,0.547204,0.172001,0.174684,0.153992
5,1.4052,1.315813,0.59945,0.240876,0.211836,0.19497
6,1.2448,1.198362,0.657195,0.260667,0.257932,0.240893
7,1.1162,1.104186,0.67461,0.262459,0.27623,0.255129
8,1.0197,1.031983,0.693859,0.265929,0.29257,0.26698
9,0.9279,0.975501,0.699358,0.28639,0.301362,0.2798
10,0.8451,0.926788,0.71494,0.310206,0.318341,0.294888


[I 2025-03-15 14:24:05,306] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 1.4648955132800731e-05, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4506,2.413191,0.106324,0.008565,0.032609,0.007847
2,2.4027,2.369193,0.184235,0.018169,0.022803,0.010798
3,2.3674,2.335685,0.186984,0.013614,0.023014,0.010754
4,2.3373,2.304905,0.179652,0.018558,0.020822,0.007599
5,2.3097,2.276983,0.180568,0.019561,0.021096,0.008097
6,2.2812,2.250201,0.179652,0.023554,0.020822,0.007615
7,2.2576,2.225717,0.179652,0.023548,0.020822,0.007605
8,2.2358,2.202315,0.179652,0.023548,0.020822,0.007605
9,2.215,2.181003,0.192484,0.043594,0.02454,0.013753
10,2.1973,2.160498,0.215399,0.06368,0.03082,0.02314


[I 2025-03-15 14:26:02,817] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 1.7018418817029176e-05, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4492,2.407241,0.128323,0.009385,0.035447,0.008741
2,2.3937,2.355601,0.184235,0.019713,0.022632,0.010541
3,2.353,2.315982,0.181485,0.013581,0.02137,0.008484
4,2.3168,2.281678,0.181485,0.020231,0.02137,0.008582
5,2.2847,2.245561,0.179652,0.023554,0.020822,0.007615


[I 2025-03-15 14:26:28,463] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 4.3625993625605605e-05, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4187,2.34003,0.182401,0.013581,0.021644,0.008897
2,2.3103,2.248027,0.178735,0.023545,0.020548,0.007089
3,2.2305,2.1649,0.20165,0.059635,0.026992,0.017953
4,2.1529,2.088111,0.349221,0.066878,0.07023,0.060808
5,2.0874,2.018613,0.3978,0.077292,0.084844,0.065279
6,2.0149,1.953556,0.411549,0.09322,0.089932,0.068166
7,1.9518,1.89372,0.428964,0.087108,0.099525,0.077771
8,1.8972,1.837176,0.455545,0.104949,0.113664,0.092499
9,1.8417,1.785568,0.469294,0.102984,0.12236,0.09944
10,1.7942,1.739937,0.483043,0.101493,0.131862,0.105799


[I 2025-03-15 14:27:21,391] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 6.639623079859462e-06, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4653,2.442222,0.017415,0.003706,0.022216,0.002785
2,2.4408,2.419843,0.073327,0.006787,0.028708,0.006799
3,2.4217,2.398395,0.15582,0.010183,0.019,0.009355
4,2.4034,2.378804,0.184235,0.021924,0.022973,0.011063
5,2.3838,2.361715,0.190651,0.035348,0.02455,0.01265
6,2.3678,2.346921,0.186984,0.016483,0.023274,0.011169
7,2.3554,2.333164,0.184235,0.014264,0.022192,0.009734
8,2.3427,2.320621,0.181485,0.012675,0.02137,0.00846
9,2.3317,2.309198,0.182401,0.01449,0.021644,0.008931
10,2.3216,2.298605,0.182401,0.016907,0.021644,0.008991


[I 2025-03-15 14:28:14,558] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 1.2382649697023546e-06, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4753,2.463449,0.008249,0.004114,0.022233,0.002411
2,2.4702,2.458217,0.010082,0.004351,0.022089,0.002433
3,2.4669,2.453497,0.010082,0.003532,0.021738,0.002119
4,2.4633,2.449136,0.014665,0.004238,0.022256,0.002786
5,2.4557,2.445041,0.019248,0.004582,0.022774,0.00338
6,2.4518,2.441245,0.025665,0.008791,0.023851,0.004772
7,2.4494,2.437656,0.029331,0.008898,0.024265,0.005108
8,2.4452,2.4342,0.036664,0.027944,0.02558,0.00626
9,2.4427,2.430972,0.047663,0.035782,0.027164,0.008022
10,2.4395,2.427918,0.056829,0.01213,0.028066,0.008144


[I 2025-03-15 14:30:03,034] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 2.9891977384599008e-05, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4338,2.37542,0.188818,0.015851,0.023992,0.011432
2,2.3572,2.314352,0.178735,0.016881,0.020548,0.007079
3,2.3058,2.264966,0.176902,0.003538,0.02,0.006012
4,2.2587,2.219901,0.176902,0.003538,0.02,0.006012
5,2.2196,2.178287,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 14:30:29,564] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 4.108791545324077e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.413,2.337925,0.182401,0.013581,0.021644,0.008897
2,2.3109,2.251229,0.179652,0.023548,0.020822,0.007605
3,2.2359,2.172647,0.193401,0.063597,0.024742,0.014307
4,2.1626,2.100256,0.341888,0.070403,0.068039,0.060738
5,2.1009,2.034513,0.392301,0.078309,0.083211,0.065104
6,2.0327,1.973692,0.407883,0.073564,0.087633,0.064854
7,1.9732,1.917075,0.417965,0.091995,0.093116,0.071774
8,1.9213,1.863005,0.44363,0.087383,0.10586,0.082665
9,1.8687,1.813754,0.457379,0.102647,0.115298,0.092977
10,1.8238,1.770011,0.476627,0.102889,0.127239,0.1028


[I 2025-03-15 14:33:15,378] Trial 9 finished with value: 0.18778763747230062 and parameters: {'learning_rate': 4.108791545324077e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 9 with value: 0.18778763747230062.


Trial 10 with params: {'learning_rate': 6.182305620915354e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3892,2.297538,0.176902,0.003538,0.02,0.006012
2,2.2582,2.182814,0.178735,0.023545,0.020548,0.007089
3,2.1586,2.079293,0.344638,0.066101,0.068732,0.058711
4,2.0591,1.983368,0.406049,0.053426,0.086837,0.063775
5,1.9741,1.892501,0.428964,0.086049,0.098232,0.075189
6,1.8791,1.808123,0.455545,0.104354,0.114868,0.092778
7,1.7984,1.735307,0.47571,0.103409,0.126702,0.102962
8,1.7323,1.66739,0.494959,0.121061,0.138796,0.112338
9,1.6617,1.6045,0.505041,0.119884,0.143682,0.116026
10,1.6006,1.551492,0.52429,0.186036,0.158349,0.137228


[I 2025-03-15 14:34:07,818] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.00015915550792002775, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3032,2.137296,0.176902,0.003538,0.02,0.006012
2,2.0465,1.908542,0.412466,0.070244,0.089909,0.064655
3,1.8341,1.693793,0.471127,0.123545,0.128563,0.106314
4,1.6267,1.513946,0.537122,0.2008,0.171507,0.154161
5,1.4657,1.37052,0.584785,0.243919,0.200412,0.184173
6,1.3103,1.254927,0.635197,0.281314,0.244787,0.233591
7,1.1839,1.158519,0.664528,0.261774,0.268938,0.250222
8,1.0885,1.084924,0.68011,0.262502,0.285449,0.260973
9,0.9959,1.023059,0.695692,0.303643,0.297096,0.274102
10,0.9129,0.970873,0.703941,0.301225,0.306129,0.282891


[I 2025-03-15 14:35:02,715] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.000419426490802605, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1574,1.862028,0.424381,0.065617,0.100432,0.075538
2,1.6711,1.452709,0.549038,0.168522,0.181672,0.161647
3,1.3003,1.165089,0.636114,0.229216,0.241084,0.224323
4,1.0303,0.998735,0.690192,0.275819,0.288456,0.26411
5,0.8439,0.895196,0.71769,0.322039,0.326894,0.303367
6,0.6972,0.815031,0.736022,0.385046,0.357287,0.342217
7,0.5854,0.778704,0.741522,0.401253,0.380203,0.366018
8,0.5098,0.746761,0.754354,0.404369,0.403072,0.390389
9,0.4471,0.721888,0.762603,0.449973,0.430582,0.42311
10,0.386,0.705953,0.762603,0.471796,0.438014,0.438794


[I 2025-03-15 14:36:51,194] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 7.100205390479966e-06, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4633,2.439474,0.021998,0.004482,0.023085,0.003645
2,2.4375,2.41515,0.095325,0.008788,0.031897,0.008723
3,2.4165,2.391193,0.172319,0.013088,0.021285,0.010364
4,2.3962,2.369962,0.188818,0.02199,0.024173,0.012163
5,2.3755,2.351939,0.190651,0.017019,0.02437,0.011993
6,2.3585,2.336093,0.184235,0.013023,0.022192,0.009673
7,2.3452,2.321528,0.181485,0.012675,0.02137,0.00846
8,2.3319,2.308127,0.181485,0.013577,0.02137,0.008479
9,2.3195,2.295583,0.181485,0.014685,0.02137,0.008503
10,2.3086,2.283989,0.180568,0.019561,0.021096,0.008097


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-15 14:39:30,372] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.00040386930502167296, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1895,1.912197,0.402383,0.075821,0.086667,0.06508
2,1.7229,1.50457,0.540788,0.175663,0.178224,0.160821
3,1.3542,1.212899,0.603116,0.21568,0.211411,0.192431
4,1.0776,1.032739,0.683776,0.258796,0.283526,0.26165
5,0.8841,0.92124,0.705775,0.279132,0.310125,0.282158
6,0.7273,0.829004,0.722273,0.336457,0.336827,0.319486
7,0.6132,0.802715,0.72594,0.372123,0.357585,0.343167
8,0.538,0.757456,0.754354,0.403321,0.396305,0.381835
9,0.4701,0.734136,0.750687,0.449472,0.416486,0.409643
10,0.4088,0.720903,0.75802,0.48797,0.436526,0.439056


[I 2025-03-15 14:41:20,666] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.00044802688590348845, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1581,1.878799,0.406966,0.069862,0.091561,0.071852
2,1.6987,1.496223,0.527039,0.172609,0.166609,0.147219
3,1.3471,1.217801,0.593951,0.225662,0.209073,0.182473
4,1.0675,1.037081,0.671861,0.245897,0.281054,0.252933
5,0.8747,0.935401,0.688359,0.277727,0.291308,0.264985
6,0.7315,0.841954,0.728689,0.360447,0.34121,0.319355
7,0.6177,0.798779,0.738772,0.384314,0.366833,0.353885
8,0.5384,0.771288,0.745188,0.417405,0.402997,0.390405
9,0.4707,0.7545,0.752521,0.415441,0.414974,0.39963
10,0.4103,0.736656,0.75802,0.471663,0.438548,0.430987


[I 2025-03-15 14:43:09,215] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.00044745114736637206, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1766,1.872992,0.414299,0.066177,0.093867,0.071857
2,1.669,1.443257,0.549038,0.17741,0.184497,0.164025
3,1.2794,1.145677,0.64253,0.252864,0.243475,0.226989
4,1.001,0.976686,0.696609,0.28911,0.298811,0.277762
5,0.8059,0.870708,0.709441,0.321154,0.322077,0.300886
6,0.6636,0.801028,0.729606,0.390824,0.359514,0.348164
7,0.5553,0.779485,0.743355,0.418907,0.390505,0.382116
8,0.4863,0.745497,0.75527,0.427285,0.412069,0.404461
9,0.4211,0.719476,0.75802,0.497513,0.437105,0.440581
10,0.3625,0.703978,0.762603,0.490838,0.450549,0.45583


[I 2025-03-15 14:45:55,559] Trial 16 finished with value: 0.6171503690738473 and parameters: {'learning_rate': 0.00044745114736637206, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 3.0}. Best is trial 16 with value: 0.6171503690738473.


Trial 17 with params: {'learning_rate': 0.00014978859143602985, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3288,2.167349,0.176902,0.003538,0.02,0.006012
2,2.0804,1.947633,0.406966,0.073501,0.087928,0.064695
3,1.8766,1.738654,0.461045,0.099849,0.121116,0.0934
4,1.6752,1.561933,0.520623,0.198432,0.158285,0.137817
5,1.516,1.417834,0.56462,0.223026,0.190275,0.172578
6,1.3614,1.299616,0.60495,0.241472,0.220615,0.206211
7,1.2352,1.201646,0.649863,0.267617,0.25783,0.245458
8,1.1379,1.1232,0.681027,0.267683,0.283691,0.261954
9,1.0421,1.058048,0.697525,0.298357,0.297437,0.27531
10,0.9585,1.004148,0.703941,0.304811,0.306632,0.286811


[I 2025-03-15 14:48:38,874] Trial 17 finished with value: 0.38940911186640337 and parameters: {'learning_rate': 0.00014978859143602985, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 16 with value: 0.6171503690738473.


Trial 18 with params: {'learning_rate': 0.0003070981811342446, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2481,2.004475,0.363886,0.065855,0.074486,0.060334
2,1.8413,1.631227,0.483043,0.139391,0.138703,0.114312
3,1.5012,1.338429,0.574702,0.219485,0.194688,0.177663
4,1.2269,1.144347,0.660862,0.264504,0.266697,0.251001
5,1.028,1.009114,0.692026,0.282385,0.291071,0.266003
6,0.8636,0.904463,0.712191,0.298279,0.313708,0.292764
7,0.7371,0.847441,0.708524,0.321434,0.318486,0.300308
8,0.6546,0.814791,0.736939,0.376385,0.362543,0.345304
9,0.5846,0.785707,0.739688,0.38729,0.372418,0.35968
10,0.5157,0.766615,0.745188,0.443191,0.393747,0.389141


[I 2025-03-15 14:51:27,636] Trial 18 finished with value: 0.5830696296591694 and parameters: {'learning_rate': 0.0003070981811342446, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 16 with value: 0.6171503690738473.


Trial 19 with params: {'learning_rate': 0.000264337305031989, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2685,2.047209,0.303391,0.071965,0.056621,0.045728
2,1.9012,1.705089,0.463795,0.124434,0.128626,0.101793
3,1.59,1.427365,0.545371,0.178082,0.173135,0.153217
4,1.3264,1.225659,0.638863,0.248223,0.243166,0.225761
5,1.1279,1.075925,0.683776,0.272318,0.286331,0.264542
6,0.958,0.966228,0.699358,0.285276,0.296804,0.27563
7,0.8257,0.895058,0.703025,0.304983,0.303086,0.282441
8,0.7365,0.849831,0.724106,0.341724,0.338228,0.316463
9,0.661,0.817992,0.726856,0.357685,0.346013,0.333335
10,0.5872,0.797321,0.743355,0.377273,0.371353,0.355064


[I 2025-03-15 14:53:14,378] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 0.0004703512078685806, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1829,1.881323,0.410632,0.071544,0.093461,0.072734
2,1.6739,1.449646,0.550871,0.168236,0.190546,0.166774
3,1.2791,1.136615,0.641613,0.241738,0.247125,0.22929
4,0.989,0.961518,0.693859,0.272529,0.292728,0.267577
5,0.7909,0.86941,0.71494,0.308807,0.327055,0.302417
6,0.6479,0.792474,0.739688,0.401374,0.372816,0.361861
7,0.5408,0.767816,0.735105,0.384548,0.380427,0.364745
8,0.4716,0.734059,0.757104,0.421694,0.416171,0.404513
9,0.4088,0.716736,0.756187,0.453611,0.442486,0.435964
10,0.3506,0.700629,0.76352,0.533683,0.456711,0.461454


[I 2025-03-15 14:55:49,121] Trial 20 finished with value: 0.6294873476522568 and parameters: {'learning_rate': 0.0004703512078685806, 'weight_decay': 0.003, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.0}. Best is trial 20 with value: 0.6294873476522568.


Trial 21 with params: {'learning_rate': 0.00014626977079125038, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3322,2.17504,0.176902,0.003538,0.02,0.006012
2,2.0914,1.963308,0.40055,0.055296,0.085993,0.063656
3,1.8954,1.760436,0.460128,0.099223,0.12012,0.092065
4,1.6981,1.586834,0.508708,0.160261,0.152523,0.132463
5,1.54,1.441257,0.558203,0.223298,0.187559,0.169601


[I 2025-03-15 14:56:21,347] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.00022538974253837323, 'weight_decay': 0.005, 'adam_beta1': 0.96, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2956,2.106008,0.176902,0.003538,0.02,0.006012
2,2.0039,1.855937,0.422548,0.065532,0.096099,0.071466
3,1.7634,1.616804,0.48121,0.120112,0.139375,0.111927
4,1.5214,1.411612,0.548121,0.167689,0.179463,0.154941
5,1.3313,1.255288,0.614115,0.234816,0.22438,0.200843
6,1.1561,1.127644,0.661778,0.260442,0.258756,0.239554
7,1.0171,1.036859,0.683776,0.255745,0.284787,0.259815
8,0.9126,0.961436,0.703941,0.300546,0.302714,0.280644
9,0.8224,0.915932,0.710357,0.301587,0.317782,0.29044
10,0.7378,0.875065,0.726856,0.348062,0.339291,0.320624


[I 2025-03-15 14:57:21,683] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.00025168387465058866, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2665,2.059134,0.256645,0.075323,0.041947,0.034353
2,1.9287,1.747852,0.454629,0.105419,0.122605,0.097452
3,1.6404,1.485167,0.528873,0.160437,0.160313,0.13953
4,1.3832,1.281728,0.593034,0.223821,0.207782,0.185318
5,1.189,1.129622,0.659945,0.259954,0.258891,0.240187
6,1.0172,1.013289,0.691109,0.264813,0.288376,0.265713
7,0.881,0.933952,0.702108,0.28432,0.294518,0.271468
8,0.788,0.87986,0.721357,0.340896,0.330525,0.308038
9,0.7065,0.844064,0.72319,0.332698,0.333653,0.31395
10,0.6294,0.820824,0.738772,0.359444,0.363415,0.341492


[I 2025-03-15 14:59:06,077] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.00012991083381890363, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3468,2.201475,0.176902,0.003538,0.02,0.006012
2,2.1244,2.004223,0.386801,0.060621,0.082081,0.064312
3,1.946,1.81991,0.439047,0.077669,0.105392,0.079687
4,1.7639,1.655985,0.491292,0.118141,0.137178,0.11044
5,1.6151,1.513076,0.535289,0.190479,0.173359,0.155666
6,1.4675,1.397444,0.571036,0.23843,0.195605,0.179421
7,1.3468,1.301489,0.615032,0.247603,0.226896,0.214189
8,1.2513,1.220818,0.659945,0.262886,0.264101,0.248227
9,1.1578,1.15023,0.664528,0.263999,0.269793,0.252103
10,1.0747,1.093551,0.691109,0.285424,0.289981,0.267063


[I 2025-03-15 14:59:56,915] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 5.945016510735224e-05, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4027,2.309586,0.177819,0.013545,0.020274,0.006555
2,2.2697,2.193073,0.178735,0.023545,0.020548,0.007089
3,2.1689,2.088624,0.341888,0.068436,0.067685,0.059023
4,2.0692,1.993056,0.406049,0.054288,0.086837,0.064371
5,1.9845,1.902678,0.425298,0.087118,0.0972,0.075095
6,1.8908,1.819814,0.453712,0.1049,0.113884,0.092054
7,1.8114,1.74859,0.470211,0.102621,0.123013,0.100457
8,1.7463,1.68058,0.493126,0.100981,0.136901,0.109767
9,1.6766,1.618527,0.505041,0.120765,0.143442,0.116148
10,1.6171,1.566772,0.519707,0.179309,0.15558,0.134637


[I 2025-03-15 15:00:54,446] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 7.27361096702708e-05, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3865,2.286181,0.176902,0.003538,0.02,0.006012
2,2.2398,2.157298,0.180568,0.043551,0.02106,0.008081
3,2.1279,2.042118,0.377635,0.059867,0.078444,0.062276
4,2.0159,1.933366,0.414299,0.071396,0.090509,0.066806
5,1.9184,1.830871,0.442713,0.103792,0.106779,0.081774


[I 2025-03-15 15:01:19,799] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.00037773988301362265, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2015,1.919576,0.4033,0.07429,0.087447,0.065358
2,1.7321,1.509305,0.525206,0.161807,0.160611,0.140585
3,1.3596,1.208508,0.640697,0.227563,0.238963,0.218203
4,1.0813,1.033295,0.686526,0.267526,0.287314,0.265539
5,0.8846,0.919691,0.716774,0.324485,0.324443,0.302739
6,0.7341,0.835215,0.72044,0.34396,0.34118,0.324203
7,0.6219,0.801648,0.727773,0.361268,0.353974,0.338887
8,0.5481,0.768894,0.749771,0.377848,0.391309,0.370807
9,0.4814,0.738066,0.754354,0.45623,0.40566,0.4029
10,0.4193,0.719961,0.759853,0.463213,0.426965,0.424769


[I 2025-03-15 15:04:21,321] Trial 27 finished with value: 0.582470702602652 and parameters: {'learning_rate': 0.00037773988301362265, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}. Best is trial 20 with value: 0.6294873476522568.


Trial 28 with params: {'learning_rate': 0.0004811283747798316, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1582,1.839428,0.430797,0.060537,0.104986,0.073987
2,1.632,1.409088,0.567369,0.195582,0.197484,0.175816
3,1.235,1.111772,0.659028,0.236423,0.264373,0.242422
4,0.9579,0.944991,0.700275,0.289178,0.297635,0.273314
5,0.7654,0.856692,0.712191,0.337835,0.331078,0.307004
6,0.6302,0.792356,0.738772,0.413014,0.381887,0.369676
7,0.5274,0.755292,0.746104,0.405232,0.391939,0.382172
8,0.4598,0.730963,0.75802,0.433994,0.417328,0.410832
9,0.3966,0.709576,0.765353,0.476916,0.444204,0.442372
10,0.3417,0.689811,0.773602,0.513569,0.475085,0.482068


[I 2025-03-15 15:06:58,612] Trial 28 finished with value: 0.6136352532720691 and parameters: {'learning_rate': 0.0004811283747798316, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 20 with value: 0.6294873476522568.


Trial 29 with params: {'learning_rate': 0.0003574785066376213, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2089,1.955327,0.387718,0.061434,0.081311,0.063889
2,1.7801,1.573041,0.505041,0.137813,0.148034,0.125431
3,1.4307,1.280717,0.575619,0.217459,0.194735,0.177747
4,1.1549,1.095101,0.665445,0.254524,0.270203,0.251006
5,0.9593,0.968116,0.698442,0.279773,0.296856,0.27206
6,0.7951,0.863995,0.716774,0.313398,0.321879,0.300526
7,0.6726,0.82454,0.71494,0.3429,0.326612,0.313352
8,0.5926,0.789045,0.751604,0.381785,0.386977,0.368287
9,0.5249,0.761002,0.743355,0.409488,0.400422,0.388129
10,0.4592,0.746376,0.752521,0.450909,0.411853,0.407101


[I 2025-03-15 15:09:31,479] Trial 29 finished with value: 0.616637828866353 and parameters: {'learning_rate': 0.0003574785066376213, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 20 with value: 0.6294873476522568.


Trial 30 with params: {'learning_rate': 0.000495742016232896, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1757,1.907694,0.351971,0.082298,0.068161,0.049588
2,1.7312,1.54545,0.48396,0.153409,0.138574,0.118088
3,1.4005,1.274576,0.572869,0.174506,0.199053,0.17022
4,1.13,1.112073,0.643446,0.251127,0.260151,0.23319
5,0.9519,0.992843,0.676444,0.264221,0.285219,0.256607
6,0.8063,0.912143,0.708524,0.299377,0.317229,0.289412
7,0.6881,0.867322,0.72044,0.322779,0.338258,0.3125
8,0.597,0.820669,0.736022,0.398692,0.37555,0.361871
9,0.5219,0.799606,0.740605,0.387691,0.393879,0.377626
10,0.4558,0.780482,0.738772,0.426309,0.394684,0.387242


[I 2025-03-15 15:10:29,259] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.0002391890808390922, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.265,2.05933,0.277727,0.073152,0.04864,0.039475
2,1.9279,1.746854,0.455545,0.101974,0.123063,0.097628
3,1.6413,1.485061,0.525206,0.194521,0.156092,0.135101
4,1.3898,1.284471,0.604033,0.245894,0.215469,0.195235
5,1.1972,1.133675,0.672777,0.259603,0.269763,0.249835
6,1.0266,1.019129,0.691109,0.25975,0.290295,0.264877
7,0.8937,0.939716,0.702108,0.276167,0.295479,0.272935
8,0.8023,0.884816,0.716774,0.337012,0.326242,0.304704
9,0.7209,0.848401,0.719523,0.313881,0.326383,0.306577
10,0.6448,0.821634,0.735105,0.330932,0.352375,0.330558


[I 2025-03-15 15:12:13,376] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.00048589250169561336, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1634,1.855191,0.419798,0.063847,0.097807,0.073259
2,1.6561,1.434741,0.554537,0.181122,0.188179,0.167339
3,1.2726,1.149577,0.64253,0.231441,0.246418,0.227026
4,0.9959,0.979448,0.686526,0.252101,0.286872,0.259573
5,0.7988,0.881508,0.707608,0.28774,0.317747,0.2912
6,0.6507,0.811477,0.732356,0.399406,0.367319,0.352754
7,0.5487,0.773827,0.746104,0.395374,0.390797,0.375991
8,0.4804,0.756645,0.753437,0.431694,0.420218,0.409411
9,0.4144,0.722642,0.754354,0.467032,0.443916,0.43632
10,0.3567,0.712446,0.766269,0.471444,0.461243,0.456663


[I 2025-03-15 15:14:52,861] Trial 32 finished with value: 0.6140158009879676 and parameters: {'learning_rate': 0.00048589250169561336, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 20 with value: 0.6294873476522568.


Trial 33 with params: {'learning_rate': 0.00025292455107428236, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2634,2.064291,0.193401,0.037224,0.024263,0.012801
2,1.9456,1.779341,0.439047,0.080794,0.106888,0.076997
3,1.6753,1.524691,0.515124,0.179187,0.15713,0.135383
4,1.4192,1.315652,0.586618,0.225173,0.206215,0.181498
5,1.2211,1.159945,0.64528,0.23177,0.245056,0.221553
6,1.0462,1.041318,0.687443,0.263557,0.287299,0.263381
7,0.9078,0.957401,0.697525,0.278207,0.295103,0.26988
8,0.8126,0.896533,0.71494,0.283238,0.317622,0.289687
9,0.7298,0.863033,0.722273,0.350042,0.338733,0.318488
10,0.6517,0.8328,0.736022,0.379407,0.362388,0.345518


[I 2025-03-15 15:16:38,933] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.00024599196666744047, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2691,2.06389,0.245646,0.056733,0.038844,0.031769
2,1.9366,1.758566,0.450046,0.081218,0.115771,0.087482
3,1.6532,1.49896,0.526123,0.156592,0.158442,0.137096
4,1.3981,1.295734,0.590284,0.202403,0.204067,0.178824
5,1.2055,1.143494,0.655362,0.259221,0.256322,0.237824
6,1.0341,1.026767,0.689276,0.266544,0.288587,0.266374
7,0.898,0.945801,0.702108,0.275525,0.294479,0.271731
8,0.8042,0.88965,0.721357,0.303043,0.324984,0.300132
9,0.7217,0.852781,0.72044,0.308083,0.326593,0.302814
10,0.6437,0.827243,0.738772,0.33938,0.362577,0.340222


[I 2025-03-15 15:18:22,765] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.0003375040445074918, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.236,1.999726,0.35747,0.067291,0.07226,0.058504
2,1.833,1.628258,0.47846,0.115849,0.133283,0.104348
3,1.4954,1.338022,0.565536,0.219255,0.187629,0.165999
4,1.2143,1.140303,0.644363,0.263529,0.256596,0.239215
5,1.0137,1.000111,0.689276,0.284363,0.289563,0.26556
6,0.8459,0.895659,0.708524,0.301535,0.310118,0.288335
7,0.7153,0.841564,0.713107,0.325829,0.318577,0.30029
8,0.6292,0.805309,0.750687,0.38348,0.379704,0.360813
9,0.5575,0.778364,0.741522,0.411405,0.388386,0.379581
10,0.4893,0.764173,0.745188,0.449363,0.403306,0.394372


[I 2025-03-15 15:19:21,591] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.00013004523237627598, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.343,2.206457,0.176902,0.003538,0.02,0.006012
2,2.1374,2.031454,0.346471,0.06502,0.069839,0.055058
3,1.9809,1.871273,0.424381,0.069282,0.096368,0.071876
4,1.8169,1.713564,0.467461,0.077353,0.12333,0.092049
5,1.676,1.576768,0.495875,0.123451,0.144192,0.119692
6,1.5307,1.458258,0.550871,0.199402,0.181774,0.165211
7,1.4081,1.357096,0.571036,0.240247,0.20005,0.185059
8,1.3089,1.272723,0.619615,0.240087,0.232754,0.214697
9,1.2145,1.199609,0.659945,0.262656,0.263618,0.242465
10,1.1272,1.139411,0.676444,0.276522,0.278479,0.257773


[I 2025-03-15 15:21:04,252] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.0004100230031354973, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1871,1.906527,0.4033,0.075577,0.087031,0.065124
2,1.716,1.496705,0.540788,0.17612,0.178961,0.161194
3,1.3452,1.205435,0.612282,0.220383,0.217041,0.198785
4,1.0691,1.025558,0.686526,0.260177,0.285105,0.263289
5,0.8758,0.915346,0.707608,0.280391,0.311792,0.283709
6,0.7199,0.82527,0.722273,0.337256,0.336827,0.319984
7,0.6066,0.800136,0.72594,0.372492,0.35631,0.341318
8,0.5324,0.754335,0.75527,0.404954,0.398124,0.383931
9,0.465,0.731384,0.747938,0.417465,0.412077,0.40065
10,0.4034,0.717924,0.759853,0.481821,0.433113,0.43477


[I 2025-03-15 15:23:48,787] Trial 37 finished with value: 0.614622963082204 and parameters: {'learning_rate': 0.0004100230031354973, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}. Best is trial 20 with value: 0.6294873476522568.


Trial 38 with params: {'learning_rate': 0.00017511093165679915, 'weight_decay': 0.01, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2936,2.122743,0.176902,0.003538,0.02,0.006012
2,2.0281,1.887286,0.417049,0.069556,0.092274,0.067571
3,1.8065,1.662534,0.47571,0.141642,0.131922,0.108269
4,1.588,1.475497,0.546288,0.190618,0.177814,0.159821
5,1.4196,1.330065,0.59945,0.227214,0.209995,0.191845


[I 2025-03-15 15:24:19,920] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.0003339258256805627, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.233,2.006461,0.318057,0.049174,0.058382,0.041795
2,1.8481,1.652338,0.468378,0.143888,0.129072,0.103184
3,1.5259,1.371607,0.566453,0.198594,0.188365,0.163136
4,1.2451,1.16353,0.63978,0.25023,0.249512,0.22946
5,1.0366,1.02199,0.692942,0.278879,0.2874,0.263745
6,0.8659,0.912595,0.706691,0.29463,0.308355,0.285469
7,0.7337,0.852972,0.71769,0.306612,0.322769,0.300807
8,0.6478,0.821198,0.743355,0.374292,0.370735,0.348446
9,0.5763,0.787645,0.747021,0.442449,0.389476,0.382596
10,0.5075,0.775953,0.747021,0.424581,0.407028,0.391353


[I 2025-03-15 15:25:12,112] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.0004385331610554008, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1775,1.871985,0.417965,0.066174,0.095759,0.072947
2,1.668,1.438529,0.550871,0.194671,0.183635,0.163847
3,1.2776,1.143253,0.648029,0.256289,0.251903,0.234548
4,0.9992,0.971853,0.696609,0.269556,0.294972,0.271959
5,0.8044,0.872184,0.71769,0.327006,0.328359,0.307094
6,0.6655,0.801763,0.731439,0.386129,0.363543,0.352921
7,0.5586,0.778043,0.741522,0.419922,0.389391,0.380513
8,0.4893,0.747417,0.758937,0.424519,0.415043,0.40208
9,0.4262,0.726341,0.75802,0.467181,0.430744,0.427158
10,0.3661,0.701032,0.759853,0.486353,0.434041,0.437083


[I 2025-03-15 15:27:48,178] Trial 40 finished with value: 0.6046267697590215 and parameters: {'learning_rate': 0.0004385331610554008, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 20 with value: 0.6294873476522568.


Trial 41 with params: {'learning_rate': 0.00021370315190289845, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2705,2.08719,0.180568,0.038554,0.020929,0.007825
2,1.981,1.829105,0.426214,0.06461,0.098703,0.073489
3,1.7358,1.585401,0.494042,0.147455,0.144573,0.122912
4,1.4964,1.387401,0.566453,0.233312,0.194909,0.175747
5,1.3123,1.236307,0.620532,0.236118,0.227115,0.206051
6,1.1443,1.115849,0.668194,0.25663,0.269102,0.246839
7,1.0103,1.028134,0.686526,0.261488,0.284105,0.261235
8,0.9114,0.956376,0.705775,0.28236,0.301653,0.274262
9,0.8234,0.911962,0.714024,0.310407,0.318774,0.295619
10,0.7416,0.872066,0.72594,0.339632,0.33396,0.311798


[I 2025-03-15 15:30:23,986] Trial 41 finished with value: 0.4614717654188404 and parameters: {'learning_rate': 0.00021370315190289845, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 20 with value: 0.6294873476522568.


Trial 42 with params: {'learning_rate': 0.00018989107351359612, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2956,2.116091,0.176902,0.003538,0.02,0.006012
2,2.0168,1.869662,0.423465,0.068031,0.095796,0.071856
3,1.7824,1.634584,0.48121,0.141559,0.136674,0.112967
4,1.5538,1.441209,0.553621,0.178594,0.179579,0.15747
5,1.3773,1.292598,0.608616,0.22447,0.21735,0.196706
6,1.2135,1.171465,0.663611,0.27543,0.262558,0.246507
7,1.0809,1.07929,0.67461,0.260468,0.274684,0.253222
8,0.9816,1.007445,0.698442,0.285056,0.297488,0.274191
9,0.8902,0.954723,0.705775,0.293226,0.309011,0.287955
10,0.8082,0.909315,0.715857,0.326708,0.32259,0.300801


[I 2025-03-15 15:33:03,475] Trial 42 finished with value: 0.43413979138227565 and parameters: {'learning_rate': 0.00018989107351359612, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 4.5}. Best is trial 20 with value: 0.6294873476522568.


Trial 43 with params: {'learning_rate': 3.777515239636729e-05, 'weight_decay': 0.007, 'adam_beta1': 0.99, 'warmup_steps': 0, 'lambda_param': 0.8, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4185,2.35269,0.19615,0.020847,0.026013,0.013345
2,2.3306,2.282899,0.176902,0.003538,0.02,0.006012
3,2.2711,2.223492,0.176902,0.003538,0.02,0.006012
4,2.2142,2.171516,0.176902,0.003538,0.02,0.006012
5,2.1702,2.122084,0.180568,0.023551,0.021096,0.008109


[I 2025-03-15 15:33:31,505] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.00027505500686707344, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2541,2.034586,0.31439,0.072694,0.059835,0.048772
2,1.8872,1.691828,0.466544,0.123237,0.12916,0.101924
3,1.574,1.415198,0.543538,0.198053,0.170391,0.1501
4,1.3088,1.212181,0.636114,0.245126,0.243704,0.225913
5,1.1091,1.064486,0.68286,0.268705,0.286263,0.26283
6,0.9375,0.951684,0.704858,0.311953,0.301694,0.280997
7,0.8052,0.884142,0.708524,0.2992,0.304414,0.282542
8,0.7169,0.838504,0.730522,0.325698,0.343555,0.320239
9,0.6414,0.809253,0.731439,0.347187,0.352324,0.337599
10,0.5689,0.793392,0.744271,0.410154,0.380904,0.369901


[I 2025-03-15 15:34:24,629] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 0.0004565536432351292, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1699,1.867165,0.420715,0.065944,0.097592,0.074328
2,1.6684,1.444768,0.557287,0.176562,0.19092,0.17083
3,1.2847,1.155585,0.644363,0.238682,0.247617,0.231305
4,1.0101,0.980705,0.696609,0.269716,0.291111,0.26811
5,0.8166,0.882692,0.713107,0.305578,0.324748,0.298472
6,0.6667,0.808517,0.736939,0.415871,0.370628,0.360101
7,0.56,0.775568,0.738772,0.398092,0.385829,0.373701
8,0.4904,0.744774,0.75527,0.435448,0.409328,0.400466
9,0.4231,0.717157,0.758937,0.471436,0.437779,0.433814
10,0.3667,0.704129,0.769019,0.491621,0.459093,0.461709


[I 2025-03-15 15:37:10,283] Trial 45 finished with value: 0.614362197646446 and parameters: {'learning_rate': 0.0004565536432351292, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 20 with value: 0.6294873476522568.


Trial 46 with params: {'learning_rate': 2.1109501932833057e-06, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.9, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4731,2.459247,0.008249,0.003659,0.021882,0.002109
2,2.4644,2.45091,0.012832,0.004242,0.022049,0.002572
3,2.4582,2.443428,0.017415,0.003936,0.022216,0.002821
4,2.4523,2.436514,0.029331,0.004949,0.023914,0.004388
5,2.4426,2.429923,0.049496,0.034086,0.027371,0.00817
6,2.436,2.423765,0.065995,0.01062,0.029282,0.008697
7,2.4317,2.417798,0.088909,0.008977,0.031171,0.008609
8,2.4251,2.412005,0.110907,0.009399,0.033478,0.008831
9,2.4203,2.40637,0.129239,0.008834,0.03555,0.008865
10,2.4154,2.40093,0.148488,0.009946,0.038067,0.009585


[I 2025-03-15 15:38:57,142] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.00016690349007326933, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3101,2.1421,0.176902,0.003538,0.02,0.006012
2,2.0529,1.918375,0.408799,0.071064,0.088335,0.063921
3,1.8418,1.701337,0.469294,0.098033,0.125154,0.093805
4,1.6298,1.517638,0.531622,0.183059,0.165629,0.146771
5,1.4628,1.370743,0.581118,0.225619,0.200489,0.179636


[I 2025-03-15 15:39:22,540] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 2.1576923653516515e-06, 'weight_decay': 0.004, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4737,2.459658,0.008249,0.003659,0.021882,0.002109
2,2.4647,2.451114,0.011916,0.003976,0.021945,0.002422
3,2.4583,2.443419,0.017415,0.0039,0.022216,0.002813
4,2.4522,2.436375,0.029331,0.024837,0.024049,0.00475
5,2.4424,2.429635,0.050412,0.034716,0.027475,0.008298
6,2.4356,2.42333,0.067828,0.010822,0.02949,0.008795
7,2.4312,2.417241,0.090742,0.008914,0.031379,0.008602
8,2.4244,2.411304,0.111824,0.009459,0.033581,0.008781
9,2.4196,2.40553,0.130156,0.008848,0.035654,0.008719
10,2.4146,2.399975,0.151237,0.009825,0.018482,0.009423


[I 2025-03-15 15:41:08,500] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.00031126881960135896, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2469,2.020109,0.329056,0.070691,0.063332,0.052605
2,1.865,1.664914,0.471127,0.12382,0.127455,0.098047
3,1.5409,1.382903,0.555454,0.180588,0.180405,0.157432
4,1.2657,1.181349,0.637947,0.244477,0.246698,0.228481
5,1.0658,1.036951,0.68561,0.268126,0.28526,0.259928
6,0.8959,0.927195,0.707608,0.312884,0.307803,0.287784
7,0.7614,0.863462,0.708524,0.32683,0.30958,0.292069
8,0.6724,0.824738,0.744271,0.398202,0.370281,0.350913
9,0.5977,0.796219,0.739688,0.411693,0.378125,0.369428
10,0.5266,0.778105,0.749771,0.44944,0.394926,0.386429


[I 2025-03-15 15:42:02,417] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 2.356716916016174e-06, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4726,2.45818,0.010082,0.004169,0.022089,0.0024
2,2.4629,2.449006,0.013749,0.004321,0.022153,0.002718
3,2.456,2.440757,0.021998,0.004251,0.022734,0.003339
4,2.4494,2.433104,0.036664,0.032887,0.02575,0.006709
5,2.4391,2.425756,0.060495,0.013877,0.029352,0.009633


[I 2025-03-15 15:42:28,552] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.00047011879549512886, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1651,1.856589,0.422548,0.064072,0.099312,0.074205
2,1.6556,1.432337,0.560037,0.176664,0.195604,0.17405
3,1.2688,1.142843,0.648029,0.234884,0.250922,0.233238
4,0.9939,0.970445,0.698442,0.267636,0.292758,0.267552
5,0.8006,0.875214,0.710357,0.30586,0.324284,0.29785
6,0.6526,0.805339,0.736939,0.402821,0.373567,0.360351
7,0.5486,0.771551,0.743355,0.404028,0.389743,0.377952
8,0.4811,0.743918,0.754354,0.415796,0.408416,0.395986
9,0.4145,0.715415,0.75802,0.47103,0.441066,0.436705
10,0.3576,0.69992,0.769936,0.504526,0.467606,0.474213


[I 2025-03-15 15:45:28,903] Trial 51 finished with value: 0.613286271525658 and parameters: {'learning_rate': 0.00047011879549512886, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 20 with value: 0.6294873476522568.


Trial 52 with params: {'learning_rate': 0.00046394057610124693, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1967,1.906708,0.395967,0.075928,0.085866,0.064499
2,1.6984,1.473129,0.537122,0.168929,0.175749,0.156523
3,1.3077,1.164904,0.640697,0.23322,0.241597,0.223325
4,1.0175,0.986699,0.681943,0.253274,0.283158,0.258737
5,0.8221,0.880598,0.713107,0.298727,0.320447,0.296374
6,0.6712,0.810475,0.730522,0.420348,0.366082,0.358567
7,0.5621,0.78086,0.746104,0.389634,0.386285,0.373631
8,0.4911,0.756375,0.753437,0.420553,0.406935,0.392873
9,0.423,0.740054,0.753437,0.482516,0.450377,0.44606
10,0.3626,0.713023,0.766269,0.51595,0.459226,0.465308


[I 2025-03-15 15:48:05,252] Trial 52 finished with value: 0.6341848954231764 and parameters: {'learning_rate': 0.00046394057610124693, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 52 with value: 0.6341848954231764.


Trial 53 with params: {'learning_rate': 1.2327552056258486e-06, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4754,2.463541,0.008249,0.004114,0.022233,0.002411
2,2.4703,2.458344,0.010082,0.004332,0.022089,0.002421
3,2.467,2.453621,0.010082,0.003532,0.021738,0.002119
4,2.4634,2.449282,0.014665,0.004263,0.022256,0.002778
5,2.4558,2.44518,0.018332,0.004395,0.022671,0.00324


[I 2025-03-15 15:48:30,443] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 0.00031862699468485545, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2474,2.010891,0.348304,0.06816,0.069516,0.055998
2,1.8474,1.640873,0.473877,0.123403,0.12913,0.098973
3,1.5104,1.350851,0.55637,0.217963,0.179956,0.161108
4,1.2325,1.152874,0.648029,0.268385,0.255161,0.240235
5,1.0335,1.014703,0.692942,0.2737,0.291912,0.267536
6,0.8642,0.904154,0.712191,0.301801,0.311936,0.291371
7,0.7338,0.848673,0.711274,0.329949,0.316996,0.299098
8,0.6489,0.811279,0.748854,0.389951,0.378503,0.362
9,0.5771,0.783838,0.743355,0.414469,0.387865,0.381502
10,0.5075,0.764023,0.749771,0.44142,0.397608,0.392078


[I 2025-03-15 15:51:05,901] Trial 54 finished with value: 0.5809509981812864 and parameters: {'learning_rate': 0.00031862699468485545, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 5.5}. Best is trial 52 with value: 0.6341848954231764.


Trial 55 with params: {'learning_rate': 0.0002211776590290286, 'weight_decay': 0.001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2907,2.099802,0.176902,0.003538,0.02,0.006012
2,1.9941,1.841151,0.429881,0.063402,0.099993,0.073748
3,1.7462,1.596913,0.490376,0.154116,0.144089,0.118867
4,1.5032,1.393042,0.560037,0.202989,0.18726,0.163633
5,1.3123,1.234427,0.629698,0.230092,0.232378,0.211885
6,1.138,1.111088,0.670944,0.259947,0.270097,0.248181
7,1.0012,1.023552,0.688359,0.293673,0.287888,0.26719
8,0.9022,0.953037,0.706691,0.288321,0.306546,0.283058
9,0.8151,0.91098,0.711274,0.309652,0.319866,0.295177
10,0.7331,0.871638,0.725023,0.342597,0.338031,0.319751


[I 2025-03-15 15:52:01,162] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 0.00020759116475665268, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2996,2.106311,0.178735,0.023545,0.020476,0.006952
2,1.9942,1.829888,0.432631,0.081354,0.103602,0.076969
3,1.735,1.580592,0.503208,0.157422,0.147772,0.127263
4,1.4943,1.382203,0.56187,0.195469,0.18439,0.163306
5,1.31,1.229351,0.63428,0.22386,0.232771,0.213188
6,1.1391,1.107232,0.676444,0.268652,0.275086,0.255975
7,1.0055,1.019073,0.694775,0.265669,0.28876,0.266056
8,0.9092,0.953289,0.710357,0.30346,0.309704,0.286746
9,0.821,0.908501,0.708524,0.308766,0.311815,0.290491
10,0.74,0.868586,0.722273,0.347752,0.335567,0.316906


[I 2025-03-15 15:54:50,757] Trial 56 finished with value: 0.44558377947987 and parameters: {'learning_rate': 0.00020759116475665268, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 52 with value: 0.6341848954231764.


Trial 57 with params: {'learning_rate': 0.0004883002811669377, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1726,1.854656,0.424381,0.064722,0.100016,0.07499
2,1.6408,1.41292,0.562786,0.197475,0.200289,0.177581
3,1.2372,1.106691,0.659945,0.262975,0.265893,0.247168
4,0.9526,0.935316,0.695692,0.274514,0.29276,0.267749
5,0.7555,0.849225,0.714024,0.34156,0.329349,0.305537
6,0.6236,0.782513,0.744271,0.424787,0.39114,0.383643
7,0.5195,0.759364,0.741522,0.380158,0.387268,0.372013
8,0.4534,0.730024,0.761687,0.449291,0.430383,0.420984
9,0.389,0.713078,0.76352,0.483829,0.450017,0.449232
10,0.3338,0.693288,0.762603,0.485518,0.449452,0.452243


[I 2025-03-15 15:57:37,254] Trial 57 finished with value: 0.6318757340004337 and parameters: {'learning_rate': 0.0004883002811669377, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 52 with value: 0.6341848954231764.


Trial 58 with params: {'learning_rate': 0.0004685055056855652, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1735,1.8512,0.428048,0.063945,0.10202,0.07568
2,1.6371,1.403444,0.55912,0.200172,0.192342,0.172897
3,1.2317,1.099637,0.666361,0.26814,0.272504,0.250332
4,0.9484,0.931025,0.697525,0.284929,0.297654,0.275022
5,0.7564,0.85086,0.719523,0.348787,0.335068,0.3093
6,0.627,0.788189,0.734189,0.414158,0.378695,0.370715
7,0.5228,0.759042,0.737855,0.382187,0.381276,0.36651
8,0.4565,0.732605,0.759853,0.450021,0.431692,0.41996
9,0.3975,0.716191,0.764436,0.456639,0.435629,0.431502
10,0.3415,0.681467,0.764436,0.505728,0.445914,0.453548


[I 2025-03-15 16:00:18,337] Trial 58 finished with value: 0.6151483561113101 and parameters: {'learning_rate': 0.0004685055056855652, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 52 with value: 0.6341848954231764.


Trial 59 with params: {'learning_rate': 0.00024386401983372643, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.278,2.064167,0.275894,0.073106,0.048794,0.040709
2,1.9255,1.735531,0.463795,0.127788,0.127904,0.102373
3,1.6262,1.464084,0.535289,0.19838,0.165968,0.149563
4,1.3692,1.262064,0.625115,0.249418,0.229032,0.209777
5,1.1737,1.110541,0.675527,0.258087,0.272462,0.251941
6,1.0043,0.999433,0.694775,0.282918,0.294478,0.273294
7,0.8719,0.923552,0.702108,0.298875,0.300409,0.280148
8,0.7815,0.87386,0.72319,0.35071,0.334991,0.31451
9,0.7031,0.836666,0.72044,0.320509,0.329683,0.310094
10,0.6276,0.808448,0.741522,0.373112,0.364335,0.347438


[I 2025-03-15 16:01:13,955] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.0003853127827969721, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2091,1.930313,0.396884,0.075925,0.085512,0.063887
2,1.7385,1.51437,0.525206,0.183462,0.166061,0.14879
3,1.3628,1.206808,0.640697,0.227444,0.241373,0.22122
4,1.0797,1.03383,0.68011,0.274586,0.284346,0.262538
5,0.8818,0.921006,0.709441,0.316482,0.31724,0.296343
6,0.7282,0.830354,0.718607,0.376766,0.339757,0.325335
7,0.6139,0.7997,0.731439,0.376815,0.359792,0.344908
8,0.5406,0.763667,0.753437,0.40322,0.398183,0.381425
9,0.4738,0.742044,0.749771,0.432248,0.413187,0.406233
10,0.4139,0.720785,0.756187,0.484378,0.430247,0.433888


[I 2025-03-15 16:03:15,104] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.0003439050443277329, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2352,1.981465,0.370302,0.064225,0.076756,0.061155
2,1.8045,1.588862,0.493126,0.129584,0.142355,0.116429
3,1.4479,1.287763,0.590284,0.221102,0.208715,0.191299
4,1.1667,1.099639,0.667278,0.257534,0.27339,0.254114
5,0.9674,0.975927,0.697525,0.286265,0.300772,0.27657
6,0.8049,0.868776,0.715857,0.329637,0.323907,0.303251
7,0.6807,0.823061,0.716774,0.353263,0.336626,0.323134
8,0.6009,0.787173,0.747938,0.375494,0.380109,0.362208
9,0.5311,0.76465,0.747021,0.409471,0.394183,0.383866
10,0.4663,0.749562,0.751604,0.44529,0.40271,0.39721


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-15 16:06:38,759] Trial 61 finished with value: 0.5978521818185435 and parameters: {'learning_rate': 0.0003439050443277329, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 52 with value: 0.6341848954231764.


Trial 62 with params: {'learning_rate': 0.00010037683401527949, 'weight_decay': 0.0, 'adam_beta1': 0.97, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3713,2.255709,0.176902,0.003538,0.02,0.006012
2,2.199,2.11024,0.178735,0.023545,0.020476,0.006952
3,2.0747,1.98578,0.384968,0.060354,0.081148,0.063165
4,1.949,1.862002,0.431714,0.067298,0.099563,0.074849
5,1.8381,1.747278,0.456462,0.078946,0.118519,0.088499


[I 2025-03-15 16:07:08,329] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 0.0003066027287823963, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2468,2.016312,0.346471,0.068987,0.06907,0.056775
2,1.8584,1.654972,0.472044,0.122503,0.1281,0.097961
3,1.5292,1.370617,0.553621,0.195674,0.177078,0.156106
4,1.2557,1.173112,0.63703,0.262887,0.246173,0.23009
5,1.0584,1.031587,0.692026,0.272763,0.289812,0.265564
6,0.8885,0.920408,0.709441,0.313757,0.308352,0.288647
7,0.7567,0.860582,0.709441,0.326502,0.31237,0.294713
8,0.67,0.820853,0.735105,0.366885,0.354898,0.334016
9,0.5971,0.790181,0.737855,0.365629,0.368613,0.353324
10,0.5257,0.773641,0.751604,0.46211,0.395345,0.39025


[I 2025-03-15 16:08:06,101] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 0.0003021574030153618, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2561,2.031007,0.300642,0.071703,0.054093,0.042437
2,1.8806,1.682675,0.467461,0.121728,0.129829,0.102377
3,1.5616,1.403411,0.541705,0.177362,0.171627,0.150187
4,1.2881,1.198638,0.628781,0.242118,0.23517,0.214453
5,1.0868,1.052824,0.68286,0.265771,0.282778,0.257258
6,0.9159,0.940559,0.703941,0.30743,0.30377,0.28472
7,0.7793,0.874045,0.706691,0.302445,0.304628,0.282008
8,0.6895,0.833702,0.739688,0.377562,0.365658,0.345088
9,0.6129,0.803321,0.741522,0.41108,0.377198,0.368524
10,0.5404,0.78751,0.746104,0.430554,0.389171,0.378358


[I 2025-03-15 16:09:54,788] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.00035835584343043295, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2108,1.939397,0.395967,0.075777,0.085273,0.063852
2,1.7569,1.537018,0.519707,0.161841,0.15959,0.140356
3,1.3923,1.237861,0.633364,0.223672,0.234691,0.214548
4,1.115,1.06086,0.68011,0.26083,0.282796,0.260647
5,0.9184,0.940448,0.711274,0.301648,0.316443,0.294104
6,0.7628,0.84724,0.719523,0.32981,0.33138,0.311538
7,0.6464,0.809931,0.71769,0.360962,0.340888,0.326356
8,0.5705,0.77901,0.745188,0.379756,0.387089,0.368509
9,0.5043,0.754336,0.748854,0.43238,0.399907,0.393234
10,0.4422,0.733425,0.753437,0.4537,0.412721,0.410058


[I 2025-03-15 16:12:40,154] Trial 65 finished with value: 0.600788124130334 and parameters: {'learning_rate': 0.00035835584343043295, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 52 with value: 0.6341848954231764.


Trial 66 with params: {'learning_rate': 2.174477018969353e-05, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.9, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4418,2.391338,0.171402,0.0117,0.020411,0.008672
2,2.3757,2.334175,0.181485,0.011924,0.02137,0.008441
3,2.3295,2.288096,0.179652,0.018554,0.020822,0.007594
4,2.2862,2.246684,0.179652,0.023551,0.020822,0.00761
5,2.2487,2.204117,0.181485,0.023558,0.02137,0.008605
6,2.2093,2.168125,0.199817,0.063631,0.026493,0.017199
7,2.1751,2.132962,0.27406,0.074352,0.047563,0.044091
8,2.1439,2.100139,0.340055,0.070613,0.067335,0.060022
9,2.1147,2.070334,0.362053,0.063735,0.073941,0.062233
10,2.0882,2.042365,0.388634,0.079052,0.081905,0.064961


[I 2025-03-15 16:13:37,159] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.0001282754863777374, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3409,2.196843,0.176902,0.003538,0.02,0.006012
2,2.1218,2.002682,0.386801,0.059936,0.081932,0.063948
3,1.9458,1.821072,0.43813,0.076731,0.104623,0.078558
4,1.7662,1.659499,0.490376,0.117877,0.136409,0.110041
5,1.6197,1.518095,0.535289,0.191193,0.173235,0.155996


[I 2025-03-15 16:14:05,165] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.00029808734586146555, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2464,2.007869,0.367553,0.067049,0.075495,0.061999
2,1.8494,1.642337,0.479377,0.135279,0.136027,0.110772
3,1.5159,1.354553,0.560037,0.214704,0.182983,0.164487
4,1.2446,1.159204,0.660862,0.272127,0.266283,0.251238
5,1.0465,1.021368,0.692942,0.265658,0.290655,0.265509
6,0.8815,0.917725,0.711274,0.299864,0.312239,0.291855
7,0.7555,0.857761,0.710357,0.301081,0.314719,0.294231
8,0.6712,0.822261,0.736022,0.373429,0.358209,0.338278
9,0.6002,0.790746,0.740605,0.390241,0.37236,0.360196
10,0.5297,0.771244,0.749771,0.453809,0.391047,0.385217


[I 2025-03-15 16:15:50,498] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 8.217756913819263e-06, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4635,2.437492,0.025665,0.024872,0.023985,0.004711
2,2.4339,2.409627,0.11549,0.009057,0.033996,0.008832
3,2.4098,2.382557,0.180568,0.014948,0.022388,0.010667
4,2.3869,2.359769,0.187901,0.021445,0.023728,0.011759
5,2.3653,2.340321,0.188818,0.019265,0.023822,0.011932
6,2.3464,2.322715,0.181485,0.012675,0.02137,0.00846
7,2.3316,2.306541,0.182401,0.01449,0.021644,0.008931
8,2.3168,2.291396,0.179652,0.018558,0.020822,0.007599
9,2.3027,2.27712,0.180568,0.019561,0.021096,0.008097
10,2.2907,2.264025,0.179652,0.018558,0.020822,0.007599


[I 2025-03-15 16:17:37,387] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 6.0142611090541214e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3969,2.307804,0.177819,0.023541,0.020274,0.006558
2,2.271,2.200875,0.177819,0.023541,0.020274,0.006558
3,2.1771,2.104767,0.263061,0.072242,0.044536,0.039549
4,2.0852,2.015182,0.396884,0.056724,0.08439,0.064179
5,2.0074,1.930626,0.417049,0.0725,0.09183,0.068494


[I 2025-03-15 16:18:03,362] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.0003340918907489455, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2284,1.967752,0.389551,0.060396,0.083055,0.063793
2,1.7911,1.57235,0.510541,0.140747,0.150697,0.128043
3,1.4327,1.273083,0.615949,0.227215,0.224812,0.207288
4,1.1574,1.091805,0.679193,0.262771,0.279562,0.259928
5,0.9631,0.969889,0.704858,0.300376,0.307377,0.285686
6,0.8047,0.869708,0.716774,0.315307,0.323983,0.302345
7,0.685,0.82288,0.71769,0.338964,0.335247,0.316626
8,0.6052,0.793428,0.746104,0.370614,0.381207,0.361507
9,0.5372,0.76223,0.747021,0.389338,0.387729,0.372792
10,0.472,0.745359,0.748854,0.455841,0.399108,0.393992


[I 2025-03-15 16:19:52,086] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.0003523606388342241, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2298,1.987539,0.366636,0.064746,0.075338,0.060094
2,1.8147,1.608563,0.47846,0.112474,0.133499,0.104292
3,1.4704,1.314303,0.574702,0.22022,0.193173,0.172327
4,1.1871,1.119107,0.650779,0.266615,0.261655,0.24439
5,0.9863,0.982166,0.692942,0.286357,0.293619,0.271124
6,0.8197,0.879256,0.715857,0.309776,0.322827,0.301386
7,0.6912,0.831309,0.71494,0.316877,0.324729,0.306166
8,0.6073,0.795827,0.748854,0.393479,0.379602,0.361438
9,0.5366,0.770454,0.745188,0.42863,0.395404,0.387104
10,0.4705,0.757778,0.745188,0.449947,0.403916,0.395855


[I 2025-03-15 16:22:30,415] Trial 72 finished with value: 0.5896153129552318 and parameters: {'learning_rate': 0.0003523606388342241, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 52 with value: 0.6341848954231764.


Trial 73 with params: {'learning_rate': 3.292397722386191e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4287,2.365949,0.193401,0.017296,0.025192,0.012699
2,2.3448,2.297573,0.177819,0.023541,0.020274,0.006558
3,2.2871,2.239503,0.177819,0.023541,0.020274,0.006558
4,2.2311,2.185321,0.178735,0.023545,0.020548,0.007089
5,2.185,2.133239,0.226398,0.079733,0.034367,0.027248


[I 2025-03-15 16:22:56,140] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.00041831093045197835, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1895,1.90069,0.404216,0.073533,0.08806,0.066566
2,1.7025,1.479269,0.545371,0.169734,0.180913,0.162054
3,1.3221,1.179222,0.636114,0.229324,0.235711,0.216934
4,1.0417,1.005973,0.691109,0.272573,0.293127,0.270068
5,0.8441,0.89249,0.716774,0.329656,0.324451,0.304476
6,0.6951,0.81598,0.724106,0.34972,0.345275,0.330264
7,0.5839,0.792363,0.734189,0.385869,0.372504,0.358952
8,0.5118,0.75154,0.75802,0.409368,0.414185,0.40003
9,0.4465,0.734764,0.751604,0.459566,0.422534,0.418249
10,0.3874,0.711398,0.764436,0.479796,0.439253,0.44199


[I 2025-03-15 16:25:56,336] Trial 74 finished with value: 0.5843257811205536 and parameters: {'learning_rate': 0.00041831093045197835, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 52 with value: 0.6341848954231764.


Trial 75 with params: {'learning_rate': 0.0004740324993790357, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2002,1.927376,0.378552,0.058403,0.079123,0.059895
2,1.7303,1.519622,0.505958,0.146064,0.155597,0.133826
3,1.3628,1.225812,0.6022,0.223362,0.215836,0.186089
4,1.0667,1.042808,0.670944,0.264197,0.28079,0.253391
5,0.8644,0.92412,0.701192,0.295849,0.307627,0.279883
6,0.712,0.843377,0.72594,0.403121,0.35075,0.339201
7,0.5961,0.816331,0.725023,0.377944,0.370147,0.360004
8,0.5185,0.775887,0.744271,0.381733,0.398558,0.377344
9,0.4493,0.759048,0.741522,0.448642,0.422313,0.418221
10,0.3897,0.736863,0.761687,0.482634,0.450914,0.447453


[I 2025-03-15 16:28:35,975] Trial 75 finished with value: 0.6005175043364164 and parameters: {'learning_rate': 0.0004740324993790357, 'weight_decay': 0.002, 'adam_beta1': 0.96, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}. Best is trial 52 with value: 0.6341848954231764.


Trial 76 with params: {'learning_rate': 0.00015001917556377228, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3098,2.14898,0.176902,0.003538,0.02,0.006012
2,2.063,1.93017,0.409716,0.051459,0.088292,0.06293
3,1.8606,1.723329,0.458295,0.102153,0.118077,0.091868
4,1.6606,1.54833,0.51879,0.201412,0.157206,0.13799
5,1.5041,1.406351,0.574702,0.246539,0.194695,0.179475
6,1.3519,1.292668,0.608616,0.241814,0.223008,0.206926
7,1.2275,1.195377,0.653529,0.24092,0.257988,0.240158
8,1.1322,1.119395,0.676444,0.2622,0.281989,0.259512
9,1.0393,1.055907,0.690192,0.280706,0.289161,0.265685
10,0.9564,1.001718,0.698442,0.283964,0.29796,0.274836


[I 2025-03-15 16:30:21,394] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.00041891462370218397, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1757,1.87434,0.423465,0.065211,0.099076,0.074827
2,1.6746,1.446445,0.554537,0.219646,0.185696,0.168986
3,1.2902,1.151284,0.657195,0.250084,0.257988,0.238965
4,1.0154,0.982911,0.689276,0.263149,0.289699,0.264824
5,0.8221,0.881014,0.71769,0.317319,0.33008,0.304329
6,0.6829,0.807987,0.728689,0.368985,0.35809,0.348224
7,0.577,0.779204,0.734189,0.393099,0.375728,0.362174
8,0.5062,0.744078,0.759853,0.401319,0.414079,0.398292
9,0.4417,0.715642,0.759853,0.451572,0.423079,0.41661
10,0.3823,0.700259,0.769019,0.487604,0.443299,0.442795


[I 2025-03-15 16:33:03,098] Trial 77 finished with value: 0.6118265793276343 and parameters: {'learning_rate': 0.00041891462370218397, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 5.0}. Best is trial 52 with value: 0.6341848954231764.


Trial 78 with params: {'learning_rate': 1.3245726232440134e-06, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4748,2.462725,0.007333,0.003602,0.021778,0.002005
2,2.4693,2.457204,0.009166,0.003781,0.021634,0.002018
3,2.4657,2.452202,0.010999,0.003713,0.021842,0.002253
4,2.4619,2.44765,0.014665,0.004291,0.022256,0.002818
5,2.4541,2.443338,0.019248,0.004444,0.022774,0.003354


[I 2025-03-15 16:33:30,689] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.0002475885664259343, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.5, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.252,2.059629,0.215399,0.057202,0.030015,0.019825
2,1.9415,1.776989,0.434464,0.061822,0.104251,0.075379
3,1.6773,1.526794,0.520623,0.18145,0.160388,0.139922
4,1.4245,1.321145,0.582035,0.224563,0.202428,0.178076
5,1.2311,1.16742,0.643446,0.21264,0.241038,0.215045
6,1.0591,1.049191,0.684693,0.26186,0.284809,0.259541
7,0.9218,0.965206,0.697525,0.280666,0.295983,0.271454
8,0.8248,0.899943,0.715857,0.293613,0.315122,0.2899
9,0.7426,0.866628,0.72319,0.331285,0.335363,0.311021
10,0.6622,0.833254,0.735105,0.354239,0.358105,0.338725


[I 2025-03-15 16:34:32,197] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 0.000482322168974171, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1612,1.839872,0.425298,0.061343,0.100387,0.07334
2,1.6278,1.398153,0.55637,0.195288,0.193775,0.17162
3,1.2288,1.106354,0.656279,0.277236,0.259075,0.242219
4,0.9548,0.941348,0.702108,0.290195,0.303141,0.281906
5,0.7639,0.850263,0.71769,0.331523,0.334469,0.312299
6,0.6306,0.788956,0.732356,0.381975,0.37599,0.360361
7,0.5265,0.767219,0.740605,0.432223,0.395485,0.389176
8,0.4608,0.734182,0.756187,0.420929,0.412914,0.404288
9,0.3943,0.705041,0.76077,0.489619,0.436199,0.437093
10,0.3391,0.690017,0.766269,0.53575,0.461446,0.473544


[I 2025-03-15 16:37:12,325] Trial 80 finished with value: 0.6434396489980481 and parameters: {'learning_rate': 0.000482322168974171, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 81 with params: {'learning_rate': 0.00030341891380744656, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2363,1.992979,0.373052,0.064683,0.07729,0.061874
2,1.8279,1.620351,0.506874,0.168646,0.151278,0.131572
3,1.4905,1.329911,0.584785,0.22176,0.20073,0.182643
4,1.219,1.136013,0.670027,0.262731,0.276372,0.257597
5,1.0196,1.000922,0.697525,0.276067,0.299423,0.274854
6,0.8569,0.897906,0.714024,0.298187,0.319336,0.296959
7,0.7341,0.846137,0.710357,0.320632,0.316674,0.297373
8,0.6519,0.816225,0.738772,0.363098,0.368583,0.346285
9,0.5832,0.786489,0.739688,0.38438,0.372425,0.358413
10,0.5147,0.764073,0.748854,0.454066,0.39223,0.386221


[I 2025-03-15 16:38:59,383] Trial 81 pruned. 


Trial 82 with params: {'learning_rate': 0.0004900879987755265, 'weight_decay': 0.007, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1408,1.838497,0.421632,0.063213,0.100505,0.073931
2,1.6449,1.432655,0.542621,0.206721,0.175691,0.155222
3,1.273,1.148571,0.63703,0.235702,0.244914,0.226192
4,0.9977,0.984525,0.675527,0.255653,0.286469,0.257503
5,0.8067,0.877949,0.708524,0.287782,0.312031,0.286806
6,0.6622,0.803322,0.738772,0.379846,0.36359,0.345493
7,0.5557,0.769475,0.757104,0.419864,0.411931,0.395133
8,0.4825,0.747883,0.75527,0.396575,0.417395,0.395628
9,0.4163,0.717354,0.761687,0.446065,0.43999,0.432959
10,0.3585,0.722656,0.766269,0.477282,0.45767,0.453741


[I 2025-03-15 16:40:47,230] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 8.639644976082928e-06, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4624,2.435615,0.029331,0.025137,0.0244,0.005086
2,2.4315,2.406449,0.122823,0.008605,0.034825,0.008681
3,2.4063,2.378402,0.184235,0.016814,0.022973,0.011024
4,2.3827,2.355246,0.187901,0.021613,0.023728,0.011787
5,2.3609,2.335207,0.186984,0.012798,0.023014,0.010683
6,2.3413,2.317022,0.181485,0.012675,0.02137,0.00846
7,2.3259,2.300238,0.180568,0.014996,0.021096,0.008055
8,2.3105,2.284456,0.179652,0.018558,0.020822,0.007599
9,2.2958,2.26951,0.179652,0.018558,0.020822,0.007599
10,2.2834,2.255983,0.179652,0.023554,0.020822,0.007615


[I 2025-03-15 16:41:40,171] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 0.00023656822479860973, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2798,2.077249,0.203483,0.05889,0.027312,0.018344
2,1.955,1.780882,0.44363,0.081551,0.111808,0.084574
3,1.6782,1.522669,0.523373,0.189621,0.159951,0.141225
4,1.4253,1.319507,0.582951,0.201355,0.19888,0.174603
5,1.2343,1.166366,0.648029,0.260793,0.24709,0.228775
6,1.0618,1.047582,0.686526,0.266763,0.284809,0.262693
7,0.9256,0.965253,0.697525,0.274113,0.291397,0.269393
8,0.8301,0.905067,0.71769,0.30182,0.322352,0.297365
9,0.7454,0.866358,0.718607,0.309409,0.324884,0.302473
10,0.6664,0.837476,0.735105,0.328373,0.356565,0.332639


[I 2025-03-15 16:42:33,533] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.0003930226508449561, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2,1.923379,0.3978,0.055562,0.085273,0.06326
2,1.7341,1.515202,0.530706,0.188014,0.173291,0.158306
3,1.363,1.213517,0.629698,0.228124,0.229605,0.214003
4,1.0815,1.03889,0.68286,0.284562,0.284129,0.262978
5,0.8838,0.920658,0.707608,0.30678,0.314635,0.290908
6,0.7277,0.831513,0.72319,0.345481,0.343329,0.326037
7,0.6131,0.804213,0.727773,0.370742,0.355466,0.341293
8,0.5385,0.760204,0.752521,0.398152,0.399787,0.384374
9,0.4734,0.738277,0.752521,0.434396,0.415767,0.40944
10,0.4112,0.717087,0.75802,0.477832,0.427854,0.429857


[I 2025-03-15 16:45:20,201] Trial 85 finished with value: 0.5949393258686231 and parameters: {'learning_rate': 0.0003930226508449561, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 86 with params: {'learning_rate': 4.058724127258315e-06, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4703,2.45223,0.010999,0.003701,0.021842,0.002246
2,2.4543,2.437497,0.024748,0.024715,0.023531,0.004358
3,2.4423,2.424098,0.057745,0.007315,0.027297,0.006646
4,2.4311,2.411335,0.111824,0.0093,0.033581,0.008767
5,2.4165,2.398489,0.156737,0.009821,0.019274,0.009774
6,2.4038,2.386508,0.174152,0.012936,0.021492,0.010217
7,2.3945,2.375893,0.185151,0.017037,0.023247,0.011264
8,2.3842,2.366325,0.186068,0.019959,0.02318,0.01114
9,2.3767,2.357591,0.186068,0.015098,0.023,0.010688
10,2.369,2.349743,0.185151,0.016119,0.022726,0.010514


[I 2025-03-15 16:47:21,121] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.0004727674222465423, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1611,1.845547,0.429881,0.061149,0.10434,0.074419
2,1.6398,1.416634,0.566453,0.195187,0.196259,0.174681
3,1.2449,1.119457,0.654445,0.234709,0.258649,0.238439
4,0.9679,0.9508,0.700275,0.276843,0.295902,0.269671
5,0.7752,0.859662,0.71494,0.32858,0.331918,0.307708
6,0.638,0.79556,0.736022,0.400893,0.38191,0.369876
7,0.5346,0.758101,0.743355,0.400968,0.384933,0.37396
8,0.4662,0.733127,0.757104,0.431943,0.416079,0.408988
9,0.4023,0.709892,0.762603,0.471756,0.442364,0.439821
10,0.3473,0.691321,0.766269,0.496595,0.459362,0.466443


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-15 16:50:37,086] Trial 87 finished with value: 0.6108822045896217 and parameters: {'learning_rate': 0.0004727674222465423, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 3.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 88 with params: {'learning_rate': 0.0004096873003513377, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2161,1.951073,0.383135,0.060163,0.081252,0.061843
2,1.7569,1.540393,0.511457,0.151392,0.154141,0.129401
3,1.3862,1.233472,0.603116,0.226573,0.218753,0.200941
4,1.0969,1.049171,0.669111,0.262536,0.27384,0.252474
5,0.8955,0.925911,0.699358,0.275904,0.302434,0.276485
6,0.7366,0.840209,0.719523,0.377726,0.342531,0.327919
7,0.6186,0.808501,0.72594,0.395123,0.361744,0.353749
8,0.5419,0.77174,0.748854,0.413084,0.397768,0.386027
9,0.472,0.751963,0.754354,0.453666,0.434956,0.431833
10,0.4113,0.734373,0.759853,0.46822,0.446076,0.44136


[I 2025-03-15 16:53:17,892] Trial 88 finished with value: 0.6081234749779998 and parameters: {'learning_rate': 0.0004096873003513377, 'weight_decay': 0.002, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 89 with params: {'learning_rate': 0.0004912331295718279, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.9, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1654,1.863477,0.414299,0.065695,0.095633,0.072657
2,1.6682,1.448784,0.544455,0.180533,0.177974,0.157583
3,1.2899,1.171771,0.627864,0.233,0.231773,0.212286
4,1.0107,0.995473,0.683776,0.26326,0.284817,0.256609
5,0.8165,0.889097,0.709441,0.287773,0.313427,0.288925
6,0.6682,0.819208,0.731439,0.406264,0.364352,0.355231
7,0.5616,0.784343,0.746104,0.390866,0.394835,0.378037
8,0.4862,0.757763,0.747021,0.401067,0.403804,0.387773
9,0.4183,0.727938,0.752521,0.44343,0.43644,0.43131
10,0.362,0.725344,0.76077,0.484962,0.461202,0.456979


[I 2025-03-15 16:55:22,762] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 0.0003984445702332602, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1917,1.917307,0.3978,0.055825,0.085013,0.063308
2,1.7292,1.511801,0.535289,0.172064,0.174238,0.156975
3,1.3624,1.219666,0.595784,0.21547,0.205688,0.187492
4,1.0855,1.039284,0.684693,0.260385,0.28363,0.262366
5,0.8919,0.926767,0.706691,0.276847,0.309423,0.280457
6,0.7343,0.832689,0.72319,0.341314,0.338408,0.322213
7,0.6194,0.804957,0.726856,0.354395,0.355687,0.339805
8,0.5434,0.760317,0.753437,0.39735,0.394942,0.381302
9,0.4755,0.735927,0.749771,0.440588,0.416031,0.409434
10,0.4138,0.722363,0.76077,0.486921,0.438408,0.440559


[I 2025-03-15 16:58:14,256] Trial 90 finished with value: 0.6125792440648197 and parameters: {'learning_rate': 0.0003984445702332602, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 91 with params: {'learning_rate': 0.0004541839293619056, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1405,1.825048,0.430797,0.062248,0.104986,0.074881
2,1.6225,1.400143,0.560953,0.197745,0.196221,0.174042
3,1.2398,1.110647,0.661778,0.237201,0.26742,0.244366
4,0.9713,0.953107,0.700275,0.262183,0.29861,0.27132
5,0.7859,0.861525,0.72044,0.333573,0.333826,0.310629
6,0.6509,0.79553,0.746104,0.388666,0.378792,0.362561
7,0.5463,0.75721,0.747021,0.400476,0.386358,0.372173
8,0.4768,0.734016,0.76352,0.451426,0.425137,0.41675
9,0.4157,0.707046,0.768103,0.441711,0.438176,0.426322
10,0.3578,0.694362,0.768103,0.512951,0.463199,0.46785


[I 2025-03-15 17:00:55,809] Trial 91 finished with value: 0.6106490970611218 and parameters: {'learning_rate': 0.0004541839293619056, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 92 with params: {'learning_rate': 0.0001522132346482949, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3245,2.164983,0.176902,0.003538,0.02,0.006012
2,2.0814,1.953317,0.404216,0.055251,0.08683,0.06433
3,1.8843,1.74929,0.461962,0.078575,0.121721,0.091693
4,1.6837,1.572867,0.510541,0.153551,0.153298,0.133009
5,1.5232,1.42599,0.560953,0.220735,0.188788,0.169936


[I 2025-03-15 17:01:22,463] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 3.7262129220584106e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4217,2.353207,0.191567,0.021862,0.024644,0.012595
2,2.3292,2.275892,0.178735,0.023545,0.020548,0.007089
3,2.263,2.20781,0.177819,0.023541,0.020274,0.006558
4,2.1988,2.145761,0.219065,0.078708,0.032116,0.025198
5,2.1453,2.085881,0.340972,0.069318,0.067721,0.059726
6,2.0853,2.031475,0.389551,0.080005,0.082394,0.06558
7,2.032,1.980226,0.407883,0.054273,0.087416,0.064461
8,1.9851,1.931487,0.420715,0.092476,0.09336,0.07043
9,1.9374,1.884888,0.432631,0.086565,0.100336,0.076695
10,1.8954,1.842573,0.446379,0.104663,0.108513,0.083745


[I 2025-03-15 17:02:17,801] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 0.0003669577048396535, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2067,1.930734,0.3978,0.075357,0.085727,0.063836
2,1.7459,1.524736,0.52429,0.181289,0.163908,0.146521
3,1.3777,1.224636,0.636114,0.223522,0.235944,0.215866
4,1.1,1.049281,0.681943,0.260873,0.284193,0.261488
5,0.9031,0.930969,0.713107,0.299131,0.320589,0.297189
6,0.7495,0.841673,0.71769,0.328602,0.330562,0.311127
7,0.6349,0.806005,0.722273,0.359838,0.345797,0.331954
8,0.5598,0.773852,0.746104,0.372782,0.388941,0.368668
9,0.4928,0.74649,0.753437,0.439894,0.403315,0.400152
10,0.4313,0.726615,0.753437,0.435794,0.412508,0.408217


[I 2025-03-15 17:03:15,857] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 0.0004930404167082212, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1711,1.85131,0.423465,0.064792,0.099801,0.074909
2,1.6358,1.407231,0.560953,0.197176,0.199528,0.177492
3,1.2308,1.102859,0.660862,0.260788,0.266218,0.24714
4,0.9476,0.933515,0.698442,0.278866,0.296434,0.273132
5,0.751,0.846184,0.71769,0.339928,0.332548,0.307176
6,0.6195,0.78516,0.738772,0.416813,0.387462,0.379502
7,0.5162,0.754072,0.745188,0.38629,0.395099,0.379181
8,0.4519,0.729053,0.759853,0.451727,0.423251,0.418485
9,0.3878,0.710077,0.767186,0.515461,0.457492,0.462243
10,0.3324,0.686855,0.765353,0.524873,0.452048,0.458879


[I 2025-03-15 17:05:56,426] Trial 95 finished with value: 0.6329485293395112 and parameters: {'learning_rate': 0.0004930404167082212, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 96 with params: {'learning_rate': 0.0004284611116378763, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1851,1.891037,0.406049,0.068802,0.08892,0.067207
2,1.6906,1.466537,0.545371,0.168383,0.183196,0.162436
3,1.3065,1.166556,0.638863,0.262524,0.241098,0.224396
4,1.0267,0.994735,0.691109,0.297359,0.294194,0.271284
5,0.8299,0.884194,0.713107,0.322457,0.322054,0.301074
6,0.683,0.810534,0.727773,0.382957,0.356642,0.345912
7,0.5731,0.78631,0.736939,0.408877,0.382791,0.373974
8,0.5028,0.748353,0.757104,0.420944,0.411293,0.399824
9,0.4384,0.733877,0.749771,0.458233,0.419612,0.417003
10,0.3783,0.709839,0.76077,0.491151,0.441049,0.445468


[I 2025-03-15 17:07:50,829] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.0004019522458764724, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2087,1.938532,0.378552,0.077102,0.080319,0.060279
2,1.7456,1.529682,0.531622,0.184905,0.173611,0.156397
3,1.3746,1.221599,0.605866,0.220026,0.219224,0.201771
4,1.0848,1.036798,0.675527,0.266932,0.281734,0.260018
5,0.8835,0.916528,0.709441,0.313591,0.315667,0.29214
6,0.7218,0.824974,0.725023,0.382445,0.341503,0.325939
7,0.6066,0.797023,0.738772,0.401417,0.373184,0.362799
8,0.5334,0.760409,0.745188,0.429021,0.394895,0.384201
9,0.4668,0.740283,0.751604,0.440101,0.425091,0.416393
10,0.4063,0.723547,0.76352,0.482276,0.437347,0.439665


[I 2025-03-15 17:10:40,078] Trial 97 finished with value: 0.5968315955539868 and parameters: {'learning_rate': 0.0004019522458764724, 'weight_decay': 0.002, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 98 with params: {'learning_rate': 3.508998405099427e-05, 'weight_decay': 0.003, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.8, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4209,2.353206,0.187901,0.021133,0.023548,0.011417
2,2.33,2.276414,0.180568,0.019554,0.021096,0.008087
3,2.2644,2.208102,0.178735,0.023545,0.020548,0.007089
4,2.2007,2.145925,0.248396,0.076661,0.040534,0.035716
5,2.1471,2.086784,0.353804,0.065348,0.071203,0.060984
6,2.0883,2.03438,0.391384,0.078778,0.082996,0.065253
7,2.0363,1.984298,0.407883,0.07466,0.087656,0.065592
8,1.9909,1.936973,0.417049,0.09216,0.092365,0.069887
9,1.9447,1.892182,0.432631,0.088401,0.100169,0.077539
10,1.9045,1.851851,0.447296,0.10575,0.10837,0.08503


[I 2025-03-15 17:12:27,410] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 2.8092689649211102e-06, 'weight_decay': 0.002, 'adam_beta1': 0.97, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.472,2.456541,0.009166,0.003654,0.021634,0.002009
2,2.4605,2.445854,0.016499,0.004287,0.022113,0.00281
3,2.4522,2.436333,0.026581,0.024996,0.024089,0.004825
4,2.4445,2.427523,0.049496,0.008794,0.027066,0.007054
5,2.4332,2.419011,0.085243,0.00805,0.030406,0.008002


[I 2025-03-15 17:12:56,534] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.00031247000160355786, 'weight_decay': 0.002, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2539,2.03004,0.272227,0.051025,0.045897,0.035807
2,1.8825,1.688236,0.466544,0.121142,0.127952,0.100736
3,1.5692,1.41427,0.555454,0.20005,0.182409,0.158408
4,1.2931,1.201154,0.633364,0.254216,0.241035,0.220316
5,1.084,1.054075,0.675527,0.260374,0.269939,0.245711
6,0.9115,0.943815,0.700275,0.281101,0.298793,0.275323
7,0.7772,0.875425,0.712191,0.298723,0.309676,0.289083
8,0.6873,0.83559,0.738772,0.372141,0.362534,0.340858
9,0.6107,0.802968,0.741522,0.39614,0.371583,0.359147
10,0.539,0.787309,0.743355,0.412075,0.38983,0.378635


[I 2025-03-15 17:14:50,623] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 3.6527199497217975e-06, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 0, 'lambda_param': 0.8, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.47,2.452723,0.010999,0.003764,0.021842,0.002259
2,2.4554,2.439488,0.021082,0.004634,0.022982,0.003626
3,2.4448,2.427739,0.042163,0.00852,0.025886,0.006148
4,2.435,2.416688,0.07791,0.008405,0.029577,0.007174
5,2.4218,2.406082,0.131989,0.010812,0.036032,0.009076


[I 2025-03-15 17:15:18,580] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.00043327985348239675, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1891,1.907102,0.399633,0.075481,0.086022,0.06448
2,1.712,1.495747,0.537122,0.171797,0.177832,0.15883
3,1.3412,1.204152,0.605866,0.212086,0.218742,0.197401
4,1.0598,1.02499,0.681027,0.24797,0.283511,0.257663
5,0.8623,0.908323,0.705775,0.283022,0.311517,0.285457
6,0.7058,0.825777,0.724106,0.372613,0.348679,0.336158
7,0.5928,0.796247,0.733272,0.380373,0.370682,0.358837
8,0.5188,0.761729,0.75527,0.412985,0.410479,0.396296
9,0.451,0.741191,0.751604,0.440795,0.430063,0.421238
10,0.3907,0.718575,0.761687,0.500716,0.446203,0.446965


[I 2025-03-15 17:18:12,594] Trial 102 finished with value: 0.5815200451182833 and parameters: {'learning_rate': 0.00043327985348239675, 'weight_decay': 0.008, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 103 with params: {'learning_rate': 0.00021433149271320713, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.283,2.095176,0.177819,0.023541,0.020238,0.006488
2,1.9887,1.832732,0.430797,0.063062,0.101499,0.0748
3,1.7372,1.587104,0.496792,0.156789,0.147351,0.125438
4,1.496,1.385781,0.571036,0.199103,0.192226,0.168731
5,1.3093,1.233766,0.635197,0.254525,0.238142,0.219579
6,1.1398,1.111964,0.672777,0.26291,0.276766,0.253703
7,1.0044,1.023592,0.68286,0.291431,0.283307,0.262395
8,0.9065,0.9559,0.708524,0.282807,0.305651,0.278437
9,0.8179,0.90954,0.709441,0.312025,0.314625,0.292243
10,0.7371,0.871049,0.719523,0.338324,0.328839,0.308191


[I 2025-03-15 17:21:06,429] Trial 103 finished with value: 0.44796836715617006 and parameters: {'learning_rate': 0.00021433149271320713, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 104 with params: {'learning_rate': 0.0004426928660035583, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1749,1.878299,0.417049,0.067434,0.095871,0.073997
2,1.6819,1.458836,0.552704,0.178361,0.184218,0.164558
3,1.3017,1.169682,0.633364,0.239033,0.235527,0.218443
4,1.027,0.993084,0.691109,0.267239,0.287982,0.265184
5,0.8339,0.890703,0.711274,0.285251,0.32031,0.292231
6,0.6822,0.810972,0.732356,0.39481,0.360665,0.349567
7,0.5728,0.782125,0.734189,0.40361,0.36825,0.357693
8,0.5019,0.745551,0.754354,0.408398,0.400606,0.387045
9,0.4345,0.721341,0.757104,0.487173,0.437587,0.436484
10,0.3769,0.711175,0.766269,0.489292,0.447127,0.450301


[I 2025-03-15 17:23:51,331] Trial 104 finished with value: 0.6147910496206811 and parameters: {'learning_rate': 0.0004426928660035583, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 105 with params: {'learning_rate': 0.00042239367254374836, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1984,1.911746,0.396884,0.074833,0.087381,0.066579
2,1.7112,1.485274,0.536205,0.165902,0.173724,0.154042
3,1.3266,1.170916,0.643446,0.248776,0.244778,0.225923
4,1.0342,0.997757,0.688359,0.270992,0.288657,0.263869
5,0.8343,0.89174,0.714024,0.315229,0.326237,0.300842
6,0.6848,0.810611,0.726856,0.381968,0.361363,0.34896
7,0.575,0.78523,0.738772,0.387409,0.379322,0.365482
8,0.5041,0.746596,0.745188,0.392155,0.39112,0.37655
9,0.4387,0.7318,0.754354,0.443071,0.439193,0.433874
10,0.382,0.712796,0.76077,0.478933,0.446363,0.446017


[I 2025-03-15 17:26:39,257] Trial 105 finished with value: 0.6010673472486326 and parameters: {'learning_rate': 0.00042239367254374836, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 106 with params: {'learning_rate': 0.0004967417678089857, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.128,1.80343,0.434464,0.0607,0.106299,0.073716
2,1.5934,1.377466,0.574702,0.226731,0.206159,0.183338
3,1.2067,1.084097,0.654445,0.247005,0.262157,0.240156
4,0.9386,0.929987,0.698442,0.277927,0.300001,0.27056
5,0.7551,0.849577,0.721357,0.346862,0.336231,0.312638
6,0.623,0.780425,0.750687,0.429296,0.392016,0.380567
7,0.5162,0.751801,0.756187,0.399829,0.397048,0.38243
8,0.4488,0.713038,0.769019,0.44657,0.432601,0.424656
9,0.3838,0.704151,0.767186,0.48689,0.453389,0.448986
10,0.3295,0.695027,0.777269,0.543887,0.48348,0.493705


[I 2025-03-15 17:29:25,062] Trial 106 finished with value: 0.6158337605951746 and parameters: {'learning_rate': 0.0004967417678089857, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 107 with params: {'learning_rate': 7.723958845300466e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.374,2.271842,0.176902,0.003538,0.02,0.006012
2,2.2249,2.141753,0.186068,0.043571,0.022668,0.010941
3,2.1114,2.025009,0.386801,0.058969,0.081384,0.063467
4,1.9974,1.913824,0.421632,0.071456,0.093261,0.069376
5,1.8979,1.809557,0.447296,0.103677,0.109823,0.08481
6,1.7887,1.712882,0.477544,0.101762,0.128383,0.101871
7,1.6952,1.629927,0.500458,0.161381,0.144981,0.122439
8,1.6196,1.556277,0.522456,0.213527,0.158456,0.138892
9,1.5413,1.489417,0.551787,0.226358,0.183737,0.168716
10,1.4711,1.432701,0.575619,0.240237,0.199093,0.182512


[I 2025-03-15 17:30:19,811] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 0.0004945187043874618, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1537,1.830289,0.433547,0.059676,0.106491,0.07384
2,1.6198,1.397166,0.570119,0.19529,0.201003,0.178496
3,1.2199,1.099652,0.663611,0.256666,0.269407,0.247353
4,0.9422,0.936614,0.698442,0.28736,0.29862,0.27443
5,0.7501,0.85216,0.71494,0.344647,0.333449,0.31084
6,0.6173,0.785748,0.743355,0.425448,0.388676,0.380623
7,0.516,0.756949,0.742438,0.389789,0.387591,0.374682
8,0.4499,0.733511,0.76077,0.439187,0.428058,0.41951
9,0.3875,0.704464,0.768103,0.494043,0.457184,0.459874
10,0.3333,0.687145,0.770852,0.509156,0.478116,0.483099


[I 2025-03-15 17:33:07,893] Trial 108 finished with value: 0.6117411500762256 and parameters: {'learning_rate': 0.0004945187043874618, 'weight_decay': 0.005, 'adam_beta1': 0.92, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 109 with params: {'learning_rate': 0.00047340348163873975, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1397,1.829575,0.428048,0.061278,0.103873,0.074085
2,1.6305,1.412046,0.565536,0.207433,0.197211,0.174673
3,1.2539,1.126022,0.643446,0.229003,0.248883,0.23072
4,0.9845,0.96092,0.690192,0.252516,0.289589,0.262491
5,0.7974,0.870122,0.71494,0.315256,0.326717,0.299867
6,0.651,0.796825,0.747938,0.411442,0.376201,0.36085
7,0.5431,0.753604,0.758937,0.402296,0.398828,0.38679
8,0.4741,0.728568,0.761687,0.422218,0.42224,0.411607
9,0.4094,0.712462,0.75802,0.479335,0.442455,0.436262
10,0.3515,0.708191,0.772686,0.526648,0.469112,0.472238


[I 2025-03-15 17:35:58,161] Trial 109 finished with value: 0.6151267952214267 and parameters: {'learning_rate': 0.00047340348163873975, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 3.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 110 with params: {'learning_rate': 0.0002690764526187635, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2369,2.027408,0.330889,0.071073,0.063603,0.051141
2,1.8881,1.700961,0.450962,0.102441,0.119399,0.092898
3,1.591,1.437846,0.542621,0.163289,0.170539,0.14934
4,1.3343,1.241783,0.600367,0.219442,0.216414,0.196362
5,1.1428,1.092333,0.67736,0.256125,0.272079,0.251365
6,0.9731,0.978407,0.695692,0.281136,0.294011,0.269976
7,0.8388,0.905176,0.708524,0.314901,0.306265,0.286204
8,0.7446,0.855869,0.724106,0.312386,0.334383,0.309936
9,0.6673,0.820265,0.728689,0.324708,0.345018,0.321965
10,0.5929,0.798253,0.747938,0.372282,0.372882,0.350651


[I 2025-03-15 17:36:54,427] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 0.0004573571215089706, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.188,1.892822,0.405133,0.072974,0.090904,0.070237
2,1.6876,1.46257,0.547204,0.170906,0.183777,0.162342
3,1.2963,1.148003,0.640697,0.240543,0.242577,0.224622
4,1.0039,0.974327,0.692026,0.27248,0.290819,0.266292
5,0.8047,0.871029,0.714024,0.313539,0.325638,0.300424
6,0.6571,0.799536,0.736939,0.391225,0.366396,0.351218
7,0.5519,0.777753,0.739688,0.39403,0.387136,0.368955
8,0.482,0.742215,0.753437,0.407223,0.40702,0.391846
9,0.4191,0.730688,0.757104,0.464129,0.4437,0.437308
10,0.3612,0.70949,0.761687,0.521767,0.451716,0.458344


[I 2025-03-15 17:39:46,280] Trial 111 finished with value: 0.6028985974782685 and parameters: {'learning_rate': 0.0004573571215089706, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 112 with params: {'learning_rate': 0.00028695272086166576, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.227,2.00824,0.361137,0.06649,0.072995,0.059026
2,1.8613,1.668346,0.462878,0.14242,0.126167,0.101912
3,1.5521,1.399514,0.549954,0.201982,0.174713,0.153572
4,1.2911,1.207059,0.627864,0.260226,0.23652,0.219048
5,1.1002,1.06025,0.683776,0.260588,0.283627,0.260731
6,0.931,0.948507,0.709441,0.3022,0.307025,0.285664
7,0.7984,0.881115,0.709441,0.319294,0.308192,0.288522
8,0.7063,0.836626,0.728689,0.309717,0.338979,0.311652
9,0.632,0.801104,0.742438,0.368403,0.373464,0.358308
10,0.5606,0.792081,0.747021,0.423483,0.391009,0.376003


[I 2025-03-15 17:42:49,679] Trial 112 finished with value: 0.5669131632795275 and parameters: {'learning_rate': 0.00028695272086166576, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 113 with params: {'learning_rate': 0.00031654079483112716, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2052,1.954818,0.399633,0.055622,0.085444,0.063795
2,1.7898,1.579841,0.508708,0.141763,0.147665,0.124889
3,1.4519,1.298661,0.588451,0.219796,0.204667,0.187111
4,1.1865,1.117799,0.670944,0.256162,0.273659,0.253378
5,0.9956,0.986746,0.701192,0.290259,0.300952,0.277954
6,0.8372,0.889564,0.719523,0.301144,0.322609,0.301505
7,0.7182,0.835463,0.718607,0.337826,0.332248,0.313309
8,0.6354,0.806138,0.740605,0.358304,0.364257,0.342856
9,0.566,0.768592,0.747021,0.384593,0.394128,0.37905
10,0.4989,0.756601,0.754354,0.454385,0.404951,0.401052


[I 2025-03-15 17:43:44,156] Trial 113 pruned. 


Trial 114 with params: {'learning_rate': 0.0001989498646178862, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2775,2.095581,0.190651,0.039156,0.023548,0.012018
2,1.9891,1.834742,0.426214,0.066224,0.098703,0.073713
3,1.745,1.595624,0.494042,0.169757,0.141939,0.121838
4,1.5126,1.403831,0.56187,0.212828,0.184177,0.163604
5,1.3362,1.254871,0.613199,0.236183,0.221277,0.202815


[I 2025-03-15 17:44:10,753] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.0004126342357093529, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.189,1.914884,0.396884,0.076212,0.085205,0.064439
2,1.7268,1.512942,0.534372,0.173573,0.173775,0.156121
3,1.3638,1.225156,0.5967,0.216159,0.206639,0.185785
4,1.0855,1.043268,0.673694,0.248693,0.277131,0.253868
5,0.8907,0.923197,0.701192,0.282843,0.304048,0.278044
6,0.7343,0.831985,0.719523,0.345888,0.334901,0.319537
7,0.6186,0.802685,0.732356,0.385574,0.362323,0.348866
8,0.5423,0.764789,0.749771,0.400109,0.395024,0.378259
9,0.4722,0.740555,0.748854,0.465585,0.418453,0.415178
10,0.4099,0.722265,0.759853,0.442362,0.435556,0.4261


[I 2025-03-15 17:45:06,661] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.00037502909704266116, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2105,1.929825,0.399633,0.075181,0.086282,0.064303
2,1.7396,1.514488,0.526123,0.1663,0.162589,0.143573
3,1.3641,1.210308,0.644363,0.247707,0.245712,0.226212
4,1.0854,1.03627,0.683776,0.260895,0.285806,0.263464
5,0.8895,0.924084,0.711274,0.303545,0.318719,0.297574
6,0.737,0.836011,0.722273,0.343527,0.339855,0.323258
7,0.6235,0.801567,0.731439,0.367502,0.358337,0.34342
8,0.5495,0.771676,0.748854,0.377783,0.390816,0.371129
9,0.4834,0.741452,0.750687,0.45118,0.402309,0.397262
10,0.4206,0.719981,0.757104,0.47734,0.419949,0.421953


[I 2025-03-15 17:46:15,621] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 0.0004309777887751602, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1792,1.888159,0.412466,0.071227,0.092748,0.072397
2,1.6937,1.471798,0.545371,0.174969,0.181071,0.162166
3,1.3168,1.18209,0.632447,0.240837,0.235703,0.21922
4,1.0416,1.004092,0.690192,0.261349,0.287728,0.26505
5,0.8486,0.897964,0.713107,0.284299,0.319606,0.290624
6,0.6958,0.814067,0.728689,0.384581,0.352358,0.33928
7,0.5849,0.788888,0.734189,0.396601,0.366468,0.353525
8,0.5136,0.745389,0.757104,0.404685,0.401503,0.386562
9,0.4452,0.725303,0.752521,0.459674,0.427182,0.419503
10,0.3866,0.713682,0.759853,0.489508,0.429908,0.433696


[I 2025-03-15 17:49:04,151] Trial 117 finished with value: 0.6166420359604712 and parameters: {'learning_rate': 0.0004309777887751602, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 118 with params: {'learning_rate': 2.6227222122491564e-06, 'weight_decay': 0.004, 'adam_beta1': 0.93, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4721,2.457042,0.009166,0.003781,0.021634,0.002018
2,2.4614,2.446976,0.014665,0.00426,0.022256,0.002838
3,2.4536,2.437929,0.027498,0.005112,0.023707,0.0043
4,2.4463,2.429494,0.04583,0.03421,0.026957,0.007786
5,2.4354,2.421363,0.08066,0.01083,0.030409,0.009066


[I 2025-03-15 17:49:33,035] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 2.5882714663975125e-05, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4384,2.381711,0.183318,0.032052,0.022529,0.010424
2,2.3635,2.318153,0.179652,0.012139,0.020822,0.007555
3,2.3111,2.265212,0.179652,0.023548,0.020822,0.007605
4,2.2609,2.216817,0.179652,0.023548,0.020822,0.007605
5,2.2185,2.169059,0.2044,0.063645,0.027958,0.018723


[I 2025-03-15 17:50:00,368] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 0.0004264639421009551, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1809,1.891959,0.411549,0.071194,0.092103,0.071821
2,1.6983,1.4769,0.542621,0.176828,0.179674,0.161008
3,1.3227,1.186864,0.628781,0.220507,0.231169,0.213177
4,1.0473,1.008272,0.690192,0.262149,0.287057,0.265105
5,0.8544,0.901242,0.711274,0.283229,0.316273,0.287757
6,0.7011,0.815961,0.728689,0.37439,0.350955,0.337847
7,0.5897,0.791814,0.729606,0.383607,0.362408,0.349226
8,0.5183,0.74768,0.754354,0.40015,0.397781,0.382372
9,0.4499,0.7259,0.752521,0.456709,0.42576,0.418508
10,0.3905,0.714142,0.757104,0.492068,0.429735,0.43307


[I 2025-03-15 17:52:47,498] Trial 120 finished with value: 0.6176431183358355 and parameters: {'learning_rate': 0.0004264639421009551, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 2.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 121 with params: {'learning_rate': 0.0002747483312779424, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2449,2.019988,0.348304,0.068189,0.068801,0.057566
2,1.8732,1.677106,0.469294,0.144501,0.130614,0.10516
3,1.5608,1.402766,0.548121,0.170187,0.173828,0.152759
4,1.2976,1.204252,0.647113,0.266273,0.248133,0.229639
5,1.1018,1.059887,0.689276,0.267377,0.287483,0.263578
6,0.934,0.952653,0.703025,0.290765,0.298846,0.276301
7,0.806,0.886732,0.705775,0.298864,0.306443,0.28516
8,0.7182,0.842596,0.725023,0.319425,0.334908,0.312089
9,0.6438,0.811231,0.730522,0.364568,0.355187,0.341496
10,0.5712,0.786559,0.742438,0.360683,0.36754,0.349411


[I 2025-03-15 17:53:48,737] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 0.00024573288218512017, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2643,2.062002,0.24473,0.054461,0.038662,0.029749
2,1.9352,1.759344,0.450962,0.101261,0.119895,0.09461
3,1.6536,1.498522,0.527039,0.177338,0.161636,0.141716
4,1.3978,1.294189,0.593034,0.245064,0.209511,0.187922
5,1.2042,1.14236,0.656279,0.246988,0.252816,0.232322
6,1.0326,1.025274,0.683776,0.260159,0.283033,0.25981
7,0.8969,0.945041,0.703941,0.286915,0.295388,0.273217
8,0.8037,0.889376,0.724106,0.344947,0.333392,0.312222
9,0.721,0.851624,0.721357,0.310207,0.327129,0.304742
10,0.6438,0.82735,0.735105,0.333122,0.358072,0.334044


[I 2025-03-15 17:54:44,876] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.000225476497722167, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2826,2.090776,0.176902,0.003538,0.02,0.006012
2,1.9839,1.830105,0.428964,0.063612,0.099348,0.073218
3,1.7342,1.584567,0.492209,0.148903,0.144949,0.119053
4,1.49,1.381029,0.569203,0.20311,0.192833,0.168034
5,1.2985,1.222084,0.623281,0.234273,0.229326,0.208756
6,1.1251,1.101674,0.670944,0.258951,0.27123,0.248973
7,0.9889,1.015421,0.693859,0.294304,0.291469,0.270627
8,0.89,0.944778,0.709441,0.288481,0.306404,0.282662
9,0.8034,0.904753,0.711274,0.301858,0.320862,0.293622
10,0.7218,0.865963,0.722273,0.346322,0.335664,0.318346


[I 2025-03-15 17:55:37,535] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.00027081581353237687, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2521,2.043668,0.271311,0.051983,0.045682,0.035193
2,1.909,1.728611,0.453712,0.101472,0.122509,0.096488
3,1.6181,1.465507,0.535289,0.143932,0.162637,0.139521
4,1.355,1.256922,0.604033,0.241191,0.217417,0.193762
5,1.155,1.10454,0.664528,0.243912,0.257922,0.234257


[I 2025-03-15 17:56:06,476] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 0.0001133580450782148, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3576,2.237941,0.176902,0.003538,0.02,0.006012
2,2.1813,2.097282,0.176902,0.003538,0.02,0.006012
3,2.0638,1.983472,0.352887,0.063397,0.07091,0.05438
4,1.9495,1.874308,0.409716,0.07007,0.089115,0.064062
5,1.8531,1.769125,0.445463,0.07737,0.111635,0.084002
6,1.7396,1.671148,0.460128,0.122066,0.124776,0.099617
7,1.6444,1.590039,0.489459,0.151337,0.147124,0.122794
8,1.5652,1.511156,0.494959,0.167672,0.149985,0.128503
9,1.4792,1.440922,0.546288,0.202617,0.18687,0.169434
10,1.4038,1.374349,0.560037,0.21917,0.192556,0.17515


[I 2025-03-15 17:58:01,162] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 8.333610690449199e-05, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3798,2.271132,0.176902,0.003538,0.02,0.006012
2,2.2184,2.128913,0.208066,0.07896,0.028879,0.020745
3,2.0942,2.0017,0.393217,0.055413,0.083232,0.062732
4,1.969,1.879192,0.430797,0.068261,0.099237,0.075472
5,1.8592,1.766047,0.460128,0.103561,0.119376,0.093947


[I 2025-03-15 17:58:27,853] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.0003327161612826989, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2046,1.967865,0.384968,0.060712,0.080665,0.062735
2,1.8069,1.607489,0.486709,0.164852,0.141528,0.122249
3,1.4783,1.328476,0.56462,0.196065,0.184429,0.163399
4,1.2066,1.140086,0.653529,0.266929,0.25648,0.240257
5,1.0145,0.99999,0.689276,0.285146,0.288331,0.267212
6,0.8512,0.898704,0.710357,0.299444,0.312732,0.291633
7,0.7232,0.842848,0.716774,0.334335,0.327239,0.308152
8,0.637,0.804187,0.747938,0.359762,0.371948,0.34905
9,0.5665,0.772183,0.744271,0.386958,0.379301,0.363444
10,0.4983,0.763451,0.754354,0.410337,0.399783,0.384162


[I 2025-03-15 18:00:18,263] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 4.050179936036871e-06, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4699,2.451859,0.010999,0.00363,0.021842,0.002225
2,2.454,2.437075,0.025665,0.024725,0.023634,0.004437
3,2.442,2.423541,0.063245,0.011083,0.028791,0.008402
4,2.4307,2.41053,0.112741,0.009003,0.033685,0.008729
5,2.4159,2.397321,0.159487,0.009592,0.019585,0.00978


[I 2025-03-15 18:00:45,401] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.00012611009075295032, 'weight_decay': 0.0, 'adam_beta1': 0.97, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3507,2.218286,0.176902,0.003538,0.02,0.006012
2,2.1532,2.053837,0.298808,0.07371,0.055308,0.046489
3,2.0089,1.906645,0.411549,0.07148,0.089568,0.065167
4,1.8576,1.760359,0.453712,0.078845,0.116809,0.089669
5,1.7264,1.630583,0.487626,0.138349,0.142074,0.115997
6,1.5872,1.512621,0.520623,0.159603,0.160252,0.139034
7,1.4677,1.41423,0.56187,0.240137,0.192834,0.17567
8,1.373,1.333377,0.595784,0.245119,0.218082,0.19849
9,1.2771,1.252372,0.627864,0.240285,0.240385,0.220834
10,1.1885,1.186586,0.660862,0.268321,0.266295,0.248085


[I 2025-03-15 18:02:36,226] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.00037726309733610843, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.228,1.976296,0.363886,0.063502,0.075102,0.058859
2,1.7938,1.583103,0.490376,0.144459,0.142663,0.115606
3,1.4374,1.28016,0.582951,0.226116,0.206238,0.187207
4,1.1483,1.088148,0.659028,0.268594,0.267422,0.249586
5,0.9455,0.954381,0.696609,0.28501,0.296455,0.273324
6,0.7818,0.860272,0.71769,0.332445,0.328381,0.308675
7,0.6578,0.8181,0.715857,0.366335,0.340811,0.330505
8,0.5759,0.782223,0.747938,0.375023,0.383787,0.364415
9,0.5043,0.759071,0.750687,0.440334,0.421815,0.415267
10,0.4413,0.745977,0.751604,0.452315,0.424144,0.420275


[I 2025-03-15 18:03:36,545] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 4.259068386605202e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4166,2.33989,0.182401,0.013581,0.021644,0.008897
2,2.3115,2.25089,0.178735,0.023545,0.020548,0.007089
3,2.2342,2.170177,0.192484,0.063594,0.024527,0.013886
4,2.159,2.095795,0.340972,0.070351,0.067847,0.060249
5,2.0955,2.02818,0.394134,0.077749,0.083813,0.065061


[I 2025-03-15 18:04:04,451] Trial 131 pruned. 


Trial 132 with params: {'learning_rate': 0.00033352490482726013, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1999,1.950496,0.396884,0.0567,0.084798,0.063752
2,1.7825,1.575207,0.514207,0.159847,0.1535,0.133142
3,1.443,1.293167,0.577452,0.219109,0.196789,0.182912
4,1.1736,1.108505,0.671861,0.256914,0.273413,0.253807
5,0.9813,0.979032,0.697525,0.28839,0.297479,0.2754
6,0.8209,0.877371,0.716774,0.298196,0.321211,0.299057
7,0.6995,0.827421,0.719523,0.337134,0.330505,0.313754
8,0.617,0.791912,0.756187,0.387726,0.383188,0.36455
9,0.5482,0.760347,0.750687,0.404568,0.400785,0.386334
10,0.482,0.748168,0.757104,0.441488,0.411905,0.403208


[I 2025-03-15 18:04:59,868] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 0.000370529114492857, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.215,1.965557,0.367553,0.063682,0.075079,0.059285
2,1.7871,1.584666,0.490376,0.161582,0.145084,0.121573
3,1.44,1.289012,0.581118,0.2174,0.199786,0.179838
4,1.1548,1.094723,0.658112,0.268126,0.266697,0.249111
5,0.9509,0.959768,0.698442,0.285642,0.294509,0.270403
6,0.787,0.861272,0.71769,0.331708,0.328996,0.312198
7,0.6625,0.822258,0.716774,0.320437,0.329786,0.310539
8,0.581,0.787707,0.751604,0.395166,0.387477,0.370507
9,0.5147,0.756852,0.747021,0.41832,0.405376,0.395869
10,0.4492,0.745866,0.749771,0.46621,0.4116,0.402676


[I 2025-03-15 18:05:54,712] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 6.975941321850453e-05, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3966,2.302834,0.176902,0.003538,0.02,0.006012
2,2.2633,2.194788,0.176902,0.003538,0.02,0.006012
3,2.169,2.102807,0.182401,0.063558,0.021477,0.008912
4,2.0816,2.020543,0.36572,0.063763,0.075328,0.060556
5,2.0095,1.937204,0.406966,0.053761,0.087201,0.063888
6,1.921,1.856056,0.430797,0.061523,0.100638,0.073926
7,1.8407,1.779932,0.447296,0.080692,0.111437,0.084325
8,1.7725,1.709583,0.471127,0.098346,0.126446,0.095335
9,1.6989,1.642884,0.474794,0.097713,0.12978,0.099316
10,1.6336,1.584071,0.494959,0.144188,0.141519,0.115493


[I 2025-03-15 18:06:49,954] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.0004237596334706623, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2009,1.921538,0.389551,0.076746,0.084651,0.064125
2,1.7233,1.504215,0.537122,0.159875,0.173674,0.152025
3,1.3445,1.191721,0.620532,0.220916,0.228272,0.211094
4,1.052,1.011369,0.681027,0.271706,0.285815,0.261695
5,0.8509,0.897661,0.71769,0.34437,0.329492,0.309132
6,0.6938,0.814218,0.731439,0.40085,0.364764,0.352362
7,0.5827,0.789888,0.734189,0.399394,0.375123,0.36311
8,0.5119,0.752312,0.752521,0.421109,0.398378,0.384834
9,0.4465,0.738057,0.754354,0.442154,0.430493,0.423578
10,0.3873,0.717542,0.76077,0.479596,0.441777,0.441985


[I 2025-03-15 18:09:40,927] Trial 135 finished with value: 0.6010794807545907 and parameters: {'learning_rate': 0.0004237596334706623, 'weight_decay': 0.0, 'adam_beta1': 0.93, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 136 with params: {'learning_rate': 1.078901511573681e-06, 'weight_decay': 0.006, 'adam_beta1': 0.98, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4757,2.464223,0.008249,0.004205,0.022233,0.002464
2,2.4712,2.459575,0.008249,0.003659,0.021882,0.002109
3,2.4684,2.455349,0.010082,0.003827,0.021738,0.002161
4,2.4653,2.451499,0.011916,0.003876,0.021945,0.002385
5,2.4581,2.447879,0.014665,0.004291,0.022256,0.002818
6,2.4546,2.444537,0.018332,0.004349,0.022671,0.003241
7,2.4526,2.441371,0.021998,0.004492,0.023085,0.003634
8,2.4488,2.438374,0.026581,0.005001,0.023603,0.004186
9,2.4468,2.435532,0.032081,0.02517,0.024711,0.005281
10,2.444,2.432899,0.035747,0.029962,0.025295,0.005892


[I 2025-03-15 18:10:34,621] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 0.0004391250190024249, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1803,1.880683,0.410632,0.066511,0.091716,0.07009
2,1.6784,1.453476,0.550871,0.181967,0.185478,0.165678
3,1.291,1.154616,0.64528,0.256149,0.246035,0.230022
4,1.0121,0.984695,0.693859,0.29838,0.296791,0.274991
5,0.8165,0.877073,0.710357,0.321543,0.322053,0.300946
6,0.672,0.80493,0.729606,0.394703,0.361631,0.352468
7,0.5627,0.782912,0.740605,0.408879,0.388883,0.379385
8,0.4931,0.747336,0.75802,0.422434,0.41653,0.406648
9,0.4282,0.727446,0.75527,0.480337,0.430588,0.432317
10,0.3689,0.707455,0.762603,0.495095,0.449356,0.455646


[I 2025-03-15 18:13:22,525] Trial 137 finished with value: 0.6026684427898672 and parameters: {'learning_rate': 0.0004391250190024249, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 138 with params: {'learning_rate': 0.0003042767857655148, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.233,2.003091,0.361137,0.066988,0.072521,0.059529
2,1.8499,1.651209,0.468378,0.137093,0.130277,0.105341
3,1.5279,1.373121,0.554537,0.204371,0.177267,0.15757
4,1.2585,1.179804,0.640697,0.268915,0.244671,0.227597
5,1.0645,1.036598,0.691109,0.261506,0.287284,0.262062
6,0.8957,0.925361,0.703941,0.2777,0.301601,0.278017
7,0.7648,0.867059,0.708524,0.311167,0.307938,0.287912
8,0.678,0.826555,0.731439,0.316264,0.342719,0.315823
9,0.605,0.795565,0.738772,0.387708,0.374636,0.361959
10,0.5334,0.783692,0.747021,0.418307,0.393776,0.381747


[I 2025-03-15 18:16:18,110] Trial 138 finished with value: 0.5524819558256364 and parameters: {'learning_rate': 0.0003042767857655148, 'weight_decay': 0.01, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 139 with params: {'learning_rate': 0.000497979662262894, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.156,1.837092,0.428964,0.060101,0.104407,0.073346
2,1.6312,1.409424,0.569203,0.177828,0.203896,0.180017
3,1.2402,1.119877,0.651696,0.234037,0.258883,0.239226
4,0.9637,0.954404,0.700275,0.275717,0.294001,0.268986
5,0.772,0.865735,0.711274,0.308849,0.326954,0.300727
6,0.6294,0.795453,0.736022,0.407948,0.376276,0.365495
7,0.5265,0.763241,0.742438,0.417145,0.39365,0.384696
8,0.4596,0.738738,0.756187,0.440772,0.418765,0.411697
9,0.3952,0.717988,0.76077,0.470838,0.445579,0.44548
10,0.3416,0.702184,0.765353,0.517615,0.47017,0.475436


[I 2025-03-15 18:19:12,638] Trial 139 finished with value: 0.6142365945161642 and parameters: {'learning_rate': 0.000497979662262894, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 3.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 140 with params: {'learning_rate': 0.0003297414087448796, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2212,1.980249,0.380385,0.064266,0.079144,0.063484
2,1.8159,1.612411,0.484876,0.150275,0.139979,0.117842
3,1.4795,1.324996,0.562786,0.190731,0.183327,0.163393
4,1.2049,1.136547,0.652612,0.264711,0.258267,0.241762
5,1.0106,1.000685,0.692026,0.259678,0.289353,0.262093
6,0.8435,0.891581,0.711274,0.311693,0.313563,0.293984
7,0.7162,0.843459,0.710357,0.33432,0.318258,0.300719
8,0.6333,0.805327,0.747938,0.385083,0.376063,0.357339
9,0.5637,0.776309,0.740605,0.398317,0.390738,0.379417
10,0.4948,0.761678,0.754354,0.452294,0.40842,0.400904


[I 2025-03-15 18:22:07,705] Trial 140 finished with value: 0.6026238894106295 and parameters: {'learning_rate': 0.0003297414087448796, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 2.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 141 with params: {'learning_rate': 2.6313721811247065e-05, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4394,2.384453,0.184235,0.013786,0.022792,0.010418
2,2.3683,2.328525,0.181485,0.014679,0.02137,0.008494
3,2.321,2.282811,0.176902,0.003538,0.02,0.006012
4,2.2789,2.241368,0.176902,0.003538,0.02,0.006012
5,2.2419,2.204625,0.176902,0.003538,0.02,0.006012


[I 2025-03-15 18:22:38,081] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 0.00046070802354624074, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1381,1.820137,0.430797,0.061634,0.105097,0.074544
2,1.616,1.393587,0.567369,0.200122,0.200301,0.177415
3,1.2321,1.103963,0.664528,0.244052,0.269848,0.247269
4,0.9637,0.9476,0.703025,0.260354,0.302915,0.272938
5,0.7787,0.85805,0.72044,0.343115,0.331906,0.308228
6,0.6442,0.791061,0.747938,0.389228,0.380069,0.364267
7,0.5393,0.752147,0.750687,0.402536,0.392751,0.379393
8,0.4697,0.722323,0.76352,0.426852,0.422245,0.411699
9,0.408,0.706043,0.770852,0.489447,0.454325,0.451815
10,0.351,0.691024,0.773602,0.537155,0.465941,0.474266


[I 2025-03-15 18:25:31,980] Trial 142 finished with value: 0.6095136457433374 and parameters: {'learning_rate': 0.00046070802354624074, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 3.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 143 with params: {'learning_rate': 5.3985154766476e-05, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4063,2.318637,0.179652,0.018551,0.020822,0.00759
2,2.283,2.211671,0.178735,0.023545,0.020548,0.007089
3,2.1904,2.115975,0.300642,0.072735,0.055364,0.050009
4,2.0992,2.027546,0.391384,0.076715,0.082996,0.06424
5,2.0222,1.945466,0.412466,0.092933,0.090295,0.068101


[I 2025-03-15 18:26:04,356] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 0.00028379285700209597, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2451,2.03081,0.308891,0.071173,0.056257,0.043665
2,1.89,1.704148,0.458295,0.124632,0.125449,0.100119
3,1.5909,1.437336,0.545371,0.171203,0.171692,0.149521
4,1.3247,1.23105,0.610449,0.235738,0.22593,0.204871
5,1.1246,1.080837,0.671861,0.247834,0.265394,0.242161
6,0.9533,0.967821,0.693859,0.275306,0.293372,0.269016
7,0.8186,0.897263,0.707608,0.314022,0.302426,0.280731
8,0.7281,0.853344,0.722273,0.310115,0.333786,0.307229
9,0.6513,0.818769,0.725023,0.343778,0.342549,0.323061
10,0.5778,0.801679,0.743355,0.397927,0.378638,0.363131


[I 2025-03-15 18:28:13,406] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 0.00017861054002006786, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 0, 'lambda_param': 0.9, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2893,2.113309,0.187901,0.038257,0.022834,0.010958
2,2.0123,1.863186,0.421632,0.065907,0.0955,0.070922
3,1.78,1.63414,0.486709,0.142302,0.137326,0.11719
4,1.5595,1.447328,0.549954,0.179428,0.176546,0.156152
5,1.3895,1.300874,0.605866,0.241907,0.215118,0.198509


[I 2025-03-15 18:28:44,051] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.00045386877226676113, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1803,1.886778,0.407883,0.07039,0.089804,0.068665
2,1.6887,1.470497,0.543538,0.179657,0.188226,0.166673
3,1.3132,1.183164,0.627864,0.231124,0.235623,0.218371
4,1.0346,1.006347,0.681943,0.249246,0.282518,0.256593
5,0.8391,0.900215,0.704858,0.280472,0.312577,0.284776
6,0.6845,0.822978,0.721357,0.37038,0.353257,0.338652
7,0.5743,0.793037,0.739688,0.412499,0.385268,0.376695
8,0.5016,0.75539,0.756187,0.427212,0.419013,0.406404
9,0.4334,0.73612,0.75527,0.474996,0.441738,0.440472
10,0.3752,0.717092,0.762603,0.49681,0.455639,0.457173


[I 2025-03-15 18:31:54,419] Trial 146 finished with value: 0.6101474990181154 and parameters: {'learning_rate': 0.00045386877226676113, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}. Best is trial 80 with value: 0.6434396489980481.


Trial 147 with params: {'learning_rate': 0.00040321605879245205, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2054,1.926899,0.392301,0.076294,0.084866,0.06395
2,1.7313,1.508573,0.531622,0.184518,0.173421,0.156387
3,1.3536,1.196201,0.637947,0.249097,0.24112,0.223632
4,1.0634,1.021551,0.681943,0.273899,0.285375,0.262189
5,0.8636,0.909241,0.713107,0.320536,0.321662,0.298487
6,0.7091,0.819861,0.725023,0.388903,0.351664,0.339415
7,0.5961,0.792656,0.734189,0.392658,0.371032,0.357003
8,0.5238,0.75124,0.750687,0.407121,0.395119,0.381093
9,0.4574,0.737653,0.756187,0.436051,0.428708,0.423023
10,0.3979,0.71749,0.764436,0.481506,0.447054,0.447616


[I 2025-03-15 18:34:59,680] Trial 147 finished with value: 0.6123898549200514 and parameters: {'learning_rate': 0.00040321605879245205, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 4.0}. Best is trial 80 with value: 0.6434396489980481.


Trial 148 with params: {'learning_rate': 0.00026678595121843395, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2598,2.032393,0.340055,0.068879,0.067265,0.055954
2,1.8858,1.687211,0.470211,0.122042,0.13002,0.102129
3,1.5712,1.40911,0.555454,0.19927,0.179783,0.159659
4,1.308,1.210186,0.641613,0.246044,0.245828,0.228493
5,1.1122,1.066414,0.68286,0.264231,0.28582,0.262705
6,0.9464,0.960065,0.704858,0.297639,0.303135,0.284143
7,0.8184,0.891023,0.702108,0.301677,0.303268,0.282622
8,0.7305,0.849802,0.726856,0.316367,0.336772,0.311448
9,0.6564,0.81409,0.734189,0.344431,0.353958,0.337748
10,0.5825,0.791032,0.745188,0.368322,0.368417,0.350451


[I 2025-03-15 18:35:57,531] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 8.049073025334763e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3781,2.268718,0.176902,0.003538,0.02,0.006012
2,2.2161,2.123328,0.24198,0.07462,0.038501,0.033579
3,2.0896,1.994844,0.401467,0.054775,0.085465,0.063619
4,1.9627,1.873021,0.429881,0.090485,0.099238,0.077283
5,1.8543,1.760185,0.467461,0.105408,0.123442,0.10063
6,1.7404,1.663596,0.494959,0.121527,0.138103,0.112556
7,1.6447,1.58285,0.505041,0.147642,0.147991,0.126248
8,1.5685,1.507406,0.553621,0.222359,0.181137,0.164422
9,1.4882,1.438184,0.565536,0.242104,0.191957,0.176919
10,1.4175,1.382984,0.59945,0.241767,0.218564,0.203927


[I 2025-03-15 18:37:58,916] Trial 149 pruned. 


In [42]:
# Show the best run found by the preceding hyperparameter search
# (run id, objective value and the winning hyperparameters).
print(best_trial2)

BestRun(run_id='80', objective=0.6434396489980481, hyperparameters={'learning_rate': 0.000482322168974171, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.5}, run_summary=None)


In [43]:
# Reset the seed before setting up the next search.
# NOTE(review): presumably re-seeds the RNGs used for training so this search
# starts from the same random state as the earlier ones; confirm in `base`.
base.reset_seed()

In [44]:
# Training arguments for the augmented-data hyperparameter search. The output
# and logging directories are namespaced by DATASET; epoch count and batch
# size come from the notebook-level configuration.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-base_fine_aug_hp-search",
    logging_dir=f"~/logs/{DATASET}/bert-base_fine_aug_hp-search",
    epochs=num_epochs,
    batch_size=batch_size,
)

In [45]:
# Convert epochs into optimizer steps for the augmented training set.
data_length = len(all_train_data)

# Resource bounds handed to the pruner: 5 epochs' worth of steps at the low
# end, the full `num_epochs` run at the high end.
min_r, max_r = (
    math.ceil(data_length / batch_size) * epochs for epochs in (5, num_epochs)
)

# Upper bound for the `warmup_steps` search range: one tenth of the steps in
# a single epoch, rounded up.
warm_up = math.ceil(data_length / batch_size / 10)

In [46]:
def hp_space(trial):
    """Draw one set of optimizer hyperparameters from an Optuna trial.

    The values are sampled in a fixed order:
      * ``learning_rate`` - float in [1e-6, 5e-4], sampled on a log scale
      * ``weight_decay``  - float in [0, 1e-2] in steps of 1e-3
      * ``adam_beta1``    - float in [0.9, 0.99] in steps of 0.01
      * ``warmup_steps``  - integer in [0, warm_up] (``warm_up`` is the
        notebook-level bound computed from the dataset size)

    The chosen values are printed together with the trial number and returned
    as a dict keyed by the names above.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 5e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    adam_beta1 = trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01)
    warmup_steps = trial.suggest_int("warmup_steps", 0, warm_up)

    params = dict(
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        adam_beta1=adam_beta1,
        warmup_steps=warmup_steps,
    )
    print(f"Trial {trial.number} with params: {params}")
    return params

In [47]:
# Fresh sampler for this study: multivariate TPE with a fixed seed.
sampler = optuna.samplers.TPESampler(
    multivariate=True,
    seed=42,
)

# Hyperband pruning over the step budget computed above (min_r .. max_r).
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)



In [48]:
# Trainer for the augmented-data search: trains on `train_aug`, evaluates on
# the held-out `eval` split, and builds its model through `model_init`.
# Early stopping is currently disabled; to re-enable it pass
#   callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
trainer = Trainer(
    model_init=lambda: get_Bert(),
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
# Run the Optuna-backed search: 150 trials, maximizing the F1 score reported
# on the evaluation set, using the pruner and sampler configured above.
best_trial3 = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    n_trials=150,
    direction="maximize",
    backend="optuna",
    study_name="Test-base-aug",
    sampler=sampler,
    pruner=pruner,
)

[I 2025-03-15 18:38:00,160] A new study created in memory with name: Test-base-aug


Trial 0 with params: {'learning_rate': 1.0253509690168497e-05, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5869,3.324262,0.327223,0.043095,0.065415,0.047883
2,3.0917,2.904099,0.439047,0.109918,0.110082,0.094993
3,2.7039,2.5745,0.497709,0.129979,0.142067,0.118518
4,2.3897,2.30823,0.549954,0.215766,0.182629,0.165596
5,2.1258,2.09833,0.593951,0.26119,0.223329,0.206112


[I 2025-03-15 18:40:04,534] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 2.636875533972305e-06, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 46}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7859,3.688891,0.212649,0.019839,0.031375,0.015624
2,3.6147,3.542825,0.210816,0.037997,0.030065,0.018667
3,3.4782,3.411683,0.292392,0.047128,0.054592,0.043705
4,3.3542,3.292677,0.345555,0.040596,0.071558,0.049448
5,3.2378,3.188476,0.35472,0.075693,0.075678,0.052887


[I 2025-03-15 18:42:11,238] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 4.191711516695204e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 52}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9634,2.257744,0.563703,0.221807,0.19146,0.175286
2,1.7367,1.547541,0.705775,0.321468,0.332184,0.311201
3,1.1417,1.261183,0.744271,0.3817,0.401359,0.379198
4,0.8183,1.129119,0.762603,0.449964,0.444318,0.432488
5,0.6194,1.068489,0.770852,0.51421,0.47743,0.464806
6,0.4874,1.03228,0.773602,0.519168,0.498308,0.496572
7,0.3823,1.019468,0.772686,0.520672,0.504214,0.498168
8,0.3099,0.996127,0.776352,0.547521,0.528828,0.526161
9,0.2532,1.005063,0.776352,0.591663,0.536512,0.542552
10,0.2075,1.01369,0.786434,0.623133,0.570096,0.580459


[I 2025-03-15 18:46:34,051] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.0001764971584817573, 'weight_decay': 0.002, 'adam_beta1': 0.91, 'warmup_steps': 9}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7186,1.124124,0.76352,0.430012,0.452198,0.426921
2,0.443,0.98409,0.780018,0.597037,0.545303,0.548502
3,0.1776,1.028383,0.789184,0.731124,0.646664,0.667369
4,0.0878,1.095383,0.793767,0.759182,0.648766,0.683444
5,0.0512,1.127443,0.799267,0.79174,0.719675,0.741851
6,0.0356,1.223752,0.781852,0.794673,0.678727,0.714091
7,0.0242,1.268112,0.791017,0.796494,0.695541,0.722982
8,0.0201,1.269241,0.794684,0.80461,0.718443,0.735761
9,0.0164,1.357253,0.785518,0.785889,0.703764,0.726356
10,0.0124,1.391741,0.787351,0.79482,0.698062,0.720098


[I 2025-03-15 18:50:45,261] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 6.624310605949985e-06, 'weight_decay': 0.005, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6624,3.472984,0.238313,0.050701,0.03841,0.029223
2,3.3086,3.158221,0.364803,0.070743,0.07884,0.057232
3,3.017,2.909736,0.433547,0.109096,0.107549,0.092126
4,2.7785,2.697402,0.480293,0.107169,0.13258,0.111028
5,2.567,2.512783,0.504125,0.121687,0.144439,0.120082
6,2.3855,2.35606,0.529789,0.198279,0.165411,0.147753
7,2.2246,2.222272,0.572869,0.219792,0.204131,0.188635
8,2.0892,2.106822,0.588451,0.251378,0.217953,0.201807
9,1.9687,2.004083,0.601283,0.272815,0.23019,0.211588
10,1.8608,1.914625,0.617782,0.268337,0.241521,0.225232


[I 2025-03-15 18:59:00,364] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 4.480975918214949e-05, 'weight_decay': 0.001, 'adam_beta1': 0.92, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8853,2.177733,0.589368,0.249971,0.22098,0.206135
2,1.6517,1.487526,0.710357,0.345959,0.345297,0.324896
3,1.0703,1.227243,0.748854,0.394352,0.41005,0.388869
4,0.759,1.107255,0.766269,0.450003,0.449766,0.436824
5,0.5689,1.055633,0.769019,0.500121,0.474739,0.467482
6,0.4423,1.02647,0.771769,0.513475,0.497831,0.492131
7,0.3408,1.012889,0.771769,0.535987,0.511849,0.50724
8,0.2727,1.000166,0.779102,0.592805,0.541523,0.547104
9,0.2214,1.015615,0.780935,0.622227,0.558774,0.572243
10,0.18,1.031643,0.782768,0.652405,0.584265,0.597838


[I 2025-03-15 19:07:22,009] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 1.7018418817029176e-05, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4077,3.005297,0.415215,0.095431,0.09688,0.08133
2,2.6729,2.431465,0.527956,0.168556,0.162442,0.141776
3,2.1559,2.037402,0.6022,0.259777,0.233576,0.214332
4,1.7821,1.7592,0.671861,0.319855,0.29298,0.275694
5,1.4988,1.558361,0.702108,0.337296,0.333356,0.312666
6,1.2891,1.416147,0.722273,0.390636,0.370928,0.357678
7,1.1207,1.313405,0.741522,0.437197,0.410695,0.400339
8,0.9948,1.243192,0.751604,0.427287,0.435693,0.416039
9,0.8909,1.190155,0.748854,0.444377,0.436403,0.42025
10,0.8014,1.150756,0.752521,0.441773,0.446109,0.428558


[I 2025-03-15 19:11:30,303] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 3.971084710792477e-05, 'weight_decay': 0.0, 'adam_beta1': 0.96, 'warmup_steps': 9}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9947,2.341497,0.547204,0.237477,0.183381,0.170401
2,1.8276,1.616623,0.702108,0.345354,0.326708,0.308671
3,1.2152,1.303025,0.742438,0.379732,0.394977,0.374846
4,0.8741,1.158569,0.757104,0.445515,0.433808,0.42062
5,0.6644,1.085915,0.762603,0.482219,0.463584,0.453397
6,0.5248,1.041956,0.767186,0.481346,0.476196,0.466718
7,0.4149,1.01357,0.773602,0.494922,0.495012,0.486267
8,0.3387,1.001805,0.774519,0.528371,0.509116,0.50089
9,0.278,1.003636,0.775435,0.592619,0.535255,0.541886
10,0.2296,1.013578,0.780018,0.618266,0.556763,0.564935


[I 2025-03-15 19:15:34,294] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.4982086432155468e-06, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8281,3.76717,0.169569,0.028655,0.021744,0.010808
2,3.7204,3.6774,0.208066,0.021155,0.029835,0.015985
3,3.6383,3.601206,0.208066,0.015872,0.029315,0.016829
4,3.5656,3.530887,0.217232,0.058733,0.032126,0.021555
5,3.4965,3.464697,0.252979,0.049745,0.042739,0.034375


[I 2025-03-15 19:17:39,941] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 6.639623079859462e-06, 'weight_decay': 0.001, 'adam_beta1': 0.96, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6679,3.48085,0.233731,0.050931,0.036986,0.027908
2,3.3165,3.166579,0.362053,0.066615,0.077716,0.05541
3,3.0245,2.916885,0.43538,0.111535,0.107559,0.092148
4,2.7854,2.70486,0.477544,0.106412,0.131265,0.110208
5,2.5737,2.519529,0.503208,0.122073,0.144224,0.120137


[I 2025-03-15 19:19:45,277] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 1.2001988398838804e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.518,3.20522,0.350137,0.055049,0.074043,0.051524
2,2.946,2.74173,0.477544,0.105751,0.133018,0.110928
3,2.5186,2.385682,0.528873,0.170257,0.163522,0.142114
4,2.1823,2.115238,0.589368,0.248729,0.218283,0.20186
5,1.9088,1.901465,0.622365,0.274511,0.246825,0.230142
6,1.6946,1.733898,0.672777,0.304223,0.287003,0.26853
7,1.5157,1.599418,0.689276,0.314555,0.309381,0.289547
8,1.3732,1.496353,0.708524,0.362585,0.349149,0.330063
9,1.2532,1.410484,0.716774,0.38271,0.354997,0.338563
10,1.1494,1.346732,0.726856,0.416617,0.38185,0.367937


[I 2025-03-15 19:23:59,479] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 1.577858185676612e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 12}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.438,3.067446,0.388634,0.099407,0.085457,0.06706
2,2.7517,2.519074,0.507791,0.121991,0.146988,0.122942
3,2.2533,2.129996,0.591201,0.266787,0.221865,0.205547
4,1.8853,1.847402,0.643446,0.317796,0.267326,0.252052
5,1.6001,1.637958,0.688359,0.330592,0.309501,0.294627
6,1.3847,1.484595,0.709441,0.32419,0.336137,0.317289
7,1.2102,1.371491,0.72319,0.371356,0.361823,0.345801
8,1.0785,1.293762,0.744271,0.395413,0.4041,0.38596
9,0.9701,1.232378,0.748854,0.420703,0.426944,0.410746
10,0.876,1.188044,0.752521,0.474565,0.441655,0.430435


[I 2025-03-15 19:29:05,127] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 5.635479708422883e-06, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 6}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6844,3.517073,0.215399,0.059229,0.031489,0.020824
2,3.374,3.237232,0.352887,0.07487,0.074986,0.052266
3,3.1137,3.012577,0.409716,0.098528,0.092812,0.076391
4,2.8988,2.821987,0.455545,0.10717,0.11953,0.102494
5,2.7089,2.654968,0.485793,0.106385,0.135661,0.112958


[I 2025-03-15 19:31:39,283] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 5.27784544496764e-05, 'weight_decay': 0.003, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8083,2.047607,0.60495,0.271546,0.236932,0.22121
2,1.4944,1.37899,0.726856,0.379249,0.371657,0.355182
3,0.9267,1.160872,0.758937,0.456119,0.43961,0.426242
4,0.6383,1.067515,0.768103,0.488189,0.468614,0.460067
5,0.4631,1.027599,0.772686,0.5007,0.496194,0.486961
6,0.3458,1.007094,0.773602,0.555935,0.523478,0.523892
7,0.2571,1.009299,0.777269,0.591353,0.536645,0.540437
8,0.2003,1.008855,0.784601,0.644614,0.578214,0.593452
9,0.1597,1.033532,0.787351,0.667741,0.60205,0.616064
10,0.1269,1.058804,0.786434,0.712476,0.621315,0.643494


[I 2025-03-15 19:42:15,324] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.00012841880767531, 'weight_decay': 0.001, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0921,1.290634,0.738772,0.385464,0.392653,0.369899
2,0.6656,1.033666,0.767186,0.475244,0.477607,0.469061
3,0.308,0.994893,0.779102,0.598345,0.566967,0.56721
4,0.1616,1.02046,0.791934,0.689133,0.616881,0.63631
5,0.0943,1.04497,0.791934,0.721774,0.648095,0.663717
6,0.0616,1.123988,0.782768,0.752811,0.641055,0.671302
7,0.0416,1.185724,0.791017,0.791623,0.688428,0.710994
8,0.0313,1.19825,0.800183,0.798203,0.71784,0.734623
9,0.0252,1.245417,0.794684,0.791395,0.712059,0.732071
10,0.0181,1.323773,0.789184,0.812131,0.709286,0.733499


[I 2025-03-15 19:52:35,669] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.0003261896776611827, 'weight_decay': 0.003, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 37}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3916,1.015563,0.773602,0.49401,0.500846,0.48494
2,0.2135,1.066737,0.783685,0.735549,0.667635,0.676495
3,0.0746,1.185036,0.790101,0.746064,0.739786,0.726835
4,0.0397,1.210986,0.791017,0.796874,0.695829,0.719563
5,0.0267,1.243421,0.79835,0.779564,0.719003,0.729639
6,0.0198,1.294853,0.799267,0.801863,0.710933,0.72803
7,0.0146,1.446721,0.781852,0.761606,0.689506,0.705208
8,0.0126,1.428022,0.791017,0.797977,0.718841,0.732409
9,0.009,1.434727,0.791017,0.765918,0.691926,0.712911
10,0.0076,1.504764,0.788268,0.756271,0.711473,0.708808


[I 2025-03-15 20:03:08,265] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 3.5590132604984735e-05, 'weight_decay': 0.001, 'adam_beta1': 0.97, 'warmup_steps': 49}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1321,2.510415,0.500458,0.116679,0.143112,0.118432
2,2.0042,1.756272,0.681027,0.341124,0.301523,0.286849
3,1.3662,1.393194,0.721357,0.346478,0.361893,0.340934
4,0.9963,1.216884,0.742438,0.391722,0.396933,0.374981
5,0.766,1.123113,0.759853,0.470034,0.450826,0.439427
6,0.6121,1.066812,0.768103,0.502409,0.474088,0.466103
7,0.4933,1.032897,0.771769,0.498869,0.491994,0.485765
8,0.4108,1.016768,0.773602,0.481445,0.496145,0.480947
9,0.3418,1.004886,0.773602,0.519367,0.505438,0.50057
10,0.2858,1.005497,0.776352,0.603822,0.531833,0.539771


[I 2025-03-15 20:13:14,002] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 6.221860592744965e-05, 'weight_decay': 0.004, 'adam_beta1': 0.92, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6644,1.867824,0.63428,0.290563,0.262346,0.246877
2,1.2995,1.27079,0.746104,0.406841,0.407151,0.386309
3,0.7772,1.103319,0.768103,0.474195,0.465939,0.448104
4,0.521,1.034183,0.766269,0.473539,0.477752,0.468557
5,0.3621,1.004436,0.769019,0.53454,0.510236,0.507868
6,0.2598,1.004044,0.781852,0.596013,0.557661,0.563668
7,0.1871,1.026653,0.782768,0.64263,0.578725,0.593554
8,0.1431,1.029493,0.790101,0.668006,0.611261,0.625813
9,0.1121,1.063238,0.786434,0.680567,0.610919,0.62612
10,0.0873,1.106198,0.784601,0.724305,0.639814,0.661283


[I 2025-03-15 20:24:12,034] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.00033236239862177063, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 45}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.333,1.003325,0.774519,0.483513,0.492872,0.475628
2,0.1994,1.079759,0.784601,0.722475,0.635924,0.658828
3,0.072,1.146371,0.796517,0.800383,0.732839,0.747129
4,0.036,1.264172,0.797434,0.834719,0.714626,0.750067
5,0.0254,1.240007,0.799267,0.801909,0.701405,0.731451
6,0.018,1.325714,0.802933,0.835235,0.718061,0.753269
7,0.0137,1.495553,0.779102,0.788596,0.714663,0.730713
8,0.0127,1.466256,0.794684,0.805968,0.723509,0.742602
9,0.0104,1.598888,0.777269,0.788865,0.698032,0.721095
10,0.0079,1.559587,0.791017,0.789035,0.722463,0.730372


[I 2025-03-15 20:39:34,470] Trial 18 finished with value: 0.7095491000571279 and parameters: {'learning_rate': 0.00033236239862177063, 'weight_decay': 0.0, 'adam_beta1': 0.91, 'warmup_steps': 45}. Best is trial 18 with value: 0.7095491000571279.


Trial 19 with params: {'learning_rate': 0.0001885868710330995, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7618,1.106892,0.767186,0.434876,0.453564,0.432211
2,0.4228,0.997398,0.779102,0.576723,0.540282,0.540795
3,0.1644,1.031646,0.791017,0.715873,0.63803,0.659793
4,0.0801,1.088272,0.790101,0.786082,0.675245,0.710852
5,0.0473,1.159673,0.792851,0.781179,0.691055,0.71866
6,0.0335,1.247871,0.786434,0.793928,0.671261,0.708902
7,0.022,1.301323,0.788268,0.789726,0.682305,0.713468
8,0.0181,1.321424,0.788268,0.79175,0.714405,0.732323
9,0.016,1.389257,0.790101,0.782838,0.698515,0.720692
10,0.0107,1.436843,0.789184,0.808398,0.720292,0.73999


[I 2025-03-15 20:55:11,746] Trial 19 finished with value: 0.7293782518516025 and parameters: {'learning_rate': 0.0001885868710330995, 'weight_decay': 0.0, 'adam_beta1': 0.92, 'warmup_steps': 47}. Best is trial 19 with value: 0.7293782518516025.


Trial 20 with params: {'learning_rate': 0.00019671164081178758, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 49}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7117,1.087168,0.76077,0.417537,0.446871,0.421858
2,0.3998,0.994715,0.783685,0.603763,0.557557,0.564897
3,0.1535,1.02517,0.792851,0.747009,0.657746,0.680321
4,0.0733,1.088808,0.789184,0.762234,0.664006,0.696375
5,0.0436,1.176934,0.7956,0.788408,0.706269,0.730901
6,0.0308,1.229449,0.784601,0.782159,0.665636,0.700621
7,0.0201,1.28631,0.791934,0.746222,0.669789,0.689779
8,0.0167,1.31042,0.791017,0.795618,0.717965,0.735032
9,0.0147,1.325223,0.787351,0.801403,0.715271,0.736571
10,0.011,1.42781,0.786434,0.815206,0.703819,0.731925


[I 2025-03-15 21:05:37,787] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 6.692771043764605e-05, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 49}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6454,1.820614,0.654445,0.296389,0.28128,0.267993
2,1.2424,1.237769,0.742438,0.402995,0.402565,0.383434
3,0.7267,1.084884,0.765353,0.4447,0.46651,0.446404
4,0.4787,1.022774,0.769019,0.473594,0.483103,0.472806
5,0.3245,0.999379,0.775435,0.556285,0.520718,0.518452
6,0.2282,1.007837,0.789184,0.657801,0.585006,0.60204
7,0.1619,1.036838,0.787351,0.676877,0.610132,0.626865
8,0.1225,1.038494,0.791017,0.683872,0.624066,0.637959
9,0.0953,1.077365,0.786434,0.67899,0.607269,0.624735
10,0.0737,1.129463,0.785518,0.731533,0.650182,0.666795


[I 2025-03-15 21:16:14,632] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 0.00033322985870060107, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3235,1.007401,0.779102,0.51492,0.516108,0.504191
2,0.1989,1.068182,0.787351,0.730961,0.679215,0.687097
3,0.0718,1.204379,0.787351,0.758063,0.735313,0.731556
4,0.0378,1.236222,0.7956,0.816665,0.717763,0.748263
5,0.0254,1.332075,0.791934,0.781751,0.710425,0.722848
6,0.0199,1.368381,0.7956,0.800132,0.72229,0.736602
7,0.0142,1.448631,0.783685,0.790442,0.700948,0.717226
8,0.0119,1.46504,0.782768,0.780825,0.689671,0.70723
9,0.0101,1.549994,0.779102,0.777309,0.71108,0.720557
10,0.0076,1.603639,0.781852,0.773912,0.730199,0.729329


[I 2025-03-15 21:26:43,370] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 0.00025227866809873626, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4828,1.028372,0.768103,0.437391,0.462936,0.442513
2,0.2823,1.013941,0.783685,0.690228,0.623191,0.639235
3,0.1012,1.105249,0.796517,0.769195,0.676353,0.701982
4,0.05,1.151805,0.8011,0.834723,0.699904,0.741974
5,0.0324,1.203143,0.802016,0.78552,0.710887,0.730607
6,0.0232,1.310153,0.784601,0.808203,0.6955,0.730582
7,0.0171,1.359257,0.793767,0.801669,0.713174,0.734022
8,0.0132,1.361596,0.79835,0.793151,0.72752,0.738203
9,0.0132,1.406247,0.785518,0.783688,0.702108,0.721136
10,0.0096,1.498919,0.788268,0.800705,0.71798,0.73706


[I 2025-03-15 21:37:03,324] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.0003159078969519084, 'weight_decay': 0.0, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4331,1.017331,0.772686,0.493664,0.495554,0.4811
2,0.2229,1.063717,0.783685,0.739332,0.666617,0.678182
3,0.0784,1.134166,0.792851,0.798712,0.735757,0.752997
4,0.0404,1.185715,0.802933,0.80407,0.728038,0.742955
5,0.0264,1.300021,0.787351,0.78363,0.718017,0.734415
6,0.0194,1.312599,0.794684,0.798902,0.712221,0.725761
7,0.0145,1.411966,0.785518,0.771265,0.714233,0.716459
8,0.0124,1.427894,0.785518,0.778986,0.705402,0.710105
9,0.0105,1.516367,0.783685,0.76747,0.707711,0.719539
10,0.008,1.543465,0.791017,0.788013,0.726808,0.733974


[I 2025-03-15 21:47:32,689] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.00048446272517392336, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1174,0.97026,0.790101,0.576787,0.568899,0.558608
2,0.1305,1.130399,0.782768,0.755671,0.672607,0.694595
3,0.0513,1.256247,0.783685,0.758938,0.715798,0.722223
4,0.0313,1.372818,0.785518,0.79988,0.689285,0.726349
5,0.0211,1.47604,0.772686,0.780938,0.717914,0.727421
6,0.0183,1.439839,0.785518,0.796146,0.701585,0.723265
7,0.0142,1.5244,0.783685,0.780277,0.689377,0.711391
8,0.0103,1.600577,0.768103,0.786636,0.710888,0.727766
9,0.0095,1.763174,0.764436,0.822301,0.678056,0.718699
10,0.0088,1.764102,0.76352,0.773673,0.677125,0.698395


[I 2025-03-15 21:52:38,324] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.0002839050984979592, 'weight_decay': 0.002, 'adam_beta1': 0.92, 'warmup_steps': 48}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4602,1.024715,0.769936,0.466006,0.477354,0.458433
2,0.2467,1.050202,0.794684,0.738425,0.654285,0.674106
3,0.0867,1.157279,0.789184,0.784642,0.725564,0.739725
4,0.0438,1.2216,0.788268,0.787593,0.694904,0.719129
5,0.0295,1.284423,0.793767,0.787736,0.704574,0.729469
6,0.0211,1.336457,0.7956,0.777146,0.693288,0.715129
7,0.0157,1.429591,0.784601,0.798793,0.683921,0.716521
8,0.0129,1.440112,0.787351,0.798932,0.704419,0.726258
9,0.0113,1.477205,0.796517,0.809408,0.736848,0.755253
10,0.0093,1.54837,0.780018,0.794245,0.725512,0.737981


[I 2025-03-15 22:03:03,925] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 5.316302127492754e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 36}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8095,2.045164,0.605866,0.272239,0.237578,0.221823
2,1.4901,1.376288,0.725023,0.354461,0.365943,0.345687
3,0.9222,1.158988,0.758937,0.45849,0.43961,0.426217
4,0.6343,1.066763,0.769019,0.487616,0.469025,0.460062
5,0.4594,1.027455,0.772686,0.500912,0.496194,0.487101
6,0.3426,1.007323,0.773602,0.575713,0.524748,0.527235
7,0.2542,1.00993,0.777269,0.591052,0.536645,0.540277
8,0.1977,1.009111,0.783685,0.642501,0.578314,0.592713
9,0.1574,1.034934,0.786434,0.668094,0.601686,0.615965
10,0.1249,1.061218,0.786434,0.712476,0.621315,0.643494


[I 2025-03-15 22:08:10,834] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.0003898152024004867, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 10}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2463,1.04208,0.774519,0.54875,0.539158,0.526853
2,0.1779,1.099286,0.789184,0.725423,0.668886,0.677882
3,0.0655,1.24415,0.781852,0.752234,0.718355,0.720524
4,0.0345,1.318205,0.791017,0.793225,0.715,0.733433
5,0.0232,1.374488,0.790101,0.805535,0.727654,0.746984
6,0.0183,1.445723,0.773602,0.788912,0.695596,0.717655
7,0.0131,1.511199,0.779102,0.748357,0.702699,0.706477
8,0.0122,1.554726,0.778185,0.762733,0.706959,0.718601
9,0.0117,1.590509,0.792851,0.797205,0.704456,0.729457
10,0.0089,1.611882,0.783685,0.794802,0.704588,0.726317


[I 2025-03-15 22:23:49,085] Trial 28 finished with value: 0.6960462624930169 and parameters: {'learning_rate': 0.0003898152024004867, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 10}. Best is trial 19 with value: 0.7293782518516025.


Trial 29 with params: {'learning_rate': 0.00025682128316432246, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6495,1.094053,0.748854,0.466516,0.455685,0.437316
2,0.3214,1.054227,0.783685,0.677948,0.626052,0.634821
3,0.1135,1.123925,0.782768,0.7468,0.679551,0.691869
4,0.0562,1.212044,0.786434,0.787061,0.683446,0.715011
5,0.0358,1.271371,0.788268,0.784491,0.725155,0.735671
6,0.0241,1.318774,0.788268,0.799127,0.710977,0.737665
7,0.0184,1.390123,0.791017,0.772217,0.718419,0.728939
8,0.0142,1.382954,0.785518,0.781731,0.701331,0.72057
9,0.0114,1.424714,0.788268,0.784259,0.715317,0.73468
10,0.0097,1.532331,0.780935,0.762916,0.721341,0.72029


[I 2025-03-15 22:39:19,391] Trial 29 finished with value: 0.7312857459802197 and parameters: {'learning_rate': 0.00025682128316432246, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 17}. Best is trial 29 with value: 0.7312857459802197.


Trial 30 with params: {'learning_rate': 0.00010867727717767609, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4948,1.644828,0.666361,0.32061,0.292672,0.278166
2,0.9804,1.111855,0.751604,0.442625,0.437244,0.415043
3,0.4787,1.018669,0.769936,0.527403,0.5074,0.502519
4,0.2659,1.019922,0.780018,0.64712,0.578632,0.593813
5,0.1586,1.072082,0.779102,0.712248,0.631681,0.64808
6,0.1002,1.127903,0.777269,0.74142,0.645769,0.665252
7,0.0684,1.157525,0.787351,0.75341,0.668721,0.687592
8,0.0499,1.172029,0.791934,0.799457,0.700268,0.725848
9,0.0379,1.209348,0.792851,0.771054,0.706845,0.72033
10,0.0294,1.270574,0.790101,0.774654,0.718415,0.725732


[I 2025-03-15 22:54:53,318] Trial 30 finished with value: 0.7348231007366749 and parameters: {'learning_rate': 0.00010867727717767609, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 26}. Best is trial 30 with value: 0.7348231007366749.


Trial 31 with params: {'learning_rate': 5.416464753898959e-05, 'weight_decay': 0.007, 'adam_beta1': 0.99, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9456,2.283973,0.515124,0.193392,0.16895,0.154026
2,1.6926,1.485265,0.709441,0.372059,0.345579,0.326999
3,1.0232,1.18998,0.748854,0.415357,0.410271,0.393129
4,0.6875,1.068546,0.766269,0.43991,0.461087,0.442363
5,0.4907,1.031021,0.772686,0.480328,0.488084,0.476257
6,0.3602,1.008843,0.769019,0.470876,0.493291,0.477668
7,0.2666,1.002442,0.779102,0.62718,0.5533,0.563441
8,0.2071,1.012375,0.781852,0.643569,0.574397,0.587679
9,0.1626,1.026069,0.784601,0.666899,0.609996,0.625429
10,0.1291,1.051076,0.788268,0.732149,0.647149,0.664724


[I 2025-03-15 23:10:45,557] Trial 31 finished with value: 0.7351045121613455 and parameters: {'learning_rate': 5.416464753898959e-05, 'weight_decay': 0.007, 'adam_beta1': 0.99, 'warmup_steps': 17}. Best is trial 31 with value: 0.7351045121613455.


Trial 32 with params: {'learning_rate': 0.0003098517363425933, 'weight_decay': 0.007, 'adam_beta1': 0.99, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6621,1.151937,0.734189,0.415586,0.434383,0.410713
2,0.3088,1.154022,0.769019,0.640229,0.605613,0.608737
3,0.1012,1.219595,0.779102,0.688662,0.678395,0.673228
4,0.0486,1.329433,0.780935,0.791686,0.695698,0.726287
5,0.0311,1.358531,0.788268,0.785953,0.710721,0.731228
6,0.0198,1.432772,0.781852,0.801665,0.697701,0.730178
7,0.0161,1.553298,0.774519,0.74402,0.669875,0.688887
8,0.0126,1.507646,0.784601,0.76867,0.700788,0.712675
9,0.0106,1.572703,0.785518,0.79073,0.687617,0.718563
10,0.0092,1.69255,0.774519,0.753579,0.704421,0.713421


[I 2025-03-15 23:20:54,562] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 5.509905487567882e-05, 'weight_decay': 0.007, 'adam_beta1': 0.99, 'warmup_steps': 11}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9198,2.254354,0.542621,0.204683,0.190185,0.172202
2,1.6615,1.463946,0.712191,0.368954,0.347623,0.326035
3,0.9997,1.179035,0.748854,0.420333,0.415621,0.399314
4,0.67,1.062375,0.767186,0.442359,0.46525,0.446336
5,0.4766,1.02891,0.770852,0.476336,0.486064,0.473979


[I 2025-03-15 23:23:27,880] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 2.773343681700955e-05, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2957,2.845498,0.407883,0.08714,0.09642,0.0782
2,2.408,2.139469,0.584785,0.209669,0.216151,0.193331
3,1.7752,1.681393,0.684693,0.313798,0.30452,0.287596
4,1.3457,1.412168,0.718607,0.387553,0.361975,0.346864
5,1.0562,1.260748,0.736939,0.388936,0.3905,0.367212
6,0.8591,1.166736,0.748854,0.415086,0.41508,0.397524
7,0.7113,1.099689,0.764436,0.467124,0.458791,0.447521
8,0.6083,1.064897,0.767186,0.484871,0.478432,0.46401
9,0.5229,1.043329,0.767186,0.478511,0.484059,0.472815
10,0.4523,1.017947,0.769019,0.470526,0.485044,0.471312


[I 2025-03-15 23:28:37,853] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 8.753744400903284e-05, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.636,1.837575,0.594867,0.228513,0.226714,0.202685
2,1.1795,1.190097,0.744271,0.414047,0.400249,0.375184
3,0.6177,1.034662,0.767186,0.471513,0.477358,0.461114
4,0.3665,1.002346,0.769019,0.550471,0.517285,0.518802
5,0.2301,1.028895,0.776352,0.629785,0.585175,0.591832
6,0.1514,1.062239,0.780935,0.716245,0.631841,0.651237
7,0.1043,1.079819,0.790101,0.73503,0.656823,0.674301
8,0.0757,1.09155,0.793767,0.742484,0.675302,0.68909
9,0.0572,1.141469,0.791017,0.777869,0.705992,0.722303
10,0.0443,1.187745,0.784601,0.766623,0.711823,0.720059


[I 2025-03-15 23:44:15,752] Trial 35 finished with value: 0.7389568059039658 and parameters: {'learning_rate': 8.753744400903284e-05, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 20}. Best is trial 35 with value: 0.7389568059039658.


Trial 36 with params: {'learning_rate': 7.900156280003128e-05, 'weight_decay': 0.003, 'adam_beta1': 0.98, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6131,1.783625,0.651696,0.307204,0.281012,0.267434
2,1.1616,1.188527,0.746104,0.398259,0.406519,0.384117
3,0.6368,1.040911,0.766269,0.463883,0.46962,0.458617
4,0.3948,0.987231,0.769936,0.49688,0.497897,0.487629
5,0.2542,1.005335,0.785518,0.650023,0.576729,0.590078
6,0.1718,1.026621,0.784601,0.696039,0.609666,0.632547
7,0.1199,1.038676,0.784601,0.690724,0.62931,0.641368
8,0.0886,1.059059,0.793767,0.772659,0.690163,0.710678
9,0.0678,1.120232,0.785518,0.753742,0.677565,0.697538
10,0.0521,1.154973,0.784601,0.77825,0.699172,0.718755


[I 2025-03-15 23:59:57,650] Trial 36 finished with value: 0.734666875180893 and parameters: {'learning_rate': 7.900156280003128e-05, 'weight_decay': 0.003, 'adam_beta1': 0.98, 'warmup_steps': 22}. Best is trial 35 with value: 0.7389568059039658.


Trial 37 with params: {'learning_rate': 5.977398091589551e-05, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9131,2.216671,0.545371,0.204007,0.192548,0.172902
2,1.6009,1.420938,0.71769,0.398115,0.36691,0.350137
3,0.9393,1.152544,0.751604,0.429548,0.424108,0.41045
4,0.6178,1.048143,0.768103,0.473872,0.473136,0.460209
5,0.4301,1.02137,0.769936,0.481761,0.487363,0.478659
6,0.3076,1.010061,0.769936,0.585596,0.525005,0.532138
7,0.2233,1.010886,0.780018,0.652834,0.576229,0.591227
8,0.1711,1.022041,0.790101,0.69435,0.62037,0.640188
9,0.1328,1.042668,0.784601,0.715188,0.637491,0.656142
10,0.1042,1.071215,0.785518,0.722,0.64721,0.659739


[I 2025-03-16 00:05:16,379] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.00010220732101943805, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5069,1.680324,0.658112,0.278076,0.281256,0.264116
2,1.021,1.12205,0.752521,0.454048,0.443334,0.424341
3,0.508,1.021016,0.769019,0.506153,0.488934,0.479565
4,0.2877,1.012977,0.778185,0.630972,0.562143,0.576711
5,0.1737,1.061021,0.780935,0.697494,0.623061,0.638706
6,0.1108,1.105792,0.780935,0.74373,0.648215,0.667574
7,0.0754,1.133121,0.787351,0.747906,0.651942,0.671631
8,0.0549,1.147972,0.791934,0.799234,0.701135,0.726151
9,0.0416,1.194702,0.794684,0.76948,0.706655,0.720282
10,0.0323,1.240781,0.791017,0.776041,0.712862,0.721507


[I 2025-03-16 00:21:18,658] Trial 38 finished with value: 0.7365573554528182 and parameters: {'learning_rate': 0.00010220732101943805, 'weight_decay': 0.003, 'adam_beta1': 0.99, 'warmup_steps': 17}. Best is trial 35 with value: 0.7389568059039658.


Trial 39 with params: {'learning_rate': 1.9867411486560037e-05, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4084,3.04113,0.35472,0.056673,0.074321,0.050397
2,2.6908,2.447186,0.51604,0.188456,0.159896,0.142024
3,2.141,2.013415,0.598533,0.270064,0.230126,0.209689
4,1.7322,1.706566,0.679193,0.333211,0.299829,0.285541
5,1.423,1.498194,0.710357,0.359478,0.346341,0.330053
6,1.1977,1.354627,0.729606,0.37862,0.380099,0.36385
7,1.0221,1.25808,0.739688,0.380745,0.393291,0.370955
8,0.8947,1.189439,0.747938,0.416785,0.411785,0.390836
9,0.7908,1.145042,0.754354,0.431817,0.43107,0.416
10,0.7018,1.104107,0.758937,0.454892,0.450829,0.436162


[I 2025-03-16 00:31:52,915] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.00012520413499433398, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1986,1.345369,0.735105,0.385426,0.384246,0.364829
2,0.7247,1.043304,0.769936,0.498292,0.487701,0.482039
3,0.3374,1.007115,0.774519,0.598359,0.552988,0.556588
4,0.1808,1.018855,0.787351,0.68911,0.623551,0.639847
5,0.1049,1.061672,0.783685,0.725606,0.629967,0.653327
6,0.0671,1.106026,0.789184,0.78718,0.685027,0.710736
7,0.0463,1.166955,0.7956,0.778921,0.694746,0.712915
8,0.0347,1.189559,0.797434,0.788909,0.705457,0.724522
9,0.0275,1.251066,0.791934,0.781988,0.708613,0.723745
10,0.0201,1.309275,0.786434,0.77798,0.705749,0.717081


[I 2025-03-16 00:47:42,105] Trial 40 finished with value: 0.7430654138816002 and parameters: {'learning_rate': 0.00012520413499433398, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 31}. Best is trial 40 with value: 0.7430654138816002.


Trial 41 with params: {'learning_rate': 0.00013410300578029893, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1514,1.306638,0.740605,0.410568,0.397153,0.377861
2,0.6754,1.029786,0.769936,0.486509,0.486457,0.47897
3,0.3055,1.00843,0.778185,0.621759,0.568463,0.575114
4,0.1603,1.035089,0.784601,0.700207,0.629022,0.644135
5,0.0924,1.068859,0.788268,0.741795,0.646585,0.66984
6,0.0594,1.117513,0.791934,0.79009,0.695526,0.719495
7,0.0407,1.193617,0.790101,0.774201,0.689761,0.708613
8,0.0308,1.203965,0.800183,0.792824,0.716205,0.730334
9,0.0243,1.271207,0.791017,0.77218,0.704615,0.721456
10,0.018,1.312541,0.787351,0.789728,0.710929,0.724142


[I 2025-03-16 01:03:34,068] Trial 41 finished with value: 0.7349632761929046 and parameters: {'learning_rate': 0.00013410300578029893, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 35}. Best is trial 40 with value: 0.7430654138816002.


Trial 42 with params: {'learning_rate': 0.00025101073365780825, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.578,1.044763,0.771769,0.46532,0.468641,0.452378
2,0.3007,1.008945,0.787351,0.663783,0.617899,0.623395
3,0.1085,1.080773,0.794684,0.732491,0.691576,0.697091
4,0.053,1.151122,0.791017,0.812262,0.696714,0.732139
5,0.0328,1.205364,0.796517,0.80208,0.718995,0.744276
6,0.0243,1.27461,0.797434,0.790893,0.71102,0.733105
7,0.0173,1.369667,0.783685,0.766105,0.719339,0.723136
8,0.0144,1.343381,0.787351,0.788781,0.714634,0.731993
9,0.0108,1.443368,0.7956,0.798703,0.734612,0.747256
10,0.0095,1.503361,0.787351,0.812538,0.722232,0.742097


[I 2025-03-16 01:19:05,184] Trial 42 finished with value: 0.7297447610019583 and parameters: {'learning_rate': 0.00025101073365780825, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 38}. Best is trial 40 with value: 0.7430654138816002.


Trial 43 with params: {'learning_rate': 8.649930997204209e-05, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 50}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5451,1.658605,0.681027,0.330316,0.305572,0.288508
2,1.0515,1.146352,0.756187,0.430004,0.430963,0.410873
3,0.5634,1.03205,0.767186,0.492306,0.491841,0.481286
4,0.3411,0.992228,0.779102,0.563495,0.528845,0.529882
5,0.2147,1.013871,0.783685,0.64466,0.595893,0.604379
6,0.1434,1.052479,0.785518,0.684874,0.609946,0.630191
7,0.0989,1.064534,0.789184,0.72445,0.64721,0.664724
8,0.073,1.089201,0.791934,0.761475,0.679729,0.697475
9,0.0558,1.15394,0.788268,0.778934,0.695796,0.718749
10,0.0424,1.194259,0.787351,0.782036,0.706557,0.723435


[I 2025-03-16 01:29:28,370] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 0.00019272781094329593, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0064,1.261238,0.724106,0.379078,0.378595,0.361975
2,0.5344,1.057373,0.773602,0.569102,0.53882,0.53785
3,0.2058,1.082224,0.780018,0.68662,0.652667,0.655895
4,0.0981,1.163652,0.783685,0.737701,0.663964,0.681729
5,0.0566,1.212169,0.783685,0.767862,0.679323,0.701901
6,0.0367,1.27994,0.784601,0.793922,0.710541,0.73505
7,0.0249,1.370212,0.783685,0.787423,0.715066,0.732825
8,0.019,1.345386,0.787351,0.787776,0.708527,0.730101
9,0.0145,1.389816,0.791017,0.802019,0.708806,0.737102
10,0.0123,1.48726,0.789184,0.780577,0.732859,0.736124


[I 2025-03-16 01:45:00,729] Trial 44 finished with value: 0.7337300048615195 and parameters: {'learning_rate': 0.00019272781094329593, 'weight_decay': 0.002, 'adam_beta1': 0.99, 'warmup_steps': 18}. Best is trial 40 with value: 0.7430654138816002.


Trial 45 with params: {'learning_rate': 3.5051177072717587e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0768,2.447258,0.525206,0.166784,0.159462,0.139589
2,1.9557,1.725543,0.68286,0.31156,0.3033,0.287055
3,1.3426,1.380131,0.729606,0.397034,0.381645,0.366643
4,0.9875,1.214214,0.746104,0.393208,0.404382,0.384301
5,0.7641,1.124489,0.75802,0.469899,0.451815,0.439936


[I 2025-03-16 01:47:34,927] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 4.3515782249930405e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.97, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9984,2.31453,0.553621,0.234957,0.191262,0.178171
2,1.7719,1.565323,0.703025,0.339902,0.322779,0.301791
3,1.1461,1.263905,0.741522,0.381604,0.395149,0.375683
4,0.8096,1.128366,0.761687,0.437232,0.442448,0.42779
5,0.6061,1.068694,0.767186,0.481716,0.466425,0.453993
6,0.4714,1.028353,0.768103,0.459622,0.477777,0.462616
7,0.3656,1.005507,0.769936,0.508407,0.507788,0.498592
8,0.2935,1.003179,0.776352,0.592425,0.535657,0.540511
9,0.238,1.007307,0.778185,0.594975,0.539734,0.546314
10,0.1941,1.020983,0.781852,0.648165,0.587179,0.597365


[I 2025-03-16 01:58:15,865] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.0003753823772443784, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4171,1.072422,0.76352,0.527198,0.522332,0.509543
2,0.2051,1.167029,0.774519,0.716385,0.648689,0.65863
3,0.0709,1.267442,0.772686,0.766414,0.708635,0.721804
4,0.0374,1.318705,0.780018,0.781957,0.720866,0.732151
5,0.0246,1.415661,0.780935,0.802389,0.714706,0.739336
6,0.0171,1.462827,0.777269,0.807553,0.680601,0.714276
7,0.0141,1.533412,0.777269,0.781936,0.702427,0.721583
8,0.0123,1.556657,0.780935,0.807604,0.703252,0.729989
9,0.0102,1.625016,0.772686,0.7604,0.715969,0.722702
10,0.0087,1.630892,0.785518,0.811944,0.717842,0.73783


[I 2025-03-16 02:08:40,203] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.00040740464809269436, 'weight_decay': 0.01, 'adam_beta1': 0.98, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4232,1.087546,0.764436,0.492642,0.50362,0.490681
2,0.1942,1.127852,0.782768,0.718556,0.657734,0.670288
3,0.0672,1.284736,0.765353,0.760722,0.703979,0.715226
4,0.0349,1.306137,0.796517,0.794402,0.712338,0.7285
5,0.0238,1.374176,0.784601,0.787586,0.687329,0.713692
6,0.018,1.489161,0.775435,0.775755,0.691762,0.714908
7,0.0126,1.540925,0.779102,0.742204,0.700586,0.702788
8,0.0119,1.55708,0.780935,0.776154,0.696272,0.715881
9,0.0091,1.74379,0.773602,0.766399,0.694772,0.71303
10,0.0083,1.764621,0.76352,0.71668,0.695952,0.691337


[I 2025-03-16 02:13:54,545] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 2.2037250878112672e-06, 'weight_decay': 0.002, 'adam_beta1': 0.97, 'warmup_steps': 46}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8022,3.718979,0.203483,0.015386,0.030689,0.015477
2,3.6555,3.595393,0.208983,0.015042,0.029589,0.016688
3,3.5398,3.485011,0.235564,0.052081,0.037713,0.029123
4,3.4358,3.382716,0.308891,0.044367,0.059738,0.046106
5,3.336,3.290022,0.346471,0.040951,0.071921,0.049736
6,3.2461,3.206956,0.353804,0.07019,0.075349,0.052468
7,3.1633,3.132471,0.370302,0.072345,0.08037,0.059639
8,3.0893,3.065569,0.39505,0.099429,0.088494,0.071286
9,3.0222,3.003909,0.414299,0.095281,0.095375,0.079246
10,2.9596,2.947238,0.428048,0.112803,0.103236,0.088373


[I 2025-03-16 02:24:27,091] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 7.09065670740699e-05, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7671,2.025421,0.577452,0.219492,0.211669,0.189214
2,1.3872,1.295684,0.737855,0.396038,0.395095,0.369298
3,0.7764,1.083825,0.76352,0.473877,0.467137,0.447687
4,0.4923,1.014455,0.767186,0.469194,0.484584,0.469321
5,0.3268,1.012396,0.768103,0.552038,0.517201,0.51272
6,0.2255,1.021441,0.780935,0.644374,0.575007,0.588796
7,0.1588,1.027078,0.783685,0.694489,0.615857,0.63422
8,0.1187,1.040886,0.785518,0.708326,0.631357,0.65113
9,0.0901,1.077026,0.788268,0.73735,0.654881,0.676431
10,0.07,1.116915,0.782768,0.777415,0.705357,0.721444


[I 2025-03-16 02:34:48,677] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.00010095801211097598, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5888,1.745987,0.63703,0.273614,0.263885,0.246342
2,1.0702,1.141792,0.752521,0.436712,0.438658,0.413868
3,0.5311,1.020862,0.76352,0.522187,0.49285,0.483655
4,0.3012,1.016845,0.775435,0.58635,0.547446,0.551201
5,0.1829,1.05517,0.777269,0.714295,0.62071,0.642311
6,0.1174,1.098913,0.784601,0.747493,0.653466,0.674133
7,0.0801,1.135634,0.788268,0.753325,0.65236,0.674097
8,0.058,1.14712,0.794684,0.790518,0.702954,0.725571
9,0.0438,1.204458,0.790101,0.776389,0.706204,0.72167
10,0.0344,1.251051,0.782768,0.766697,0.709459,0.716352


[I 2025-03-16 02:50:25,958] Trial 51 finished with value: 0.7404721838627663 and parameters: {'learning_rate': 0.00010095801211097598, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 38}. Best is trial 40 with value: 0.7430654138816002.


Trial 52 with params: {'learning_rate': 0.00011320008962884753, 'weight_decay': 0.008, 'adam_beta1': 0.99, 'warmup_steps': 40}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5055,1.638037,0.663611,0.308901,0.306864,0.28786
2,0.9639,1.108444,0.756187,0.435648,0.44901,0.4269
3,0.4616,1.017499,0.770852,0.529946,0.514927,0.508146
4,0.2528,1.029009,0.776352,0.665928,0.586114,0.605044
5,0.1497,1.079664,0.781852,0.700354,0.627215,0.641945
6,0.0943,1.141605,0.783685,0.737497,0.652895,0.670008
7,0.064,1.178661,0.785518,0.787374,0.688337,0.713871
8,0.047,1.183417,0.792851,0.801492,0.703194,0.728743
9,0.0357,1.218223,0.790101,0.769479,0.704767,0.717585
10,0.0274,1.277234,0.788268,0.776045,0.714089,0.721217


[I 2025-03-16 03:00:57,161] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 5.443255711584556e-05, 'weight_decay': 0.01, 'adam_beta1': 0.96, 'warmup_steps': 10}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.781,2.030293,0.603116,0.262147,0.239436,0.223698
2,1.4715,1.363388,0.733272,0.382093,0.380942,0.363536
3,0.9014,1.14469,0.761687,0.450864,0.440666,0.427487
4,0.6151,1.054428,0.765353,0.474088,0.465639,0.455474
5,0.4415,1.019332,0.768103,0.487965,0.488208,0.47809
6,0.3268,1.005204,0.769019,0.551408,0.517362,0.519157
7,0.2412,0.998764,0.774519,0.586123,0.53845,0.541696
8,0.1871,1.007763,0.783685,0.648456,0.586916,0.598286
9,0.1485,1.028021,0.785518,0.660647,0.59911,0.613839
10,0.1176,1.056236,0.787351,0.705587,0.622671,0.641449


[I 2025-03-16 03:06:07,242] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 8.383415806965337e-05, 'weight_decay': 0.006, 'adam_beta1': 0.96, 'warmup_steps': 36}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5016,1.634399,0.68286,0.314744,0.305342,0.290488
2,1.0452,1.14102,0.759853,0.443953,0.443028,0.425998
3,0.5692,1.031367,0.772686,0.472524,0.49044,0.475759
4,0.3486,0.991879,0.772686,0.540135,0.514001,0.510031
5,0.2215,1.004774,0.785518,0.640628,0.595526,0.603747
6,0.1488,1.039051,0.787351,0.688217,0.607796,0.629397
7,0.1031,1.055076,0.789184,0.706172,0.640556,0.656202
8,0.0764,1.08259,0.793767,0.769994,0.685272,0.705069
9,0.0587,1.13211,0.793767,0.783278,0.700676,0.722655
10,0.0446,1.183919,0.789184,0.77853,0.701344,0.720711


[I 2025-03-16 03:16:34,767] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 2.2793208043763986e-06, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 37}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7974,3.711179,0.208983,0.01419,0.031131,0.014988
2,3.6458,3.583209,0.208983,0.015411,0.029589,0.016834
3,3.5261,3.468997,0.242896,0.050634,0.039762,0.031224
4,3.4184,3.362981,0.316224,0.042533,0.06202,0.046377
5,3.3153,3.268545,0.348304,0.059587,0.072614,0.049726
6,3.2229,3.183723,0.356554,0.070244,0.076405,0.053333
7,3.1385,3.107968,0.378552,0.097543,0.083805,0.065184
8,3.0631,3.039802,0.406049,0.099123,0.091522,0.074706
9,2.9947,2.976801,0.420715,0.096443,0.099271,0.083854
10,2.9309,2.919061,0.434464,0.111115,0.10738,0.092327


[I 2025-03-16 03:21:47,266] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 0.00010023894871972397, 'weight_decay': 0.005, 'adam_beta1': 0.99, 'warmup_steps': 46}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6209,1.778612,0.625115,0.268386,0.248861,0.232873
2,1.0915,1.148964,0.750687,0.40697,0.431778,0.405699
3,0.54,1.021044,0.764436,0.522898,0.493276,0.484316
4,0.3062,1.01341,0.776352,0.586343,0.547231,0.551096
5,0.1862,1.055514,0.771769,0.678021,0.608673,0.623363
6,0.1197,1.09297,0.786434,0.741302,0.644685,0.668079
7,0.0817,1.134158,0.789184,0.740934,0.654399,0.67407
8,0.0589,1.139983,0.791934,0.781975,0.701511,0.721271
9,0.0443,1.201372,0.787351,0.775985,0.705008,0.721015
10,0.035,1.247103,0.782768,0.772829,0.709185,0.717097


[I 2025-03-16 03:27:03,187] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 9.590754651274163e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3825,1.512476,0.705775,0.323292,0.330486,0.310392
2,0.9207,1.094639,0.766269,0.436752,0.45544,0.437803
3,0.4806,1.010205,0.765353,0.469252,0.490394,0.475104
4,0.2804,1.003725,0.774519,0.600113,0.539145,0.55002
5,0.1725,1.016086,0.783685,0.653506,0.597281,0.608142
6,0.1133,1.061665,0.785518,0.716699,0.611138,0.639142
7,0.0776,1.083814,0.796517,0.765279,0.673369,0.693448
8,0.0569,1.114638,0.797434,0.803819,0.713752,0.734574
9,0.0439,1.166025,0.799267,0.785687,0.707156,0.727865
10,0.0331,1.211129,0.792851,0.801178,0.70822,0.730732


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--precision/155d3220d6cd4a6553f12da68eeb3d1f97cf431206304a4bc6e2d564c29502e9 (last modified on Fri Jan 10 23:13:59 2025) since it couldn't be found locally at evaluate-metric--precision, or remotely on the Hugging Face Hub.
[I 2025-03-16 03:42:44,152] Trial 57 finished with value: 0.739348905242017 and parameters: {'learning_rate': 9.590754651274163e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 33}. Best is trial 40 with value: 0.7430654138816002.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trial 58 with params: {'learning_rate': 0.00012545757026986983, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 24}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.11,1.307781,0.741522,0.389932,0.391141,0.369108
2,0.6822,1.028921,0.771769,0.489229,0.483762,0.478377
3,0.3174,0.992548,0.780018,0.601014,0.566347,0.56691
4,0.1688,1.015873,0.785518,0.673979,0.613545,0.628138
5,0.0989,1.055042,0.790101,0.713954,0.638747,0.654591
6,0.0645,1.119583,0.790101,0.788189,0.678667,0.706923
7,0.0438,1.176871,0.793767,0.819221,0.696357,0.727095
8,0.0329,1.200353,0.799267,0.798605,0.716193,0.734584
9,0.0266,1.25007,0.791017,0.774151,0.711743,0.726133
10,0.0195,1.322959,0.788268,0.802643,0.714857,0.734648


[I 2025-03-16 03:58:07,636] Trial 58 finished with value: 0.7409288034163989 and parameters: {'learning_rate': 0.00012545757026986983, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 24}. Best is trial 40 with value: 0.7430654138816002.


Trial 59 with params: {'learning_rate': 5.813441783758918e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7345,1.952807,0.618698,0.286308,0.249673,0.232647
2,1.3904,1.316074,0.735105,0.375642,0.385096,0.366928
3,0.8408,1.123234,0.761687,0.4478,0.447194,0.431255
4,0.5678,1.040895,0.766269,0.482938,0.476005,0.466822
5,0.4007,1.010764,0.769019,0.494817,0.500349,0.487489


[I 2025-03-16 04:00:46,028] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.0003514535590125288, 'weight_decay': 0.01, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2758,1.01726,0.776352,0.5111,0.522026,0.505983
2,0.1893,1.067336,0.786434,0.708734,0.657381,0.663482
3,0.0682,1.220345,0.780935,0.760871,0.728069,0.728407
4,0.0356,1.235497,0.79835,0.805167,0.712453,0.73839
5,0.0243,1.311132,0.789184,0.812455,0.681327,0.724361
6,0.0191,1.332148,0.799267,0.810619,0.703931,0.734746
7,0.0144,1.42309,0.802016,0.808447,0.719886,0.745898
8,0.0122,1.475518,0.790101,0.785587,0.707751,0.724384
9,0.0104,1.546923,0.783685,0.771791,0.693835,0.713724
10,0.0081,1.530536,0.794684,0.764877,0.713191,0.724103


[I 2025-03-16 04:11:00,197] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.00010215436306208406, 'weight_decay': 0.002, 'adam_beta1': 0.98, 'warmup_steps': 6}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3559,1.522834,0.696609,0.332046,0.326018,0.30602
2,0.9023,1.09086,0.764436,0.458931,0.463525,0.448425
3,0.4578,1.002166,0.767186,0.48682,0.492377,0.482296
4,0.2601,0.995731,0.784601,0.64152,0.575083,0.59085
5,0.1573,1.029016,0.788268,0.6754,0.616926,0.631492
6,0.1013,1.06058,0.792851,0.760674,0.663166,0.687641
7,0.069,1.108843,0.8011,0.789437,0.705987,0.726028
8,0.0505,1.130782,0.7956,0.806327,0.701847,0.7284
9,0.0385,1.191602,0.792851,0.778942,0.698264,0.721858
10,0.0295,1.235317,0.791017,0.793657,0.704875,0.727261


[I 2025-03-16 04:21:40,792] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.00038950536460528504, 'weight_decay': 0.007, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2402,1.000669,0.783685,0.561803,0.550239,0.540996
2,0.1679,1.081595,0.785518,0.700677,0.646374,0.658596
3,0.0641,1.223456,0.791017,0.759193,0.723703,0.724333
4,0.0328,1.297212,0.786434,0.806301,0.681514,0.716855
5,0.0234,1.345515,0.784601,0.777575,0.698443,0.711889
6,0.0181,1.393977,0.791017,0.809236,0.704254,0.732556
7,0.0129,1.490005,0.781852,0.778507,0.689332,0.715536
8,0.0111,1.485947,0.788268,0.758214,0.707273,0.713504
9,0.0105,1.588515,0.786434,0.804804,0.704552,0.727244
10,0.0086,1.633088,0.782768,0.756414,0.705636,0.711919


[I 2025-03-16 04:26:54,919] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 4.467180133544001e-05, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 25}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9695,2.279404,0.567369,0.233364,0.201578,0.18684
2,1.7338,1.537291,0.708524,0.343875,0.329594,0.309297
3,1.1136,1.246871,0.743355,0.402384,0.400637,0.384573
4,0.7836,1.116431,0.76352,0.449845,0.44813,0.43516
5,0.5843,1.061824,0.766269,0.47061,0.466426,0.452637
6,0.4525,1.024903,0.767186,0.478707,0.486018,0.475374
7,0.3488,1.00359,0.767186,0.506803,0.505783,0.496448
8,0.2788,1.003328,0.775435,0.588499,0.53399,0.5384
9,0.2255,1.008886,0.779102,0.619547,0.549734,0.56111
10,0.1834,1.02289,0.781852,0.647108,0.587179,0.597751


[I 2025-03-16 04:42:20,977] Trial 63 finished with value: 0.7304699924097497 and parameters: {'learning_rate': 4.467180133544001e-05, 'weight_decay': 0.01, 'adam_beta1': 0.97, 'warmup_steps': 25}. Best is trial 40 with value: 0.7430654138816002.


Trial 64 with params: {'learning_rate': 1.6488779238415127e-06, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 40}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8207,3.750179,0.184235,0.032173,0.025921,0.01354
2,3.6991,3.64993,0.212649,0.018428,0.030945,0.016257
3,3.6069,3.563762,0.209899,0.036689,0.029827,0.017776
4,3.5257,3.484789,0.234647,0.051315,0.03735,0.028418
5,3.4491,3.411254,0.293309,0.045591,0.054955,0.043483
6,3.3786,3.342406,0.328139,0.042515,0.06603,0.048516
7,3.3117,3.280226,0.349221,0.060623,0.072978,0.050489
8,3.2506,3.223618,0.352887,0.074812,0.074986,0.052203
9,3.195,3.171898,0.359303,0.073678,0.077051,0.054693
10,3.1424,3.124766,0.366636,0.073143,0.079376,0.058125


[I 2025-03-16 04:53:00,286] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.00025146973771310706, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 21}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8166,1.190332,0.728689,0.42109,0.414161,0.399957
2,0.3989,1.113561,0.769936,0.64016,0.58932,0.602031
3,0.1391,1.152823,0.780018,0.688272,0.669798,0.667025
4,0.064,1.25522,0.778185,0.747718,0.662337,0.68105
5,0.0379,1.297097,0.791934,0.785601,0.708973,0.730993
6,0.0265,1.366392,0.778185,0.771318,0.674151,0.702531
7,0.0186,1.395397,0.783685,0.774291,0.701931,0.722656
8,0.0149,1.468507,0.777269,0.756104,0.663187,0.690876
9,0.0129,1.529112,0.781852,0.777143,0.699143,0.722679
10,0.0103,1.581938,0.783685,0.764986,0.718124,0.724148


[I 2025-03-16 05:08:47,693] Trial 65 finished with value: 0.7317753029315681 and parameters: {'learning_rate': 0.00025146973771310706, 'weight_decay': 0.004, 'adam_beta1': 0.99, 'warmup_steps': 21}. Best is trial 40 with value: 0.7430654138816002.


Trial 66 with params: {'learning_rate': 2.0788074719357774e-05, 'weight_decay': 0.007, 'adam_beta1': 0.98, 'warmup_steps': 13}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3645,2.949123,0.418882,0.097129,0.097978,0.081741
2,2.5708,2.316668,0.555454,0.214246,0.18935,0.175662
3,2.0045,1.893938,0.627864,0.28317,0.257123,0.237146
4,1.6054,1.607852,0.696609,0.334042,0.315716,0.298838
5,1.3133,1.424544,0.718607,0.363643,0.365747,0.347313
6,1.1041,1.298201,0.736939,0.405228,0.397671,0.385114
7,0.9413,1.21518,0.747021,0.43889,0.416954,0.401448
8,0.8239,1.157691,0.754354,0.427553,0.431035,0.412936
9,0.7279,1.116269,0.759853,0.465444,0.451358,0.44227
10,0.6449,1.08302,0.76352,0.46156,0.455691,0.444654


[I 2025-03-16 05:18:54,242] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.00019438727075479905, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7733,1.124635,0.762603,0.437195,0.44921,0.42957
2,0.4159,1.012292,0.777269,0.620995,0.566325,0.571393
3,0.1589,1.045737,0.786434,0.697952,0.65371,0.656127
4,0.0776,1.102102,0.794684,0.776556,0.692618,0.714026
5,0.046,1.168948,0.799267,0.813927,0.720394,0.748517
6,0.0311,1.208174,0.794684,0.790956,0.687378,0.713782
7,0.0217,1.279282,0.787351,0.794433,0.691191,0.718242
8,0.0184,1.289861,0.792851,0.791664,0.718285,0.735551
9,0.0143,1.334578,0.799267,0.819297,0.725396,0.750593
10,0.01,1.445414,0.786434,0.782961,0.733043,0.74129


[I 2025-03-16 05:34:46,872] Trial 67 finished with value: 0.7423151367486238 and parameters: {'learning_rate': 0.00019438727075479905, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 22}. Best is trial 40 with value: 0.7430654138816002.


Trial 68 with params: {'learning_rate': 0.00016632116544549325, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9023,1.180301,0.754354,0.437083,0.431716,0.415688
2,0.504,1.005094,0.774519,0.576632,0.529977,0.528594
3,0.2052,1.010256,0.793767,0.682815,0.642254,0.646435
4,0.1027,1.092733,0.789184,0.738932,0.642593,0.668849
5,0.0596,1.107854,0.792851,0.801879,0.701403,0.729709
6,0.0399,1.171889,0.792851,0.797289,0.691396,0.721611
7,0.0273,1.258666,0.789184,0.799093,0.693042,0.721427
8,0.0216,1.284186,0.790101,0.801272,0.710939,0.732003
9,0.0177,1.30613,0.790101,0.789232,0.718479,0.734438
10,0.0135,1.378989,0.790101,0.793744,0.711468,0.729949


[I 2025-03-16 05:50:32,913] Trial 68 finished with value: 0.732368432206416 and parameters: {'learning_rate': 0.00016632116544549325, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.96, 'warmup_steps': 24}. Best is trial 40 with value: 0.7430654138816002.


Trial 69 with params: {'learning_rate': 0.0001139981084024823, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0766,1.324254,0.743355,0.388081,0.390044,0.368354
2,0.7232,1.044935,0.769936,0.480717,0.481779,0.472853
3,0.3563,0.989576,0.780935,0.607262,0.553545,0.557178
4,0.1947,1.033192,0.780935,0.644782,0.577951,0.595918
5,0.1156,1.037899,0.793767,0.685478,0.620476,0.635824
6,0.0756,1.105759,0.791934,0.745716,0.654588,0.677499
7,0.0514,1.156808,0.800183,0.802412,0.702128,0.726708
8,0.0387,1.174409,0.79835,0.801585,0.719676,0.73778
9,0.0312,1.21312,0.788268,0.785194,0.703551,0.724045
10,0.0229,1.288796,0.789184,0.80336,0.707857,0.729726


[I 2025-03-16 06:06:00,671] Trial 69 finished with value: 0.747359569286359 and parameters: {'learning_rate': 0.0001139981084024823, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 5}. Best is trial 69 with value: 0.747359569286359.


Trial 70 with params: {'learning_rate': 2.0941712066636755e-05, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3005,2.8463,0.452796,0.107515,0.116377,0.098526
2,2.4676,2.219648,0.582951,0.245155,0.211588,0.196084
3,1.9127,1.818554,0.652612,0.318552,0.27543,0.257873
4,1.5327,1.556419,0.702108,0.334637,0.326553,0.308987
5,1.2581,1.386071,0.71494,0.355464,0.359359,0.338169
6,1.0621,1.271247,0.744271,0.385621,0.404744,0.382574
7,0.908,1.192128,0.752521,0.410574,0.426497,0.406327
8,0.7966,1.138462,0.76352,0.448567,0.445401,0.427264
9,0.705,1.104691,0.76077,0.465281,0.453196,0.442589
10,0.6267,1.077439,0.767186,0.515853,0.472685,0.466401


[I 2025-03-16 06:11:08,536] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.00021636896534386668, 'weight_decay': 0.007, 'adam_beta1': 0.96, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6895,1.095908,0.76077,0.43255,0.456091,0.432769
2,0.3653,1.029,0.779102,0.646253,0.59057,0.597135
3,0.1354,1.07728,0.791934,0.726149,0.668338,0.676337
4,0.0659,1.139346,0.79835,0.806316,0.726891,0.748417
5,0.0404,1.183948,0.802933,0.79981,0.731178,0.750132
6,0.0279,1.243808,0.790101,0.80364,0.691243,0.725345
7,0.0195,1.321829,0.787351,0.793485,0.693402,0.717246
8,0.0168,1.340225,0.785518,0.795265,0.709057,0.728572
9,0.0127,1.396519,0.783685,0.796843,0.71056,0.734038
10,0.0095,1.452866,0.783685,0.77354,0.741467,0.742495


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.
[I 2025-03-16 06:27:07,628] Trial 71 finished with value: 0.7259298116320362 and parameters: {'learning_rate': 0.00021636896534386668, 'weight_decay': 0.007, 'adam_beta1': 0.96, 'warmup_steps': 22}. Best is trial 69 with value: 0.747359569286359.


Trial 72 with params: {'learning_rate': 2.3402871877686744e-06, 'weight_decay': 0.004, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7937,3.705907,0.208983,0.01488,0.030961,0.014939
2,3.6392,3.574933,0.209899,0.036233,0.029827,0.017615
3,3.5167,3.458071,0.252979,0.051116,0.042739,0.034545
4,3.4063,3.349581,0.326306,0.042781,0.065428,0.048294
5,3.3009,3.25355,0.351971,0.060158,0.07442,0.05143


[I 2025-03-16 06:29:45,846] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 6.912248600811238e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.91, 'warmup_steps': 7}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5263,1.736819,0.67461,0.322842,0.29804,0.286638
2,1.1714,1.203966,0.750687,0.407656,0.4137,0.39589
3,0.6843,1.067865,0.769936,0.477432,0.47726,0.463809
4,0.4473,1.010209,0.769019,0.472522,0.484125,0.472459
5,0.3002,0.991575,0.778185,0.5778,0.526255,0.528635
6,0.2098,1.007327,0.784601,0.655329,0.584142,0.598625
7,0.1484,1.037617,0.784601,0.67025,0.603116,0.619374
8,0.1124,1.042392,0.794684,0.691516,0.624479,0.640757
9,0.0877,1.081,0.781852,0.677784,0.607695,0.624267
10,0.0678,1.131648,0.790101,0.731081,0.647998,0.664983


[I 2025-03-16 06:34:56,714] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 9.24846444289197e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9400000000000001, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3306,1.500503,0.714024,0.337522,0.349381,0.329462
2,0.9178,1.098837,0.768103,0.440024,0.457932,0.441183
3,0.489,1.013243,0.765353,0.466297,0.49061,0.472227
4,0.2881,1.00185,0.780018,0.579641,0.538351,0.544047
5,0.1781,1.01524,0.785518,0.64075,0.59333,0.603755


[I 2025-03-16 06:37:32,878] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 8.836929764547837e-05, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2973,1.507881,0.710357,0.383491,0.353862,0.338662
2,0.9353,1.11254,0.759853,0.438805,0.449291,0.430184
3,0.5084,1.01769,0.762603,0.462421,0.480284,0.46554
4,0.304,1.001048,0.782768,0.577373,0.543602,0.546164
5,0.1914,1.012321,0.784601,0.633704,0.593675,0.600108
6,0.1274,1.039469,0.790101,0.718118,0.621357,0.647486
7,0.0876,1.08078,0.797434,0.756281,0.661316,0.685598
8,0.065,1.10629,0.794684,0.772564,0.672173,0.698273
9,0.0508,1.142771,0.791934,0.786211,0.682892,0.712632
10,0.0387,1.189117,0.789184,0.785498,0.69013,0.715291


[I 2025-03-16 06:53:15,293] Trial 75 finished with value: 0.7405269141452502 and parameters: {'learning_rate': 8.836929764547837e-05, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 0}. Best is trial 69 with value: 0.747359569286359.


Trial 76 with params: {'learning_rate': 0.00029577722607030635, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3005,1.011738,0.769936,0.496173,0.494196,0.484293
2,0.2239,1.050381,0.789184,0.710508,0.644767,0.661314
3,0.0798,1.141284,0.793767,0.770666,0.68024,0.702543
4,0.0408,1.187694,0.7956,0.810839,0.707886,0.73838
5,0.0277,1.296576,0.790101,0.788378,0.709253,0.731768
6,0.0207,1.335168,0.786434,0.80295,0.699131,0.730108
7,0.014,1.435912,0.779102,0.777743,0.678002,0.707109
8,0.0119,1.440594,0.789184,0.795057,0.705166,0.729222
9,0.0101,1.553899,0.776352,0.786308,0.693258,0.716394
10,0.0083,1.607038,0.779102,0.785453,0.703329,0.725385


[I 2025-03-16 07:03:30,937] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.00011388436281762621, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.082,1.329453,0.741522,0.408384,0.389482,0.371323
2,0.7246,1.046065,0.769019,0.452666,0.472068,0.453179
3,0.3552,1.000833,0.775435,0.598667,0.545307,0.550557
4,0.1944,1.032346,0.779102,0.640601,0.586575,0.600547
5,0.1161,1.048577,0.790101,0.702087,0.631246,0.645982
6,0.0764,1.093805,0.794684,0.768821,0.668416,0.695141
7,0.0514,1.171937,0.791934,0.783034,0.691291,0.712429
8,0.0388,1.179129,0.79835,0.788613,0.698932,0.720616
9,0.031,1.223631,0.792851,0.781344,0.691303,0.71409
10,0.0227,1.289599,0.790101,0.779076,0.697691,0.718399


[I 2025-03-16 07:08:35,169] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 0.00010745549908189871, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1237,1.361896,0.736939,0.381264,0.382166,0.363054
2,0.7687,1.059717,0.766269,0.466032,0.466416,0.446613
3,0.3877,0.995924,0.771769,0.575515,0.527821,0.526999
4,0.2163,1.025779,0.775435,0.611109,0.559471,0.57076
5,0.1301,1.038948,0.789184,0.697832,0.626986,0.643624
6,0.0855,1.081834,0.789184,0.75449,0.657272,0.683271
7,0.0577,1.153255,0.791017,0.75541,0.669745,0.689094
8,0.0432,1.170017,0.794684,0.789982,0.712113,0.728721
9,0.0342,1.209115,0.794684,0.789225,0.698564,0.722442
10,0.0254,1.265434,0.791934,0.786166,0.69628,0.721084


[I 2025-03-16 07:18:36,700] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 7.85993259542217e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4068,1.609597,0.695692,0.33393,0.322939,0.305427
2,1.0429,1.149055,0.75802,0.450993,0.434383,0.422292
3,0.5897,1.041125,0.768103,0.470798,0.483867,0.471359
4,0.3709,1.000508,0.773602,0.542646,0.50999,0.508245
5,0.2397,0.998175,0.785518,0.645122,0.582118,0.597464
6,0.163,1.020649,0.792851,0.69116,0.619438,0.637922
7,0.1128,1.055343,0.792851,0.685075,0.617965,0.633814
8,0.0843,1.072145,0.7956,0.775929,0.676902,0.701081
9,0.0653,1.119327,0.782768,0.759422,0.653577,0.681874
10,0.0499,1.175144,0.788268,0.765457,0.684856,0.704719


[I 2025-03-16 07:34:10,823] Trial 79 finished with value: 0.7316952041227047 and parameters: {'learning_rate': 7.85993259542217e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 4}. Best is trial 69 with value: 0.747359569286359.


Trial 80 with params: {'learning_rate': 1.2466463833276121e-05, 'weight_decay': 0.01, 'adam_beta1': 0.99, 'warmup_steps': 49}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5716,3.299789,0.32539,0.043293,0.064437,0.047354
2,3.0453,2.85023,0.439047,0.109106,0.110829,0.095011
3,2.6202,2.481766,0.508708,0.149207,0.152172,0.130729
4,2.2715,2.192612,0.574702,0.21024,0.207623,0.189448
5,1.9827,1.964375,0.609533,0.258199,0.234669,0.212703
6,1.753,1.780689,0.659028,0.296091,0.282174,0.266229
7,1.5602,1.63573,0.687443,0.361186,0.316147,0.30386
8,1.4059,1.520021,0.708524,0.374464,0.343865,0.330223
9,1.2773,1.43343,0.722273,0.365806,0.361647,0.342379
10,1.1663,1.360029,0.731439,0.391473,0.382557,0.367589


[I 2025-03-16 07:39:27,970] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.00020915505374898168, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5786,1.076607,0.768103,0.425031,0.462275,0.434701
2,0.3596,1.006755,0.784601,0.668389,0.598698,0.614408
3,0.1368,1.081282,0.787351,0.714756,0.656117,0.669136
4,0.0659,1.132778,0.797434,0.78363,0.685883,0.715577
5,0.0406,1.18359,0.79835,0.818853,0.725201,0.75172
6,0.0277,1.23504,0.793767,0.798427,0.694776,0.724805
7,0.0193,1.295991,0.797434,0.818174,0.698132,0.733448
8,0.017,1.34869,0.792851,0.806382,0.730527,0.747535
9,0.0134,1.389354,0.789184,0.782607,0.705045,0.726022
10,0.0101,1.448485,0.792851,0.791256,0.702392,0.727263


[I 2025-03-16 07:55:25,701] Trial 81 finished with value: 0.7329089318240765 and parameters: {'learning_rate': 0.00020915505374898168, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 2}. Best is trial 69 with value: 0.747359569286359.


Trial 82 with params: {'learning_rate': 0.00015504972457644627, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 32}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9808,1.214561,0.744271,0.410195,0.413352,0.397885
2,0.552,1.002907,0.770852,0.535465,0.509158,0.504375
3,0.2334,0.998348,0.790101,0.63764,0.610442,0.612331
4,0.1179,1.073203,0.789184,0.74688,0.641426,0.669688
5,0.0679,1.094693,0.793767,0.788856,0.690132,0.719239
6,0.0451,1.16442,0.791017,0.797672,0.693785,0.721087
7,0.0307,1.259862,0.788268,0.798228,0.700577,0.725115
8,0.0241,1.262525,0.800183,0.806325,0.733317,0.750726
9,0.0199,1.32929,0.789184,0.780783,0.706544,0.725317
10,0.0148,1.370487,0.786434,0.783922,0.715879,0.72889


[I 2025-03-16 08:11:03,366] Trial 82 finished with value: 0.7368234468273828 and parameters: {'learning_rate': 0.00015504972457644627, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 32}. Best is trial 69 with value: 0.747359569286359.


Trial 83 with params: {'learning_rate': 7.919910992358981e-05, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 35}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5453,1.687372,0.679193,0.318145,0.300345,0.28643
2,1.0996,1.164156,0.757104,0.442707,0.431336,0.417884
3,0.609,1.041329,0.773602,0.468999,0.484847,0.469727
4,0.3805,0.992181,0.771769,0.515892,0.504258,0.493518
5,0.2454,0.998684,0.784601,0.621199,0.57829,0.584643


[I 2025-03-16 08:13:41,803] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 8.991175211632078e-05, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4222,1.561575,0.696609,0.323255,0.323953,0.3065
2,0.9748,1.113776,0.765353,0.447289,0.45144,0.434955
3,0.5209,1.019519,0.764436,0.466237,0.4842,0.470657
4,0.3104,0.99793,0.770852,0.541805,0.518954,0.516552
5,0.1935,1.007396,0.787351,0.670085,0.601192,0.617886
6,0.1283,1.048468,0.785518,0.688301,0.607328,0.62914
7,0.0884,1.065503,0.793767,0.757522,0.663748,0.686995
8,0.0654,1.097739,0.797434,0.799442,0.706065,0.727988
9,0.0504,1.149879,0.7956,0.78389,0.702286,0.724012
10,0.038,1.198551,0.791934,0.785053,0.709532,0.724997


[I 2025-03-16 08:29:11,108] Trial 84 finished with value: 0.7355420594615302 and parameters: {'learning_rate': 8.991175211632078e-05, 'weight_decay': 0.008, 'adam_beta1': 0.96, 'warmup_steps': 27}. Best is trial 69 with value: 0.747359569286359.


Trial 85 with params: {'learning_rate': 0.0001264099551930982, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 34}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1255,1.307273,0.739688,0.38843,0.391016,0.368593
2,0.6836,1.032786,0.767186,0.485559,0.482106,0.476411
3,0.3178,0.995602,0.777269,0.595854,0.558524,0.558356
4,0.1687,1.015727,0.793767,0.695029,0.624377,0.641109
5,0.0984,1.051717,0.790101,0.711406,0.643151,0.65975
6,0.0641,1.120684,0.789184,0.782205,0.67057,0.701937
7,0.0432,1.171419,0.793767,0.790234,0.687273,0.710361
8,0.032,1.196008,0.799267,0.791395,0.714258,0.728369
9,0.0257,1.244685,0.791017,0.7751,0.710198,0.724724
10,0.0188,1.309951,0.788268,0.809953,0.705538,0.730298


[I 2025-03-16 08:44:32,899] Trial 85 finished with value: 0.7420168179200104 and parameters: {'learning_rate': 0.0001264099551930982, 'weight_decay': 0.008, 'adam_beta1': 0.9500000000000001, 'warmup_steps': 34}. Best is trial 69 with value: 0.747359569286359.


Trial 86 with params: {'learning_rate': 4.3182046483014284e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'warmup_steps': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8958,2.200905,0.584785,0.252032,0.213951,0.201107
2,1.6819,1.512172,0.705775,0.324023,0.338909,0.315649
3,1.1017,1.241814,0.746104,0.400151,0.405429,0.383243
4,0.7867,1.117231,0.764436,0.448881,0.44772,0.435292
5,0.5935,1.062446,0.769936,0.504893,0.477667,0.464891


[I 2025-03-16 08:47:06,187] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.0001253937577800803, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.086,1.295802,0.739688,0.379736,0.391369,0.366518
2,0.6733,1.03675,0.766269,0.484626,0.481966,0.476319
3,0.3152,0.994189,0.777269,0.599031,0.560153,0.56148
4,0.1668,1.026439,0.788268,0.673704,0.605059,0.625081
5,0.0978,1.045203,0.790101,0.710523,0.63081,0.652275
6,0.064,1.131787,0.786434,0.77688,0.665999,0.69699
7,0.0433,1.180471,0.793767,0.814254,0.702998,0.727815
8,0.0325,1.190124,0.799267,0.796101,0.719515,0.73537
9,0.0262,1.253859,0.789184,0.783396,0.709318,0.727885
10,0.019,1.319392,0.789184,0.813905,0.708619,0.73413


[I 2025-03-16 09:02:55,274] Trial 87 finished with value: 0.7545251683099805 and parameters: {'learning_rate': 0.0001253937577800803, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 31}. Best is trial 87 with value: 0.7545251683099805.


Trial 88 with params: {'learning_rate': 7.285832479135637e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.93, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.56,1.730481,0.67736,0.320117,0.300401,0.288569
2,1.1501,1.193208,0.751604,0.415504,0.429928,0.411601
3,0.6587,1.062914,0.772686,0.46691,0.480725,0.46312
4,0.4243,1.005151,0.769019,0.496023,0.496715,0.486683
5,0.2808,0.994576,0.781852,0.597096,0.554781,0.561641
6,0.1942,1.013137,0.786434,0.646596,0.589841,0.605364
7,0.1366,1.042878,0.788268,0.690018,0.619644,0.636305
8,0.1023,1.051933,0.796517,0.697623,0.626406,0.645273
9,0.0791,1.092906,0.787351,0.701065,0.628062,0.645847
10,0.0609,1.145472,0.791017,0.756251,0.677598,0.697273


[I 2025-03-16 09:08:09,565] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 8.431777839855785e-05, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 21}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.394,1.567836,0.700275,0.351648,0.327688,0.310883
2,0.9904,1.127972,0.765353,0.450814,0.454446,0.438116
3,0.5461,1.03,0.769936,0.468305,0.483926,0.470432
4,0.3349,0.994725,0.776352,0.53896,0.516903,0.512595
5,0.2124,0.997434,0.789184,0.646839,0.586933,0.601153
6,0.1425,1.030891,0.785518,0.689055,0.607697,0.629151
7,0.0982,1.060822,0.791017,0.732401,0.647056,0.667574
8,0.0729,1.084352,0.797434,0.79353,0.689212,0.71322
9,0.0563,1.130344,0.790101,0.787066,0.68622,0.713835
10,0.0431,1.191742,0.785518,0.777908,0.692147,0.716156


[I 2025-03-16 09:23:49,977] Trial 89 finished with value: 0.7451531507333542 and parameters: {'learning_rate': 8.431777839855785e-05, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 21}. Best is trial 87 with value: 0.7545251683099805.


Trial 90 with params: {'learning_rate': 5.401664528764546e-05, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.92, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7549,1.994746,0.614115,0.2648,0.246085,0.22889
2,1.4437,1.353418,0.736939,0.406029,0.395887,0.383399
3,0.8945,1.147084,0.758937,0.440854,0.439545,0.423754
4,0.6154,1.05963,0.769019,0.487328,0.474292,0.464734
5,0.445,1.020573,0.769936,0.500685,0.495378,0.488019
6,0.3301,1.003232,0.771769,0.572879,0.523615,0.526522
7,0.2446,1.011438,0.773602,0.590013,0.542493,0.546418
8,0.1902,1.010658,0.786434,0.648964,0.593351,0.606852
9,0.1515,1.040004,0.786434,0.663281,0.602435,0.615895
10,0.1199,1.065115,0.787351,0.705587,0.62177,0.643041


[I 2025-03-16 09:29:03,403] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.0001785436549414426, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 36}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7829,1.125075,0.766269,0.425896,0.452436,0.426811
2,0.4474,0.99861,0.773602,0.567263,0.52399,0.523773
3,0.1785,1.034347,0.784601,0.724429,0.638294,0.659814
4,0.0872,1.083582,0.789184,0.776956,0.664339,0.700063
5,0.051,1.149392,0.797434,0.796642,0.70228,0.732162
6,0.0356,1.201053,0.791934,0.807958,0.68594,0.718973
7,0.0232,1.269351,0.786434,0.790955,0.690552,0.716427
8,0.0195,1.282758,0.793767,0.801852,0.73729,0.749677
9,0.0159,1.34038,0.789184,0.792621,0.713997,0.733451
10,0.0114,1.38896,0.791934,0.783926,0.713662,0.728925


[I 2025-03-16 09:44:37,556] Trial 91 finished with value: 0.7435341247193272 and parameters: {'learning_rate': 0.0001785436549414426, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 36}. Best is trial 87 with value: 0.7545251683099805.


Trial 92 with params: {'learning_rate': 0.00025249939621007966, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5186,1.030146,0.765353,0.462997,0.464405,0.445113
2,0.2858,0.998854,0.790101,0.700129,0.621192,0.639089
3,0.1021,1.098224,0.793767,0.732077,0.677706,0.690823
4,0.0508,1.143204,0.800183,0.805296,0.706589,0.736578
5,0.0324,1.214207,0.789184,0.783818,0.708109,0.729578
6,0.0232,1.290832,0.792851,0.789624,0.699853,0.723957
7,0.016,1.37236,0.788268,0.800196,0.702638,0.72669
8,0.0149,1.387321,0.785518,0.798458,0.716768,0.734314
9,0.0113,1.457664,0.786434,0.772308,0.732825,0.737235
10,0.0091,1.477017,0.791017,0.814312,0.727857,0.748179


[I 2025-03-16 09:59:59,422] Trial 92 finished with value: 0.7377027844793805 and parameters: {'learning_rate': 0.00025249939621007966, 'weight_decay': 0.008, 'adam_beta1': 0.93, 'warmup_steps': 33}. Best is trial 87 with value: 0.7545251683099805.


Trial 93 with params: {'learning_rate': 0.0002798432432643174, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4469,1.020198,0.770852,0.4694,0.474781,0.456235
2,0.2498,1.02915,0.786434,0.737414,0.635135,0.660378
3,0.0879,1.129745,0.787351,0.801352,0.728513,0.746462
4,0.0444,1.184451,0.788268,0.809144,0.695451,0.727079
5,0.0274,1.259562,0.8011,0.805503,0.697913,0.730307
6,0.0218,1.315022,0.799267,0.785845,0.698148,0.720848
7,0.015,1.375414,0.787351,0.786362,0.708568,0.7259
8,0.0133,1.39382,0.8011,0.789576,0.722726,0.736858
9,0.0116,1.454928,0.791017,0.809859,0.732085,0.751513
10,0.0083,1.508336,0.788268,0.80576,0.724741,0.741797


[I 2025-03-16 10:15:33,398] Trial 93 finished with value: 0.7346740102785173 and parameters: {'learning_rate': 0.0002798432432643174, 'weight_decay': 0.01, 'adam_beta1': 0.91, 'warmup_steps': 43}. Best is trial 87 with value: 0.7545251683099805.


Trial 94 with params: {'learning_rate': 0.00019762279855014495, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6607,1.079506,0.764436,0.435245,0.450782,0.429216
2,0.389,0.974354,0.788268,0.624918,0.582424,0.58727
3,0.1483,1.040181,0.791017,0.724125,0.643961,0.661885
4,0.0717,1.111995,0.789184,0.779243,0.673266,0.706918
5,0.0435,1.17281,0.789184,0.787934,0.709948,0.731637
6,0.0313,1.242729,0.787351,0.811645,0.686371,0.720644
7,0.0212,1.308402,0.792851,0.821389,0.699842,0.732927
8,0.0168,1.294848,0.794684,0.795891,0.720953,0.735898
9,0.0145,1.368158,0.787351,0.811917,0.70698,0.736393
10,0.0099,1.414175,0.787351,0.79404,0.710647,0.728418


[I 2025-03-16 10:31:05,405] Trial 94 finished with value: 0.7458363684318783 and parameters: {'learning_rate': 0.00019762279855014495, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 27}. Best is trial 87 with value: 0.7545251683099805.


Trial 95 with params: {'learning_rate': 0.0004124376536120012, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 25}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1532,0.996783,0.777269,0.539742,0.547583,0.535325
2,0.1512,1.113957,0.790101,0.730606,0.668361,0.681407
3,0.0578,1.201008,0.787351,0.763559,0.719148,0.724961
4,0.0328,1.286943,0.790101,0.814428,0.709248,0.740514
5,0.022,1.38671,0.785518,0.787318,0.705264,0.723906
6,0.017,1.397362,0.787351,0.791344,0.724981,0.741554
7,0.0124,1.579736,0.786434,0.781329,0.708812,0.721581
8,0.0107,1.572365,0.774519,0.768636,0.712337,0.723323
9,0.0097,1.640229,0.777269,0.805748,0.71124,0.739022
10,0.0078,1.607968,0.775435,0.788791,0.699614,0.723255


[I 2025-03-16 10:41:33,490] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 8.970210996083779e-05, 'weight_decay': 0.007, 'adam_beta1': 0.92, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3568,1.521881,0.710357,0.343298,0.340915,0.324356
2,0.9403,1.11099,0.764436,0.450859,0.456351,0.440086
3,0.5089,1.021999,0.768103,0.469984,0.490511,0.475618
4,0.3049,1.000199,0.780018,0.562554,0.532312,0.532249
5,0.1907,1.006847,0.786434,0.644001,0.595173,0.606529
6,0.1264,1.041414,0.783685,0.686305,0.605675,0.627579
7,0.0866,1.071859,0.793767,0.739779,0.647599,0.670538
8,0.0643,1.09663,0.8011,0.821595,0.706307,0.734609
9,0.0498,1.145681,0.7956,0.789821,0.689744,0.716991
10,0.0378,1.208534,0.789184,0.788903,0.699511,0.723039


[I 2025-03-16 10:46:47,326] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 3.4845689761788494e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 25}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0496,2.418319,0.523373,0.18877,0.158132,0.139432
2,1.9351,1.713585,0.688359,0.311751,0.308381,0.290509
3,1.3335,1.373857,0.732356,0.389813,0.37664,0.359947
4,0.9837,1.210205,0.751604,0.399432,0.410283,0.389561
5,0.7628,1.121523,0.762603,0.476387,0.451593,0.441936


[I 2025-03-16 10:49:24,403] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.00022179095726477724, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5797,1.050529,0.76352,0.435242,0.452703,0.431063
2,0.3361,0.983108,0.791017,0.654095,0.600997,0.611231
3,0.1234,1.063185,0.792851,0.743447,0.661397,0.683646
4,0.0595,1.124063,0.793767,0.78083,0.671743,0.706802
5,0.0376,1.181932,0.8011,0.797703,0.719527,0.740495
6,0.0263,1.234265,0.7956,0.80946,0.695386,0.727435
7,0.0182,1.295508,0.792851,0.812835,0.70699,0.734767
8,0.0152,1.332692,0.79835,0.820847,0.735032,0.756307
9,0.0127,1.345756,0.789184,0.776069,0.735954,0.741724
10,0.0096,1.378236,0.79835,0.819436,0.729904,0.751827


[I 2025-03-16 11:04:56,444] Trial 98 finished with value: 0.744070793918236 and parameters: {'learning_rate': 0.00022179095726477724, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 31}. Best is trial 87 with value: 0.7545251683099805.


Trial 99 with params: {'learning_rate': 9.621525134665849e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 33}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.284,1.454194,0.724106,0.34773,0.357266,0.337066
2,0.8733,1.092684,0.76077,0.465234,0.461555,0.446246
3,0.4635,1.007627,0.770852,0.468647,0.4943,0.476333
4,0.2719,1.011947,0.781852,0.60958,0.548525,0.558136
5,0.1675,1.017417,0.788268,0.684694,0.615676,0.6324
6,0.1099,1.053399,0.783685,0.687275,0.606188,0.628115
7,0.0746,1.091167,0.793767,0.768513,0.668692,0.694738
8,0.0555,1.118863,0.8011,0.811666,0.715506,0.73948
9,0.0433,1.168545,0.793767,0.799597,0.690048,0.72221
10,0.0327,1.225199,0.791017,0.800968,0.703458,0.73213


[I 2025-03-16 11:20:45,399] Trial 99 finished with value: 0.7468424668030823 and parameters: {'learning_rate': 9.621525134665849e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 33}. Best is trial 87 with value: 0.7545251683099805.


Trial 100 with params: {'learning_rate': 0.00026326183503997335, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4447,1.023367,0.773602,0.470896,0.474784,0.458267
2,0.2646,1.027561,0.782768,0.709766,0.626121,0.646624
3,0.0951,1.109384,0.789184,0.753812,0.677799,0.696852
4,0.0481,1.202097,0.783685,0.810267,0.692666,0.727531
5,0.0304,1.265803,0.802933,0.810728,0.714261,0.743046
6,0.0226,1.280068,0.804766,0.81393,0.726439,0.752483
7,0.0156,1.392788,0.788268,0.793661,0.700277,0.726344
8,0.0133,1.394402,0.7956,0.795883,0.732184,0.746394
9,0.0109,1.455548,0.793767,0.779254,0.729756,0.739187
10,0.0092,1.471101,0.791934,0.796726,0.72483,0.741566


[I 2025-03-16 11:31:13,110] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 0.00010218895538078328, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2513,1.417891,0.726856,0.368436,0.366286,0.341517
2,0.8284,1.082425,0.759853,0.455463,0.462431,0.442617
3,0.4295,1.004512,0.774519,0.555639,0.518153,0.514995
4,0.2456,1.017374,0.781852,0.647803,0.578166,0.597814
5,0.1492,1.027956,0.787351,0.692406,0.618063,0.637252
6,0.0972,1.070933,0.782768,0.710924,0.624744,0.650483
7,0.0655,1.109393,0.796517,0.774606,0.682293,0.706236
8,0.0487,1.142758,0.797434,0.805598,0.709391,0.732277
9,0.0384,1.178429,0.791934,0.797319,0.694007,0.722822
10,0.0289,1.238728,0.793767,0.805281,0.705438,0.735131


[I 2025-03-16 11:46:47,462] Trial 101 finished with value: 0.7580381076828203 and parameters: {'learning_rate': 0.00010218895538078328, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 42}. Best is trial 101 with value: 0.7580381076828203.


Trial 102 with params: {'learning_rate': 0.00024303206858875134, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5275,1.033849,0.762603,0.439862,0.455737,0.43429
2,0.3001,0.998491,0.790101,0.649648,0.606398,0.612449
3,0.1073,1.08278,0.79835,0.759557,0.671713,0.694357
4,0.0526,1.153161,0.79835,0.796178,0.685711,0.719765
5,0.0334,1.239615,0.792851,0.804397,0.708587,0.733167
6,0.0235,1.290254,0.789184,0.798797,0.678787,0.710841
7,0.0166,1.33675,0.7956,0.799026,0.706038,0.730924
8,0.0138,1.343703,0.800183,0.800509,0.729245,0.745404
9,0.011,1.450001,0.780935,0.802975,0.722484,0.742353
10,0.0092,1.472208,0.790101,0.814706,0.719429,0.74344


[I 2025-03-16 12:02:15,244] Trial 102 finished with value: 0.7445584528490106 and parameters: {'learning_rate': 0.00024303206858875134, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 38}. Best is trial 101 with value: 0.7580381076828203.


Trial 103 with params: {'learning_rate': 7.379909560782258e-05, 'weight_decay': 0.005, 'adam_beta1': 0.9, 'warmup_steps': 39}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5239,1.69954,0.681027,0.31207,0.309164,0.289607
2,1.1248,1.185884,0.752521,0.422866,0.434872,0.417491
3,0.6448,1.059815,0.769936,0.474181,0.481231,0.463082
4,0.4151,1.007991,0.770852,0.506081,0.502727,0.496671
5,0.2735,0.992563,0.783685,0.623327,0.571505,0.583197


[I 2025-03-16 12:04:57,910] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.00025631562696183294, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4592,1.027214,0.773602,0.46546,0.472995,0.454879
2,0.2743,1.015119,0.785518,0.686938,0.623727,0.638254
3,0.0986,1.096897,0.792851,0.751518,0.678176,0.696479
4,0.0495,1.167523,0.799267,0.818391,0.690797,0.733185
5,0.032,1.279468,0.792851,0.781993,0.699189,0.721486
6,0.0225,1.302098,0.792851,0.791836,0.703334,0.728698
7,0.0168,1.388248,0.785518,0.781692,0.706298,0.723145
8,0.0133,1.436103,0.8011,0.816443,0.737692,0.760072
9,0.0107,1.479326,0.791934,0.787548,0.719927,0.735494
10,0.0089,1.483114,0.793767,0.794444,0.722811,0.737121


[I 2025-03-16 12:20:35,924] Trial 104 finished with value: 0.7356811974104959 and parameters: {'learning_rate': 0.00025631562696183294, 'weight_decay': 0.009000000000000001, 'adam_beta1': 0.9, 'warmup_steps': 26}. Best is trial 101 with value: 0.7580381076828203.


Trial 105 with params: {'learning_rate': 8.855441380475776e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3756,1.533528,0.711274,0.367117,0.350204,0.335612
2,0.9536,1.123244,0.75802,0.442207,0.451652,0.432246
3,0.5181,1.027614,0.772686,0.469611,0.490621,0.47553
4,0.3123,1.005554,0.778185,0.588348,0.535626,0.540686
5,0.1962,1.009932,0.789184,0.650871,0.596503,0.609805
6,0.1306,1.036875,0.787351,0.686572,0.609529,0.631236
7,0.0891,1.07357,0.797434,0.741892,0.651743,0.674138
8,0.0663,1.092337,0.800183,0.81344,0.700068,0.73167
9,0.0512,1.144837,0.792851,0.798299,0.682338,0.714506
10,0.0387,1.196302,0.790101,0.796678,0.697251,0.726811


[I 2025-03-16 12:36:10,527] Trial 105 finished with value: 0.7518035569469373 and parameters: {'learning_rate': 8.855441380475776e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 42}. Best is trial 101 with value: 0.7580381076828203.


Trial 106 with params: {'learning_rate': 2.8009890789134495e-05, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 39}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1964,2.638755,0.485793,0.103856,0.135388,0.111481
2,2.1908,1.94107,0.619615,0.27034,0.248286,0.226817
3,1.5899,1.55237,0.706691,0.35145,0.334729,0.316294
4,1.2135,1.337207,0.731439,0.365635,0.37958,0.35952
5,0.9634,1.216431,0.747938,0.396817,0.407624,0.388056


[I 2025-03-16 12:38:39,446] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 6.46317968978187e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.629,1.824155,0.64528,0.290091,0.27027,0.254138
2,1.2547,1.246247,0.744271,0.402842,0.406547,0.385753
3,0.7442,1.088126,0.768103,0.456466,0.467873,0.446275
4,0.4956,1.027558,0.768103,0.48023,0.479726,0.47176
5,0.3406,0.997759,0.770852,0.578794,0.522568,0.525765
6,0.2419,1.002187,0.784601,0.624253,0.57021,0.581059
7,0.1729,1.03069,0.786434,0.675512,0.604941,0.622294
8,0.1316,1.031091,0.790101,0.672539,0.610595,0.626002
9,0.1026,1.068186,0.788268,0.68318,0.611101,0.626562
10,0.0796,1.117634,0.786434,0.739543,0.652383,0.672937


[I 2025-03-16 12:54:11,749] Trial 107 finished with value: 0.740566443666514 and parameters: {'learning_rate': 6.46317968978187e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 38}. Best is trial 101 with value: 0.7580381076828203.


Trial 108 with params: {'learning_rate': 0.00011423850364123761, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 46}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1623,1.341676,0.736022,0.382853,0.386392,0.363145
2,0.7403,1.057674,0.761687,0.468312,0.471582,0.462032
3,0.3683,0.994029,0.782768,0.597856,0.544703,0.550003
4,0.2008,1.021638,0.782768,0.653758,0.590917,0.609227
5,0.1189,1.038737,0.789184,0.684728,0.615217,0.631405
6,0.0771,1.088054,0.790101,0.760575,0.661148,0.690202
7,0.0519,1.146855,0.793767,0.789139,0.677963,0.707759
8,0.0389,1.167429,0.802016,0.797934,0.715129,0.734155
9,0.0304,1.207072,0.794684,0.798291,0.698795,0.725511
10,0.0227,1.269243,0.7956,0.816475,0.710478,0.737007


[I 2025-03-16 13:09:28,448] Trial 108 finished with value: 0.7577323391778955 and parameters: {'learning_rate': 0.00011423850364123761, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 46}. Best is trial 101 with value: 0.7580381076828203.


Trial 109 with params: {'learning_rate': 0.00018609888492482365, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.751,1.104338,0.76077,0.42599,0.446392,0.42402
2,0.4277,0.99057,0.774519,0.575621,0.535414,0.538391
3,0.1678,1.015573,0.792851,0.731516,0.65898,0.676905
4,0.0805,1.091683,0.791934,0.761416,0.667155,0.698591
5,0.048,1.152814,0.7956,0.786506,0.702835,0.728374
6,0.0333,1.215136,0.786434,0.797281,0.670274,0.70972
7,0.0214,1.28292,0.789184,0.772991,0.705772,0.720394
8,0.0183,1.280434,0.79835,0.810101,0.720838,0.741978
9,0.0154,1.335141,0.786434,0.793576,0.709409,0.733371
10,0.0111,1.414489,0.791017,0.80954,0.718992,0.743521


[I 2025-03-16 13:25:33,952] Trial 109 finished with value: 0.7400863228030755 and parameters: {'learning_rate': 0.00018609888492482365, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 47}. Best is trial 101 with value: 0.7580381076828203.


Trial 110 with params: {'learning_rate': 5.7545494777844725e-05, 'weight_decay': 0.008, 'adam_beta1': 0.91, 'warmup_steps': 49}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7422,1.952346,0.615032,0.264737,0.247092,0.230439
2,1.3881,1.321236,0.737855,0.377772,0.389919,0.368616
3,0.8447,1.126132,0.765353,0.457473,0.453527,0.436628
4,0.5745,1.048735,0.769936,0.481864,0.478101,0.471672
5,0.4094,1.014299,0.771769,0.521341,0.507041,0.499725


[I 2025-03-16 13:28:15,274] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 0.0002181832842611734, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 37}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6153,1.059808,0.764436,0.439039,0.451457,0.42919
2,0.3466,0.983736,0.785518,0.622593,0.578327,0.584337
3,0.1283,1.050896,0.7956,0.746515,0.666515,0.686418
4,0.0621,1.138328,0.794684,0.785986,0.674874,0.709667
5,0.0382,1.173374,0.799267,0.80154,0.721127,0.743811
6,0.0273,1.245914,0.789184,0.782746,0.672913,0.706138
7,0.0185,1.306219,0.8011,0.822557,0.710793,0.742431
8,0.0152,1.34542,0.789184,0.802413,0.697598,0.726068
9,0.0133,1.363331,0.792851,0.7967,0.722243,0.740238
10,0.01,1.401254,0.7956,0.815626,0.727664,0.749829


[I 2025-03-16 13:44:01,908] Trial 111 finished with value: 0.7422891395603033 and parameters: {'learning_rate': 0.0002181832842611734, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 37}. Best is trial 101 with value: 0.7580381076828203.


Trial 112 with params: {'learning_rate': 9.176214230786556e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3555,1.508245,0.71494,0.365123,0.353124,0.336734
2,0.9249,1.113992,0.756187,0.44222,0.45131,0.432263
3,0.497,1.021599,0.771769,0.491249,0.499768,0.487756
4,0.2957,1.009208,0.780018,0.591403,0.541464,0.548484
5,0.184,1.016081,0.786434,0.652416,0.598586,0.61181
6,0.1216,1.047768,0.790101,0.70075,0.614019,0.640248
7,0.0826,1.08192,0.802016,0.76412,0.676228,0.69858
8,0.0613,1.102918,0.8011,0.816089,0.705387,0.735339
9,0.0475,1.159704,0.790101,0.797593,0.67603,0.709348
10,0.036,1.208305,0.792851,0.796071,0.698244,0.726753


[I 2025-03-16 13:59:40,145] Trial 112 finished with value: 0.749226744183446 and parameters: {'learning_rate': 9.176214230786556e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 47}. Best is trial 101 with value: 0.7580381076828203.


Trial 113 with params: {'learning_rate': 8.435718072766996e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4174,1.576713,0.699358,0.330864,0.329412,0.311077
2,0.9983,1.138395,0.758937,0.447958,0.448887,0.433379
3,0.5505,1.034114,0.770852,0.463862,0.484723,0.469665
4,0.3381,1.002701,0.775435,0.568421,0.528318,0.531467
5,0.2148,0.999847,0.790101,0.651157,0.597032,0.610266
6,0.1443,1.027401,0.791017,0.695937,0.613426,0.635812
7,0.0992,1.062441,0.796517,0.745658,0.65114,0.675115
8,0.0739,1.078212,0.797434,0.813187,0.700016,0.732313
9,0.0568,1.132729,0.790101,0.796279,0.680208,0.71244
10,0.0433,1.177269,0.791934,0.789621,0.698097,0.724323


[I 2025-03-16 14:15:11,142] Trial 113 finished with value: 0.7498094005531141 and parameters: {'learning_rate': 8.435718072766996e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 42}. Best is trial 101 with value: 0.7580381076828203.


Trial 114 with params: {'learning_rate': 2.5723446972771798e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2403,2.705844,0.477544,0.106512,0.133803,0.111238
2,2.2765,2.023566,0.609533,0.257317,0.238784,0.218594
3,1.6857,1.628776,0.694775,0.318808,0.319023,0.299029
4,1.3063,1.39687,0.718607,0.362548,0.356197,0.335192
5,1.0472,1.26135,0.748854,0.403964,0.42062,0.399116
6,0.868,1.169467,0.757104,0.42768,0.4385,0.418246
7,0.7291,1.112701,0.76077,0.471678,0.452871,0.440246
8,0.6307,1.075281,0.768103,0.50904,0.479604,0.469043
9,0.5487,1.054254,0.772686,0.516142,0.484254,0.478268
10,0.4807,1.035312,0.772686,0.519906,0.497543,0.495027


[I 2025-03-16 14:20:24,623] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 7.709532607807873e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4855,1.656262,0.691109,0.338564,0.320728,0.301907
2,1.081,1.167659,0.756187,0.448737,0.442017,0.427482
3,0.6118,1.049636,0.768103,0.463645,0.48216,0.467162
4,0.3881,1.003672,0.774519,0.568874,0.517788,0.52151
5,0.2527,0.992385,0.789184,0.652324,0.594553,0.609109
6,0.1727,1.015664,0.791017,0.670526,0.604224,0.620542
7,0.1203,1.051036,0.793767,0.716316,0.628606,0.651066
8,0.0901,1.058037,0.796517,0.746873,0.654928,0.681691
9,0.0696,1.102655,0.788268,0.749705,0.659814,0.684199
10,0.0532,1.151028,0.787351,0.777492,0.681746,0.70856


[I 2025-03-16 14:25:36,944] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 4.7746956596454305e-05, 'weight_decay': 0.004, 'adam_beta1': 0.9, 'warmup_steps': 50}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8728,2.129713,0.5967,0.252862,0.228426,0.212863
2,1.5892,1.443242,0.716774,0.358056,0.358462,0.339538
3,1.0123,1.196964,0.756187,0.420351,0.434433,0.414387
4,0.7108,1.088445,0.765353,0.446956,0.455146,0.440706
5,0.5276,1.043321,0.774519,0.504102,0.493295,0.486256
6,0.4056,1.01823,0.772686,0.515417,0.503378,0.496894
7,0.3081,1.011139,0.771769,0.530545,0.515894,0.509724
8,0.2442,0.998613,0.783685,0.645245,0.571565,0.589473
9,0.1962,1.017303,0.785518,0.64648,0.581341,0.598603
10,0.1579,1.036544,0.790101,0.683779,0.607665,0.625609


[I 2025-03-16 14:30:54,860] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 0.00010621624277688624, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2491,1.406284,0.733272,0.38986,0.376971,0.351668
2,0.8064,1.075534,0.76077,0.44809,0.45854,0.441843
3,0.4101,1.007158,0.772686,0.553624,0.521835,0.51652
4,0.2306,1.020796,0.780018,0.624909,0.564089,0.579561
5,0.1388,1.034201,0.789184,0.695056,0.614922,0.636982
6,0.0904,1.087131,0.786434,0.741036,0.638265,0.66874
7,0.0611,1.126766,0.792851,0.792863,0.689311,0.717248
8,0.0451,1.15363,0.796517,0.805831,0.712739,0.734223
9,0.0357,1.192806,0.791934,0.788588,0.687299,0.714459
10,0.0268,1.250016,0.797434,0.810566,0.712281,0.739876


[I 2025-03-16 14:46:44,610] Trial 117 finished with value: 0.7517800588353467 and parameters: {'learning_rate': 0.00010621624277688624, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 47}. Best is trial 101 with value: 0.7580381076828203.


Trial 118 with params: {'learning_rate': 0.00016095510626902364, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 50}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8775,1.163629,0.761687,0.439526,0.441883,0.425341
2,0.5121,0.994391,0.780935,0.524377,0.518597,0.514821
3,0.2143,0.986253,0.793767,0.644898,0.608369,0.613656
4,0.1058,1.049332,0.791934,0.741122,0.648481,0.677204
5,0.0619,1.100132,0.796517,0.789615,0.708808,0.732204
6,0.0411,1.154135,0.787351,0.781466,0.669554,0.699196
7,0.0274,1.23804,0.793767,0.803121,0.710882,0.73279
8,0.022,1.236204,0.792851,0.790114,0.730463,0.741519
9,0.0186,1.312925,0.787351,0.794243,0.690165,0.719655
10,0.0132,1.365507,0.789184,0.800334,0.723666,0.741995


[I 2025-03-16 15:02:28,194] Trial 118 finished with value: 0.7475090018430808 and parameters: {'learning_rate': 0.00016095510626902364, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 50}. Best is trial 101 with value: 0.7580381076828203.


Trial 119 with params: {'learning_rate': 0.0001255088838467423, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 36}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0575,1.277342,0.746104,0.389206,0.395549,0.373923
2,0.666,1.040623,0.764436,0.473833,0.476713,0.468302
3,0.3156,0.985733,0.787351,0.605897,0.566735,0.572386
4,0.1669,1.029379,0.783685,0.674242,0.601429,0.622605
5,0.0982,1.04931,0.791017,0.693764,0.625455,0.642038
6,0.0637,1.120528,0.790101,0.755771,0.656653,0.68204
7,0.0426,1.184511,0.794684,0.807816,0.694075,0.720323
8,0.0321,1.202217,0.796517,0.797415,0.71331,0.730299
9,0.0258,1.241799,0.789184,0.784921,0.692634,0.716085
10,0.0186,1.305864,0.790101,0.805745,0.703018,0.727892


[I 2025-03-16 15:18:33,113] Trial 119 finished with value: 0.7447685465145879 and parameters: {'learning_rate': 0.0001255088838467423, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 36}. Best is trial 101 with value: 0.7580381076828203.


Trial 120 with params: {'learning_rate': 9.011141377096925e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 49}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3754,1.525882,0.707608,0.334099,0.335346,0.317321
2,0.9429,1.119412,0.759853,0.446084,0.452571,0.434856
3,0.51,1.024117,0.770852,0.467696,0.489205,0.473804
4,0.3055,1.007291,0.779102,0.582128,0.535592,0.539961
5,0.1907,1.011284,0.788268,0.652767,0.5992,0.612248
6,0.1263,1.039945,0.791017,0.693025,0.613943,0.636463
7,0.086,1.076802,0.797434,0.744555,0.654603,0.677109
8,0.0639,1.096201,0.802016,0.808859,0.704213,0.72996
9,0.0493,1.15092,0.792851,0.807661,0.688407,0.722279
10,0.0373,1.201321,0.796517,0.802483,0.705905,0.73315


[I 2025-03-16 15:34:27,365] Trial 120 finished with value: 0.748125303860967 and parameters: {'learning_rate': 9.011141377096925e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 49}. Best is trial 101 with value: 0.7580381076828203.


Trial 121 with params: {'learning_rate': 0.00026452018072984935, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 50}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5133,1.036795,0.768103,0.445692,0.469064,0.446431
2,0.2722,1.010571,0.792851,0.683568,0.615237,0.631119
3,0.0956,1.107712,0.796517,0.765997,0.694361,0.712586
4,0.0478,1.18219,0.788268,0.815061,0.691413,0.726739
5,0.0315,1.215126,0.7956,0.802916,0.713559,0.74194
6,0.0222,1.257128,0.796517,0.813763,0.715617,0.742709
7,0.0151,1.307117,0.796517,0.808843,0.694851,0.728923
8,0.0136,1.355468,0.796517,0.820665,0.729501,0.75446
9,0.011,1.464665,0.779102,0.800307,0.72608,0.745076
10,0.0086,1.494175,0.792851,0.799757,0.740914,0.755725


[I 2025-03-16 15:45:09,714] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 0.00014265093249551882, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 51}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9815,1.217675,0.747021,0.377778,0.399721,0.375746
2,0.586,1.012147,0.771769,0.490368,0.493911,0.485783
3,0.2605,0.980968,0.791934,0.656434,0.597453,0.611131
4,0.1318,1.033406,0.791934,0.715118,0.626068,0.649621
5,0.0769,1.07706,0.79835,0.776211,0.69272,0.714752
6,0.0504,1.126601,0.785518,0.771172,0.6597,0.691536
7,0.0338,1.215627,0.793767,0.805477,0.694432,0.722116
8,0.0266,1.215063,0.792851,0.786303,0.707763,0.725485
9,0.0214,1.266257,0.789184,0.780952,0.703854,0.724802
10,0.0154,1.34005,0.788268,0.812906,0.70924,0.734134


[I 2025-03-16 15:55:48,842] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 3.600855231113945e-05, 'weight_decay': 0.005, 'adam_beta1': 0.93, 'warmup_steps': 53}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0765,2.424616,0.528873,0.187502,0.16156,0.141584
2,1.927,1.698295,0.688359,0.313226,0.307569,0.290011
3,1.3129,1.360065,0.728689,0.368989,0.372857,0.351945
4,0.9617,1.199388,0.754354,0.396006,0.417711,0.394623
5,0.7419,1.114617,0.762603,0.471516,0.453615,0.442213


[I 2025-03-16 15:58:26,910] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 7.154718987890825e-05, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 46}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5623,1.732927,0.675527,0.323137,0.298897,0.287353
2,1.1569,1.196745,0.747021,0.425262,0.415421,0.399274
3,0.668,1.061149,0.769019,0.457257,0.473226,0.453022
4,0.4337,1.010198,0.770852,0.480875,0.491498,0.480421
5,0.2884,0.99025,0.781852,0.605187,0.554335,0.562536
6,0.2,1.010412,0.789184,0.66285,0.585894,0.604278
7,0.1405,1.047366,0.790101,0.695936,0.621804,0.639609
8,0.1059,1.048704,0.796517,0.694481,0.627883,0.644324
9,0.0821,1.087433,0.785518,0.701545,0.630522,0.647202
10,0.0631,1.146066,0.790101,0.761929,0.674117,0.694985


[I 2025-03-16 16:03:50,537] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 6.320517481758516e-05, 'weight_decay': 0.006, 'adam_beta1': 0.91, 'warmup_steps': 45}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6656,1.858693,0.629698,0.284384,0.261013,0.244578
2,1.2868,1.264213,0.743355,0.40298,0.399468,0.3778
3,0.766,1.096587,0.766269,0.448729,0.465752,0.445737
4,0.5119,1.031736,0.766269,0.474726,0.476057,0.468477
5,0.3543,1.001287,0.770852,0.533529,0.512202,0.506809


[I 2025-03-16 16:06:29,750] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 5.2653913461752696e-05, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 52}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8056,2.034397,0.60495,0.270404,0.237027,0.219467
2,1.4809,1.37463,0.732356,0.376793,0.376024,0.35778
3,0.9209,1.155891,0.759853,0.437487,0.440278,0.421176
4,0.6367,1.06548,0.770852,0.494136,0.473739,0.46212
5,0.4638,1.027599,0.776352,0.506,0.50158,0.493605


[I 2025-03-16 16:09:12,653] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 9.152827015282269e-05, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 43}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3695,1.520737,0.713107,0.36579,0.350511,0.335361
2,0.9328,1.114061,0.758937,0.443782,0.452313,0.433464
3,0.4997,1.023968,0.768103,0.465794,0.488113,0.472151
4,0.2972,1.004306,0.781852,0.586594,0.542721,0.547169
5,0.185,1.012781,0.788268,0.645823,0.598545,0.607795
6,0.1224,1.045457,0.785518,0.696102,0.609233,0.63424
7,0.0836,1.078581,0.8011,0.745121,0.652639,0.675748
8,0.0618,1.10135,0.79835,0.805744,0.695856,0.72524
9,0.0478,1.153122,0.794684,0.799983,0.683401,0.716419
10,0.0362,1.205795,0.791934,0.790849,0.698141,0.725256


[I 2025-03-16 16:14:34,955] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 9.670871525412212e-05, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 53}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3428,1.482278,0.719523,0.367097,0.35815,0.34102
2,0.8898,1.099321,0.75802,0.443481,0.453481,0.434975
3,0.4691,1.014487,0.769019,0.465691,0.492292,0.474256
4,0.2738,1.009913,0.780935,0.607852,0.547252,0.556693
5,0.1681,1.02235,0.785518,0.645434,0.597018,0.606889
6,0.1103,1.060812,0.782768,0.694191,0.606372,0.631929
7,0.0748,1.09957,0.796517,0.767052,0.673101,0.696507
8,0.0554,1.125193,0.799267,0.808456,0.715886,0.738767
9,0.043,1.171994,0.7956,0.811745,0.690098,0.724938
10,0.0326,1.22904,0.791934,0.791413,0.701144,0.726174


[I 2025-03-16 16:30:38,066] Trial 128 finished with value: 0.7448504013966059 and parameters: {'learning_rate': 9.670871525412212e-05, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 53}. Best is trial 101 with value: 0.7580381076828203.


Trial 129 with params: {'learning_rate': 0.0003454647548761435, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 49}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3103,0.994044,0.770852,0.498393,0.495042,0.483763
2,0.1898,1.088319,0.787351,0.718656,0.645221,0.662029
3,0.068,1.187219,0.789184,0.783006,0.717175,0.731384
4,0.037,1.28525,0.789184,0.781785,0.694521,0.713629
5,0.026,1.298487,0.797434,0.772758,0.720422,0.728631
6,0.0199,1.338235,0.789184,0.817578,0.701908,0.734873
7,0.0128,1.498653,0.777269,0.796187,0.701057,0.721507
8,0.0117,1.458078,0.782768,0.797811,0.724014,0.74266
9,0.0115,1.573691,0.777269,0.797007,0.711586,0.731886
10,0.008,1.581267,0.783685,0.78728,0.716453,0.732702


[I 2025-03-16 16:41:11,565] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 6.173972306226184e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 42}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6724,1.873099,0.625115,0.28468,0.258032,0.242476
2,1.3053,1.275022,0.741522,0.400504,0.3982,0.375773
3,0.7825,1.100894,0.767186,0.450659,0.466115,0.446733
4,0.5258,1.035686,0.768103,0.478864,0.477828,0.470239
5,0.3668,1.00241,0.769936,0.538064,0.511714,0.509992


[I 2025-03-16 16:43:47,522] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.0002316662864002156, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 53}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6056,1.05236,0.766269,0.42101,0.45952,0.432436
2,0.3252,1.00425,0.785518,0.651741,0.580584,0.594481
3,0.1179,1.06853,0.793767,0.745395,0.659596,0.684245
4,0.0571,1.11912,0.791934,0.79636,0.679119,0.718719
5,0.0344,1.24323,0.791017,0.778486,0.711342,0.729071
6,0.0252,1.266095,0.789184,0.797395,0.679649,0.716546
7,0.0169,1.343879,0.792851,0.804589,0.691706,0.721394
8,0.0144,1.36655,0.791934,0.811144,0.725993,0.744067
9,0.0128,1.381895,0.792851,0.808216,0.722895,0.746008
10,0.0093,1.458593,0.789184,0.817162,0.727618,0.751767


[I 2025-03-16 16:59:35,901] Trial 131 finished with value: 0.7414736742501329 and parameters: {'learning_rate': 0.0002316662864002156, 'weight_decay': 0.005, 'adam_beta1': 0.91, 'warmup_steps': 53}. Best is trial 101 with value: 0.7580381076828203.


Trial 132 with params: {'learning_rate': 0.00010262366613347604, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2789,1.431489,0.726856,0.36849,0.368288,0.343143
2,0.835,1.083724,0.758937,0.456303,0.456385,0.439839
3,0.4301,1.00886,0.769019,0.531415,0.51018,0.501933
4,0.2452,1.015146,0.784601,0.625958,0.567159,0.581625
5,0.1488,1.029784,0.788268,0.69378,0.614335,0.636064
6,0.0972,1.078897,0.785518,0.709194,0.626496,0.649916
7,0.0657,1.116246,0.7956,0.772972,0.680582,0.704662
8,0.0485,1.143488,0.799267,0.808454,0.713645,0.73683
9,0.0381,1.184423,0.792851,0.794242,0.68081,0.713151
10,0.0287,1.234521,0.797434,0.793247,0.703987,0.729528


[I 2025-03-16 17:15:27,394] Trial 132 finished with value: 0.7537432577475205 and parameters: {'learning_rate': 0.00010262366613347604, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 47}. Best is trial 101 with value: 0.7580381076828203.


Trial 133 with params: {'learning_rate': 1.5940010731374617e-05, 'weight_decay': 0.007, 'adam_beta1': 0.93, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.431,3.051452,0.404216,0.099765,0.090741,0.073996
2,2.7335,2.498484,0.512374,0.123693,0.149399,0.125148
3,2.2324,2.109993,0.591201,0.26389,0.223031,0.206171
4,1.8641,1.829331,0.651696,0.316023,0.272596,0.257564
5,1.5802,1.622367,0.691109,0.330466,0.313256,0.296722
6,1.3671,1.472049,0.713107,0.363364,0.347726,0.332322
7,1.1947,1.361274,0.72594,0.403008,0.379086,0.367151
8,1.0646,1.285432,0.749771,0.435756,0.424818,0.40549
9,0.9571,1.225831,0.749771,0.42153,0.431591,0.413321
10,0.8644,1.183149,0.749771,0.438816,0.438646,0.422643


[I 2025-03-16 17:26:04,402] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 7.394453948315603e-05, 'weight_decay': 0.006, 'adam_beta1': 0.92, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4932,1.683671,0.686526,0.315513,0.310032,0.293872
2,1.1123,1.17554,0.757104,0.445153,0.433603,0.41856
3,0.6361,1.05462,0.770852,0.468154,0.481046,0.466538
4,0.4079,1.001333,0.772686,0.500637,0.501272,0.491873
5,0.2687,0.99217,0.783685,0.598269,0.5605,0.567757


[I 2025-03-16 17:28:37,884] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 1.120156779854372e-06, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 13}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8392,3.788987,0.126489,0.01257,0.016274,0.01001
2,3.749,3.710993,0.210816,0.016772,0.032118,0.016557
3,3.6798,3.648659,0.210816,0.01821,0.030227,0.015816
4,3.6212,3.592611,0.208983,0.014866,0.029589,0.016617
5,3.5665,3.540942,0.214482,0.05796,0.031215,0.02022


[I 2025-03-16 17:31:15,806] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 1.689424523580871e-06, 'weight_decay': 0.003, 'adam_beta1': 0.96, 'warmup_steps': 10}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8156,3.746221,0.190651,0.032922,0.027499,0.01434
2,3.6948,3.645936,0.209899,0.018845,0.029953,0.016056
3,3.6021,3.558832,0.210816,0.037059,0.030101,0.018091
4,3.52,3.4787,0.23923,0.0499,0.038738,0.030147
5,3.4421,3.40398,0.296059,0.045208,0.055867,0.043845


[I 2025-03-16 17:33:57,418] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 0.00012659250612851074, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0084,1.267725,0.743355,0.383972,0.394545,0.372894
2,0.6499,1.026593,0.766269,0.480071,0.486115,0.474939
3,0.3039,0.989857,0.787351,0.62787,0.57992,0.587063
4,0.1609,1.032018,0.787351,0.666485,0.605481,0.622505
5,0.0947,1.047942,0.792851,0.688351,0.622149,0.638168
6,0.0621,1.120836,0.785518,0.749489,0.650178,0.676783
7,0.0421,1.197357,0.794684,0.794068,0.688508,0.712171
8,0.0316,1.207549,0.791017,0.795098,0.714387,0.733379
9,0.0258,1.251225,0.788268,0.781765,0.713202,0.729608
10,0.0189,1.301092,0.791934,0.809925,0.722951,0.741578


[I 2025-03-16 17:49:58,574] Trial 137 finished with value: 0.7506098650354259 and parameters: {'learning_rate': 0.00012659250612851074, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 15}. Best is trial 101 with value: 0.7580381076828203.


Trial 138 with params: {'learning_rate': 0.00032147533081042234, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 44}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3787,1.011717,0.777269,0.496821,0.503515,0.491901
2,0.2102,1.067106,0.786434,0.726728,0.642232,0.661287
3,0.0736,1.159223,0.792851,0.750238,0.722743,0.722379
4,0.0377,1.233416,0.792851,0.81015,0.707814,0.729408
5,0.0266,1.240345,0.794684,0.800517,0.702265,0.73111
6,0.0197,1.333337,0.791934,0.804693,0.701285,0.732077
7,0.014,1.378937,0.791017,0.813969,0.713235,0.738207
8,0.0114,1.414233,0.793767,0.788478,0.718147,0.730689
9,0.0108,1.527082,0.780018,0.793872,0.715781,0.72874
10,0.01,1.535246,0.792851,0.791523,0.731971,0.740264


[I 2025-03-16 18:00:34,567] Trial 138 pruned. 


Trial 139 with params: {'learning_rate': 0.0001470306243406786, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8778,1.19085,0.756187,0.398756,0.418218,0.395815
2,0.55,0.998541,0.774519,0.511246,0.508103,0.49867
3,0.239,0.991286,0.787351,0.644451,0.597946,0.605462
4,0.1217,1.059864,0.790101,0.705372,0.622089,0.647908
5,0.0712,1.09277,0.792851,0.74689,0.681847,0.699036
6,0.0477,1.149117,0.783685,0.774387,0.671298,0.699472
7,0.0327,1.24162,0.793767,0.807699,0.70892,0.732779
8,0.0254,1.220495,0.802016,0.792915,0.738014,0.746296
9,0.0206,1.281229,0.787351,0.783143,0.729937,0.742069
10,0.0147,1.331828,0.794684,0.801712,0.731598,0.744047


[I 2025-03-16 18:16:37,524] Trial 139 finished with value: 0.7463632574954153 and parameters: {'learning_rate': 0.0001470306243406786, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 15}. Best is trial 101 with value: 0.7580381076828203.


Trial 140 with params: {'learning_rate': 0.00013224486121218954, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 47}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0578,1.262759,0.745188,0.382843,0.396547,0.373449
2,0.6398,1.032404,0.768103,0.48088,0.48381,0.47676
3,0.2945,0.988354,0.783685,0.624227,0.583652,0.589868
4,0.1525,1.027273,0.790101,0.673821,0.606101,0.625579
5,0.0891,1.056237,0.790101,0.72174,0.644104,0.662484
6,0.0583,1.123869,0.791017,0.780738,0.672679,0.701422
7,0.0385,1.192315,0.799267,0.827008,0.696715,0.728467
8,0.0298,1.195493,0.799267,0.802216,0.716893,0.736478
9,0.0234,1.254764,0.790101,0.79557,0.699742,0.724015
10,0.0175,1.325528,0.7956,0.820197,0.715569,0.741557


[I 2025-03-16 18:32:29,238] Trial 140 finished with value: 0.740690457093409 and parameters: {'learning_rate': 0.00013224486121218954, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 47}. Best is trial 101 with value: 0.7580381076828203.


Trial 141 with params: {'learning_rate': 7.141808307151675e-05, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 13}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5023,1.703081,0.681943,0.323944,0.305179,0.292087
2,1.1375,1.188271,0.75527,0.426055,0.422278,0.405032
3,0.6592,1.060088,0.769936,0.46965,0.477135,0.460502
4,0.4282,1.007941,0.769936,0.499218,0.492561,0.486089
5,0.2849,0.991124,0.779102,0.602473,0.546363,0.554695
6,0.1978,1.009257,0.788268,0.660708,0.594666,0.609327
7,0.1393,1.04199,0.788268,0.690981,0.616534,0.635775
8,0.105,1.045234,0.794684,0.719421,0.633779,0.655082
9,0.0816,1.085659,0.782768,0.703091,0.628635,0.647119
10,0.063,1.139007,0.789184,0.726103,0.651093,0.669483


[I 2025-03-16 18:37:40,644] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 0.0003766607477623454, 'weight_decay': 0.006, 'adam_beta1': 0.9, 'warmup_steps': 6}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1459,0.986591,0.775435,0.571671,0.541234,0.538351
2,0.1663,1.117572,0.788268,0.696294,0.665974,0.665445
3,0.0622,1.245067,0.786434,0.759368,0.726144,0.729238
4,0.0349,1.300824,0.786434,0.81112,0.70388,0.734273
5,0.0234,1.328176,0.796517,0.811087,0.71039,0.737505
6,0.0182,1.390029,0.775435,0.806548,0.677027,0.711601
7,0.0132,1.418555,0.780935,0.767424,0.696943,0.711711
8,0.0127,1.527336,0.779102,0.768002,0.697535,0.711939
9,0.0101,1.5355,0.776352,0.755542,0.705814,0.712459
10,0.0084,1.517018,0.782768,0.7728,0.696144,0.714931


[I 2025-03-16 18:42:59,335] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 0.00040117968531803847, 'weight_decay': 0.008, 'adam_beta1': 0.9, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.137,0.993255,0.780018,0.550957,0.547171,0.540467
2,0.1557,1.098529,0.785518,0.707756,0.652581,0.663938
3,0.0591,1.169235,0.791017,0.758868,0.72484,0.72547
4,0.0324,1.318295,0.789184,0.8066,0.693721,0.727398
5,0.0239,1.271994,0.796517,0.806373,0.707107,0.733775
6,0.0163,1.342072,0.800183,0.786425,0.704418,0.727428
7,0.0127,1.514403,0.791017,0.803135,0.714263,0.73598
8,0.0125,1.546135,0.779102,0.802435,0.707865,0.73238
9,0.0097,1.590392,0.780018,0.79558,0.71491,0.735132
10,0.008,1.622345,0.777269,0.791531,0.69092,0.721466


[I 2025-03-16 18:53:34,519] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 0.00018022936964075576, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7146,1.11193,0.759853,0.429791,0.446445,0.423249
2,0.4334,0.983629,0.780018,0.597813,0.545795,0.548821
3,0.1725,1.028205,0.783685,0.704342,0.631772,0.649346
4,0.0849,1.097979,0.796517,0.754397,0.661686,0.689648
5,0.0501,1.149755,0.796517,0.789857,0.71207,0.733505
6,0.0349,1.230175,0.782768,0.775133,0.666504,0.697567
7,0.0238,1.283372,0.792851,0.800852,0.69062,0.721478
8,0.0203,1.272397,0.790101,0.782101,0.718634,0.728795
9,0.0171,1.327727,0.790101,0.794861,0.715673,0.736224
10,0.0119,1.373839,0.791017,0.799354,0.713585,0.734986


[I 2025-03-16 19:09:34,149] Trial 144 finished with value: 0.7428583967330559 and parameters: {'learning_rate': 0.00018022936964075576, 'weight_decay': 0.007, 'adam_beta1': 0.9, 'warmup_steps': 18}. Best is trial 101 with value: 0.7580381076828203.


Trial 145 with params: {'learning_rate': 0.00014074166190486731, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 38}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9721,1.222858,0.748854,0.38488,0.405142,0.3813
2,0.5912,1.017145,0.770852,0.489351,0.493473,0.484553
3,0.2635,0.991532,0.788268,0.651248,0.595188,0.606448
4,0.1343,1.044246,0.792851,0.717364,0.62615,0.650823
5,0.0783,1.075532,0.79835,0.767169,0.691195,0.7122
6,0.0516,1.146067,0.782768,0.774485,0.662996,0.6959
7,0.0348,1.216859,0.786434,0.806122,0.687725,0.716664
8,0.0266,1.217579,0.796517,0.783812,0.719046,0.731645
9,0.0219,1.286508,0.787351,0.78769,0.688259,0.716363
10,0.0159,1.349379,0.784601,0.801483,0.707703,0.729129


Exception in thread Thread-24023 (_pin_memory_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/pin_memory.py", line 59, in _pin_memory_loop
    do_one_step()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/pin_memory.py", line 35, in do_one_step
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 122, in get
[W 2025-03-16 19:18:09,168] Trial 145 failed with parameters: {'learning_rate': 0.00014074166190486731, 'weight_decay': 0.007, 'adam_beta1': 0.91, 'warmup_steps': 38} because of the following error: KeyboardInterr

KeyboardInterrupt: 

In [51]:
# Show the best run of the non-distilled search on the augmented data.
# NOTE(review): the search log directly above ends in a KeyboardInterrupt during trial 145, so
# `best_trial3` was presumably never assigned in this session -- that would explain the NameError
# recorded as this cell's output. Confirm against the cell that assigns `best_trial3`.
print(best_trial3)

NameError: name 'best_trial3' is not defined

In [None]:
# Call the project's seed helper before setting up the next search.
# NOTE(review): presumably re-seeds the RNGs so this search starts from the same state as the
# earlier ones -- confirm in `base`.
base.reset_seed()

In [None]:
# Training arguments for the distillation search on the augmented data.
# Results and logs are written under per-dataset directories named after this experiment.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-distill_fine_aug_hp-search",
    logging_dir=f"~/logs/{DATASET}/bert-distill_fine_aug_hp-search",
    # NOTE(review): presumably kept False so the extra (teacher-logit) columns survive until the
    # trainer sees them -- confirm against base.DistilTrainer.
    remove_unused_columns=False,
    epochs=num_epochs,
    batch_size=batch_size,
)

In [None]:
def hp_space(trial):
    """Sample one hyperparameter configuration for a distillation trial.

    The values are requested from ``trial`` in a fixed order: the optimizer
    settings first (learning rate, weight decay, Adam beta1, warm-up steps),
    then the two extra knobs ``lambda_param`` and ``temperature``. The drawn
    configuration is printed together with the trial number and returned.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and
            ``number`` (an Optuna trial when invoked by the search).

    Returns:
        dict: Mapping of hyperparameter name to the suggested value, in the
        order the values were requested.

    Note:
        Reads the module-level ``warm_up`` as the inclusive upper bound passed
        to ``suggest_int`` for ``warmup_steps``.
    """
    suggested = {}

    # Optimizer-related ranges.
    suggested["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 5e-4, log=True)
    suggested["weight_decay"] = trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3)
    suggested["adam_beta1"] = trial.suggest_float("adam_beta1", 0.9, 0.99, step=0.01)
    suggested["warmup_steps"] = trial.suggest_int("warmup_steps", 0, warm_up)

    # Additional ranges that only appear in the distillation searches.
    suggested["lambda_param"] = trial.suggest_float("lambda_param", 0, 1, step=0.1)
    suggested["temperature"] = trial.suggest_float("temperature", 2, 7, step=0.5)

    print(f"Trial {trial.number} with params: {suggested}")
    return suggested

In [None]:
# Hyperband pruner bounded by the step counts min_r / max_r.
# NOTE(review): the setup cell at the top of the notebook derives min_r and max_r from
# len(train_data), whereas this search trains on train_aug -- confirm the bounds are recomputed
# for the augmented dataset before this cell runs.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=min_r,
    max_resource=max_r,
    reduction_factor=2,
    bootstrap_count=2,
)

# Multivariate TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(
    seed=42,
    multivariate=True,
)



In [None]:
# Distillation trainer for the hyperparameter search: trains on the augmented set (train_aug)
# and evaluates on the tokenized eval split.
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=train_aug,
    # `eval` is the tokenized eval dataset created near the top of the notebook (it shadows the builtin).
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    # A zero-argument factory instead of a model instance; presumably lets the search build a
    # fresh model for every trial. get_Bert is defined elsewhere in the notebook.
    model_init = lambda: get_Bert(),
    # Early stopping is left disabled; a pruner is passed to hyperparameter_search instead.
    #callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)
  

In [None]:
# Run the Optuna-backed search for distillation on the augmented data and keep the best run.
best_trial4 = trainer.hyperparameter_search(
    # Higher objective is better.
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    # The objective is the "eval_f1" entry of the evaluation metrics.
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Test-Distill-aug",
    n_trials=150
)

In [None]:
# Show the best run returned by the distillation search on the augmented data.
# NOTE(review): the saved output for this cell reports a learning_rate of about 2.18e-3, which is
# above the 5e-4 upper bound in hp_space, so it presumably comes from an earlier run with a wider
# range -- re-run the search before relying on it.
print(best_trial4)

BestRun(run_id='136', objective=0.7874461791210883, hyperparameters={'learning_rate': 0.0021806066601338593, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 50, 'lambda_param': 0.8, 'temperature': 4.0}, run_summary=None)


In [None]:
# Side-by-side summary of the best run from each of the four searches
# (plain / distillation, each without and with augmentation).
# NOTE(review): requires best_trial, best_trial2, best_trial3 and best_trial4 to all be bound.
# An earlier cell in this notebook recorded a NameError for best_trial3, so this cell would stop
# at the third print until that search is re-run to completion.
print("Best normal training score: ", best_trial)
print("Best distilation trianing score: ", best_trial2)
print("Best normal training score with augmentations: ", best_trial3)
print("Best distilation trianing score with augmentations: ",best_trial4)

Best normal training score:  BestRun(run_id='41', objective=0.7157156862853267, hyperparameters={'learning_rate': 0.004873101422020569, 'weight_decay': 0.001, 'adam_beta1': 0.93, 'warmup_steps': 4}, run_summary=None)
Best distilation trianing score:  BestRun(run_id='115', objective=0.7423955520558099, hyperparameters={'learning_rate': 0.004092058596290564, 'weight_decay': 0.008, 'adam_beta1': 0.92, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 4.0}, run_summary=None)
Best normal training score with augmentations:  BestRun(run_id='69', objective=0.7829410034428768, hyperparameters={'learning_rate': 0.0014622771684147115, 'weight_decay': 0.006, 'adam_beta1': 0.93, 'warmup_steps': 46}, run_summary=None)
Best distilation trianing score with augmentations:  BestRun(run_id='136', objective=0.7874461791210883, hyperparameters={'learning_rate': 0.0021806066601338593, 'weight_decay': 0.001, 'adam_beta1': 0.9, 'warmup_steps': 50, 'lambda_param': 0.8, 'temperature': 4.0}, run_summary=Non